| 1 | #!/usr/bin/perl -w |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | |
# Utilities for various character set issues.  Currently handles ASCII and
# EBCDIC only.  It is trivial to add support for a new EBCDIC code page: just
# add a mapping table to %ebcdic_translations and regen everything that uses
# this.  (This works provided the new code page's variant character signature
# differs from that of every existing one, and no other glitches arise.)
| 10 | |
# Translation tables, keyed by code page name.  Each value is a 256-element
# array indexed by ASCII/Latin-1 ordinal; the element is the ordinal of the
# same character in that code page.  The trailing comment on each row gives
# the range of indices (ASCII/Latin-1 ordinals) the row covers.
my %ebcdic_translations = (

    'EBCDIC 1047' => [
        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, # 0x00-0x0F
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, # 0x10-0x1F
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, # 0x20-0x2F
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, # 0x30-0x3F
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, # 0x40-0x4F
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D, # 0x50-0x5F
        0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, # 0x60-0x6F
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, # 0x70-0x7F
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, # 0x80-0x8F
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF, # 0x90-0x9F
        0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC, # 0xA0-0xAF
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, # 0xB0-0xBF
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, # 0xC0-0xCF
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59, # 0xD0-0xDF
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, # 0xE0-0xEF
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF  # 0xF0-0xFF
    ],

    'EBCDIC POSIX-BC' => [
        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, # 0x00-0x0F
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, # 0x10-0x1F
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, # 0x20-0x2F
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, # 0x30-0x3F
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, # 0x40-0x4F
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D, # 0x50-0x5F
        0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, # 0x60-0x6F
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07, # 0x70-0x7F
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, # 0x80-0x8F
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F, # 0x90-0x9F
        0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1, # 0xA0-0xAF
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, # 0xB0-0xBF
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, # 0xC0-0xCF
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59, # 0xD0-0xDF
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, # 0xE0-0xEF
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF  # 0xF0-0xFF
    ],

    'EBCDIC 037' => [
        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, # 0x00-0x0F
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, # 0x10-0x1F
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, # 0x20-0x2F
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, # 0x30-0x3F
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, # 0x40-0x4F
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D, # 0x50-0x5F
        0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, # 0x60-0x6F
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, # 0x70-0x7F
        0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, # 0x80-0x8F
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF, # 0x90-0x9F
        0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC, # 0xA0-0xAF
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, # 0xB0-0xBF
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, # 0xC0-0xCF
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59, # 0xD0-0xDF
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, # 0xE0-0xEF
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF  # 0xF0-0xFF
    ],
);
| 74 | |
# Name used for the ASCII/Latin-1 "code page".  It is the key under which
# get_supported_code_pages() stores the identity table in
# %ebcdic_translations, and the value the other subs compare against to
# recognize the ASCII case.
my $ascii_key = 'ASCII/Latin1';

# Cache, keyed by code page name, of the 256-element arrays built by
# get_I8_2_utf().
my %I8_TO_NATIVE_UTF8;  # Maps I8 UTF to final UTF-EBCDIC
                        # See http://www.unicode.org/reports/tr16/
| 79 | |
sub get_supported_code_pages() {
    # Returns an ordered list of the names of the currently supported code
    # pages: ASCII is the 0th element, the code page whose name contains
    # '1047' comes next, and the others follow sorted lexically by name.
    #
    # As a side effect, the first call adds the ASCII identity table to
    # %ebcdic_translations under $ascii_key.
    #
    # In scalar context this returns the number of code pages.  (The sorted
    # list is assigned to an array first because the behavior of 'sort' in
    # scalar context is undefined.)

    # Create an ASCII table: every ordinal maps to itself.
    $ebcdic_translations{$ascii_key} = [ 0 .. 255 ]
                            unless exists $ebcdic_translations{$ascii_key};

    # Rank each name so that the comparison is a consistent ordering even if
    # more than one name happens to match /1047/; ties within a rank are
    # broken lexically.
    my $rank_of = sub {
        my $name = shift;
        return 0 if $name eq $ascii_key;
        return 1 if $name =~ /1047/;
        return 2;
    };

    my @code_pages = sort {
                            $rank_of->($a) <=> $rank_of->($b)
                         || $a cmp $b
                     } keys %ebcdic_translations;

    return @code_pages;
}
| 104 | |
sub get_a2n($) {
    # Returns the mapping array for ASCII to code page for the code page named
    # by the input parameter: a 256-element list indexed by ASCII/Latin-1
    # ordinal whose elements are the corresponding ordinals in that code page.
    # Dies if the code page is not known.

    my $charset = shift;

    # The ASCII table is only added to %ebcdic_translations by
    # get_supported_code_pages().  Handle ASCII directly so that the result
    # doesn't depend on whether that has been called yet.  The list is
    # returned via an array because '..' in scalar context is not a range.
    if ($charset eq $ascii_key) {
        my @identity = (0 .. 255);
        return @identity;
    }

    if (! exists $ebcdic_translations{$charset}) {
        die "Unknown character set '$charset'";
    }

    return @{$ebcdic_translations{$charset}};
}
| 117 | |
sub get_I8_2_utf($) {
    # Returns the mapping array for I8 to code page UTF-EBCDIC for the code
    # page named by the input parameter.  This is Table 2 of TR16 customized
    # for the code page.  See utfebcdic.h for why, contrary to TR16, it has to
    # be code-page-specific.
    #
    # The result is a 256-element list indexed by I8 byte value.  It is
    # computed on first use and cached in %I8_TO_NATIVE_UTF8.
    #
    # Dies if the code page is ASCII or unknown, or if its translation table
    # turns out to be inconsistent.

    my $charset = shift;

    die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
    die "'$charset' unknown" unless exists $ebcdic_translations{$charset};

    # Generate the table if not already present
    if (! exists $I8_TO_NATIVE_UTF8{$charset}) {

        # Build into a local array and only store it in the cache once it is
        # complete, so that a failure part way through cannot leave behind a
        # partial table that a later call would hand back as if it were valid.
        my @i8_to_native;

        # The code points not used for invariants.  Initialized to everything,
        # then entries are removed as we go along.
        my %unused_cps = map { $_ => 1 } 0 .. 255;

        # These are the invariants.  The output has them mapped to the
        # original EBCDIC code point.
        for my $i (0 .. 0x9F) {
            use charnames ();   # Only needed by the debugging printfs
            my $ebcdic_value = $ebcdic_translations{$charset}[$i];
            #printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
            $i8_to_native[$i] = $ebcdic_value;

            # Each EBCDIC value may be claimed only once; 'delete' returns
            # undef if an earlier $i has already removed it.
            if (! defined delete $unused_cps{$ebcdic_value}) {
                die "Two code points map to $ebcdic_value; one is $i";
            }
        }

        # Put the unused code points in order
        my @unused_cps = sort { $a <=> $b } keys %unused_cps;

        # Fill in the rest of the map with these ordered code points, as TR16
        # specifies
        for my $i (0xA0 .. 255) {
            $i8_to_native[$i] = shift @unused_cps;
            #printf "$charset: filling in %02x which is %02x ascii, %s\n", $i8_to_native[$i], $i, charnames::viacode($i);
        }

        if (@unused_cps) {
            die "Left-over code points";
        }

        $I8_TO_NATIVE_UTF8{$charset} = \@i8_to_native;
    }

    return @{$I8_TO_NATIVE_UTF8{$charset}};
}
| 168 | |
{ # Closure

    # State describing the currently open '#if'.  Both are undefined when no
    # '#if' is open, which is what lets the two subs below check that every
    # start is matched by an end.
    my $charset;    # Name of the code page the open '#if' is for
    my $indent;     # White space to put between the '#' and 'if'/'endif'

    sub get_conditional_compile_line_start($;$) {
        # Returns the '#if' line to put into C code to compile for the code
        # page given by the first parameter.  The second parameter, if
        # present, is the indentation level, like '#   if ...'; the default
        # is 0, meaning no white space after the '#'.
        #
        # Dies if the previous '#if' has not been closed by a call to
        # get_conditional_compile_line_end(), if not running on an ASCII
        # platform, or if the code page is unknown.  The record of which
        # '#if' is open is only updated once all of those checks have passed,
        # so a caller that traps a failure can carry on using these subs.

        if (defined $charset || defined $indent) {
            die "Missing call to get_conditional_compile_line_end()"
        }

        my $name         = shift;
        my $indent_level = shift // 0;

        # The generated text uses this platform's ord() values as the ASCII
        # ordinals.
        die "This is designed to run only on an ASCII platform" unless ord "A" == 65;

        # The ASCII table is only added to %ebcdic_translations by
        # get_supported_code_pages(); ASCII ordinals map to themselves, so
        # there's no need to depend on that having been called.
        my $is_ascii = $name eq $ascii_key;

        die "Unknown character set '$name'"
                    unless $is_ascii || exists $ebcdic_translations{$name};

        # Everything is valid; it's now safe to record the open '#if'
        $charset = $name;
        $indent  = ($indent_level == 0)
                   ? ""
                   : " " x (($indent_level * 4) - 1);

        my $return = "";
        {
            no warnings 'qw';   # The list below deliberately contains a '#'
            my $count = -1;

            # We use all the typical variant characters to construct the #if,
            # so that it is unlikely that a different code page will match
            # this #if
            for my $char (qw/A \\\ [ ] { } ^ ~ ! # | $ @ `/) {
                my $ascii_ord  = ord $char;
                my $first_time = $return eq "";
                my $compare    = $is_ascii
                                 ? $ascii_ord
                                 : $ebcdic_translations{$charset}[$ascii_ord];

                $return .= " && " unless $first_time;
                $return .= "'$char' == $compare";
                $return .= " /* $charset */" if $first_time;

                # For ASCII, the single test on 'A' is all that is generated
                last if $is_ascii;

                # Continue onto a new line after the first test, and after
                # every fifth one following that.
                $count++;
                $return .= " \\\n " if $first_time || $count % 5 == 0;
            }
        }

        return "#${indent}if $return\n";
    }

    sub get_conditional_compile_line_end () {
        # Returns the #endif for the currently open #if, and marks it as
        # closed.  Dies if there is no open #if.

        if (! defined $charset || ! defined $indent) {
            die "Missing call to get_conditional_compile_line_start()"
        }

        my $return = "#${indent}endif\t/* $charset */\n";
        undef $charset;
        undef $indent;
        return $return;
    }
}
| 233 | |
sub _UTF_START_MASK($) {
    # Internal.  Given the number of bytes in a multi-byte sequence, returns
    # the mask of the low-order bits in the sequence's start byte that are
    # used to hold part of the code point.
    my ($byte_count) = @_;

    return 0x01 if $byte_count >= 6;
    return 0x1F >> ($byte_count - 2);
}
| 239 | |
sub _UTF_START_MARK($) {
    # Internal.  Given the number of bytes in a multi-byte sequence, returns
    # the fixed high-order bits of the sequence's start byte: that many 1
    # bits followed by a 0 bit, confined to a single byte.
    my ($byte_count) = @_;

    my $shifted = 0xFE << (7 - $byte_count);
    return $shifted & 0xFF;
}
| 244 | |
sub cp_2_utfbytes($$) {
    # Returns a string consisting of the UTF-EBCDIC bytes that represent a
    # code point on a given code page, using the UTF-MOD algorithm published
    # in TR16.  (If the "code page" is ASCII, straight UTF-8 is returned.)
    #
    # NOTE the parameter order: the code point comes first, and the name of
    # the code page second.
    #
    # Dies if the code page is unknown, or, for an EBCDIC code page, if the
    # code point is negative or needs more than the 31 bits that the longest
    # sequence generated here (7 bytes) can hold.

    my ($ucp, $charset) = @_;

    if ($charset eq $ascii_key) {
        my $str = chr $ucp;
        utf8::upgrade($str);
        utf8::encode($str);
        return $str;
    }

    die "Unknown character set '$charset'"
                            unless exists $ebcdic_translations{$charset};

    # A negative value would index the tables from their ends, and one that
    # is too large would have its high bits silently dropped; either way the
    # returned bytes would be wrong.
    if ($ucp < 0 || $ucp > 0x7FFFFFFF) {
        die "Code point '$ucp' is outside the range 0 .. 0x7FFFFFFF";
    }

    # The code points below 0xA0 are each represented by a single byte: their
    # value in the code page.
    if ($ucp < 0xA0) {
        return chr $ebcdic_translations{$charset}[$ucp];
    }

    my @I8_2_utf = get_I8_2_utf($charset);

    # Number of bytes in the sequence.  Each continuation byte holds 5 bits
    # of the code point.
    my $len = $ucp < 0x400     ? 2 :
              $ucp < 0x4000    ? 3 :
              $ucp < 0x40000   ? 4 :
              $ucp < 0x400000  ? 5 :
              $ucp < 0x4000000 ? 6 : 7;

    # Build the sequence from its final byte backwards.  First the
    # continuation bytes: the I8 form of each is 0xA0 plus the next 5
    # low-order bits, which is then mapped to the code page's byte.
    my @str;
    for (1 .. $len - 1) {
        unshift @str, chr $I8_2_utf[($ucp & 0x1f) | 0xA0];
        $ucp >>= 5;
    }

    # Then the start byte, which combines the length marker with whatever
    # bits remain.
    unshift @str, chr $I8_2_utf[($ucp & _UTF_START_MASK($len)) | _UTF_START_MARK($len)];

    return join "", @str;
}
| 288 | |
| 289 | 1; |