regen/charset_translations.pl

   1 #!/usr/bin/perl -w
   2 use strict;
   3 use warnings;
   4
   5 # WARNING: This must be kept in sync with the UTF8_MAXBYTES value in
   6 # utfebcdic.h
     # It is a package variable, so it is visible outside this file.  Within
     # this file, cp_2_utfbytes() uses it as the sequence length for any code
     # point too large for the 7-byte form (i.e., 0x40000000 and above).
   7 $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES = 14;
   8
   9 # Utilities for various character set issues.  Currently handles ASCII and
  10 # EBCDIC only.  Adding support for a new EBCDIC code page is trivial, provided
  11 # that its variant characters don't have the same signature as an existing
  12 # page's and that no other glitches arise: just add a mapping table to
  13 # %ebcdic_translations and regen everything that uses this.
  14
  15 my %ebcdic_translations = (
  16     # Keys are code page names; each value is a 256-element array, indexed
  17     # by ASCII/Latin1 ordinal, that gives the corresponding ordinal in that
         # code page.  Each row below holds 16 consecutive entries, so the row
         # that starts with the entry for 0x40 covers '@' and 'A' .. 'O'.
         #
         # get_supported_code_pages() adds one more entry at run time: an
         # identity table stored under $ascii_key.
  18
  19     'EBCDIC 1047' =>
  20       [ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  21         0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
  22         0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
  23         0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
  24         0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
  25         0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
  26         0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
  27         0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
  28         0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
  29         0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
  30         0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC,
  31         0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
  32         0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
  33         0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59,
  34         0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
  35         0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
  36       ],
  37
     # NOTE(review): the POSIX-BC table below is commented out, so that code
     # page is not among those returned by get_supported_code_pages().  The
     # reason it is disabled is not recorded here.
  38 #    'EBCDIC POSIX-BC' =>
  39 #      [
  40 #        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  41 #        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
  42 #        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
  43 #        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
  44 #        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
  45 #        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D,
  46 #        0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
  47 #        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07,
  48 #        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
  49 #        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F,
  50 #        0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1,
  51 #        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
  52 #        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
  53 #        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59,
  54 #        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
  55 #        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
  56 #      ],
  57
  58     'EBCDIC 037' =>
  59       [
  60         0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  61         0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
  62         0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
  63         0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
  64         0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
  65         0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D,
  66         0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
  67         0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
  68         0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
  69         0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
  70         0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC,
  71         0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
  72         0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
  73         0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59,
  74         0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
  75         0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
  76       ],
  77 );
  78
  79 my $ascii_key = 'ASCII/Latin1';  # Name under which the identity table is
                                      # stored in %ebcdic_translations; see
                                      # get_supported_code_pages()
  80
  81 my %I8_TO_NATIVE_UTF8;  # Maps I8 UTF to final UTF-EBCDIC
  82                         # See http://www.unicode.org/reports/tr16/
                             # Keyed by code page name; each value is a
                             # 256-element array that get_I8_2_utf() builds
                             # on first request and then caches here.
  83
  sub get_supported_code_pages() {
      # Returns the names of the currently supported code pages as an ordered
      # list: ASCII is the 0th element, the 1047 code page is the 1st, and the
      # remaining ones follow, sorted lexically by code page name.
      #
      # Side effect: the first call adds an identity (ordinal maps to itself)
      # table for ASCII to %ebcdic_translations, so that ASCII can afterwards
      # be looked up like any other code page.

      if (! exists $ebcdic_translations{$ascii_key}) {
          $ebcdic_translations{$ascii_key} = [ 0 .. 255 ];
      }

      # The tests are applied in this order: ASCII beats everything, then
      # anything whose name contains '1047', then plain string comparison.
      my $by_priority = sub {
          return -1 if $a eq $ascii_key;
          return  1 if $b eq $ascii_key;
          return -1 if $a =~ /1047/;
          return  1 if $b =~ /1047/;
          return $a cmp $b;
      };

      my @names = keys %ebcdic_translations;
      return sort $by_priority @names;
  }
 108
 sub get_a2n($) {
     # Given the name of a code page, returns a reference to the array that
     # maps each ASCII ordinal to the corresponding ordinal in that code page.
     # Dies if the name is not a key of %ebcdic_translations.  The returned
     # reference is to the table itself, not to a copy.

     my ($code_page) = @_;

     die "Unknown character set '$code_page'"
         unless exists $ebcdic_translations{$code_page};

     return $ebcdic_translations{$code_page};
 }
 121
 sub get_I8_2_utf($) {
     # Returns the mapping array for I8 to code page UTF-EBCDIC for the code
     # page named by the input parameter.  This is Table 2 of TR16 customized
     # for the code page.  See utfebcdic.h for why, contrary to TR16, it has to
     # be code-page-specific.
     #
     # The return value is a reference to a 256-element array indexed by I8
     # byte value.  The array is built on the first request for a code page
     # and cached in %I8_TO_NATIVE_UTF8; later calls return the cached array.
     #
     # Dies if the code page is ASCII or unknown, or if its translation table
     # is malformed (a missing entry, or two of the first 160 entries mapping
     # to the same value).

     my $charset = shift;

     die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
     die "'$charset' unknown" unless exists $ebcdic_translations{$charset};

     # Already generated on an earlier call
     return $I8_TO_NATIVE_UTF8{$charset}
         if exists $I8_TO_NATIVE_UTF8{$charset};

     # Needed only by the debugging printf's below, which are normally
     # commented out.
     use charnames ();

     # The table is assembled in this lexical and stored in the cache only
     # once it is complete.  If it were built directly in the cache, a die()
     # below that the caller traps would leave a partial table behind, and
     # the next call would return it as if it were valid.
     my @i8_to_native;

     # The code points not used for invariants.  Initialized to everything,
     # then entries are removed as we go along.
     my %unused_cps = map { $_ => 1 } 0 .. 255;

     # These are the invariants.  The output has them mapped to the original
     # EBCDIC code point.
     for my $i (0 .. 0x9F) {
         my $ebcdic_value = $ebcdic_translations{$charset}[$i];
         die "No mapping for code point $i in '$charset'"
             unless defined $ebcdic_value;

         #printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
         $i8_to_native[$i] = $ebcdic_value;

         # delete() returns undef when the value has already been claimed by
         # an earlier code point (or is outside 0 .. 255).
         if (! defined delete $unused_cps{$ebcdic_value}) {
             die "Two code points map to $ebcdic_value; one is $i";
         }
     }

     # Put the unused code points in order
     my @unused_cps = sort { $a <=> $b } keys %unused_cps;

     # Fill in the rest of the map with these ordered code points, as TR16
     # specifies
     for my $i (0xA0 .. 255) {
         $i8_to_native[$i] = shift @unused_cps;
         #printf "$charset: filling in %02x which is %02x ascii, %s\n", $i8_to_native[$i], $i, charnames::viacode($i);
     }

     if (@unused_cps) {
         die "Left-over code points";
     }

     $I8_TO_NATIVE_UTF8{$charset} = \@i8_to_native;
     return $I8_TO_NATIVE_UTF8{$charset};
 }
 172
 { # Closure

     # State shared by the two subs below.  Both variables are defined only
     # between a successful call to get_conditional_compile_line_start() and
     # the matching call to get_conditional_compile_line_end(), which is what
     # lets each sub detect that the #if and #endif are not properly paired.
     my $charset;    # Code page named by the currently open '#if'
     my $indent;     # Whitespace placed between '#' and 'if' / 'endif'

     sub get_conditional_compile_line_start($;$) {
         # Returns the '#if' line to put into C code to compile for the code
         # page given by the first parameter.  The second parameter, if
         # present, is the indentation level; each level adds two spaces
         # between the '#' and the 'if'.
         #
         # Dies if the previous '#if' has not been closed by a call to
         # get_conditional_compile_line_end(), if not run on an ASCII
         # platform, or if the code page is unknown.  A call that dies leaves
         # no '#if' open, so it does not interfere with later calls.

         if (defined $charset || defined $indent) {
             die "Missing call to get_conditional_compile_line_end()"
         }

         # Work on lexicals and commit to the shared state only at the very
         # end, so that a failure below cannot leave the state half-set.
         my $new_charset = shift;
         my $indent_level = shift // 0;

         die "This is designed to run only on an ASCII platform" unless ord "A" == 65;

         die "Unknown character set '$new_charset'" unless exists $ebcdic_translations{$new_charset};

         my $return = "";
         {
             no warnings 'qw';   # The list below deliberately contains '#'
             my $count = -1;

             # We use all the typical variant characters to construct the #if,
             # so that it is unlikely that a different code page will match
             # this #if.  (The backslash entry comes out of qw// as two
             # backslashes, which is how it must be written inside a C
             # character constant; ord() looks only at the first of them.)
             for my $char (qw/A \\\ [ ] { } ^ ~ ! # | $ @ `/) {
                 my $first_time = $return eq "";
                 my $compare = $ebcdic_translations{$new_charset}[ord $char];

                 $return .= " && " unless $first_time;
                 $return .= "'$char' == $compare";
                 $return .= " /* $new_charset */" if $first_time;

                 # For ASCII the single test on 'A' is all that is emitted
                 last if $new_charset eq $ascii_key;

                 # Break the line (with a C continuation) after the first
                 # comparison and after every fifth one thereafter
                 $count++;
                 $return .= " \\\n    " if $first_time || $count % 5 == 0;
             }
         }

         # Everything succeeded; record that an '#if' is now open
         $charset = $new_charset;
         $indent = "  " x $indent_level;

         return "#${indent}if $return\n";
     }

     sub get_conditional_compile_line_end () {
         # Returns the '#endif' line for the currently open '#if', indented
         # the same as that '#if' and annotated with its code page name, and
         # marks the '#if' as closed.  Dies if there is no open '#if'.

         if (! defined $charset || ! defined $indent) {
             die "Missing call to get_conditional_compile_line_start()"
         }

         my $return = "#${indent}endif\t/* $charset */\n";
         undef $charset;
         undef $indent;
         return $return;
     }
 }
 237
 sub _UTF_START_MASK($) {
     # Internal.  Given the number of bytes in a multi-byte sequence, returns
     # the mask that cp_2_utfbytes() ANDs with the code point's remaining
     # high-order bits to get the part of them that goes into the start byte.
     # Sequences of 7 or more bytes have no such bits, hence a mask of 0.

     my ($byte_count) = @_;

     return 0x00 if $byte_count >= 7;
     return 0x1F >> ($byte_count - 2);
 }
 243
 sub _UTF_START_MARK($) {
     # Internal.  Given the number of bytes in a multi-byte sequence, returns
     # the fixed high-order bits that cp_2_utfbytes() ORs into the start byte
     # of a sequence of that length.  Every length above 7 yields 0xFF.

     my ($byte_count) = @_;

     return 0xFF if $byte_count > 7;

     my $shifted = 0xFE << (7 - $byte_count);
     return $shifted & 0xFF;
 }
 249
 sub cp_2_utfbytes($$) {
     # Encodes the Unicode code point given by the first parameter for the
     # code page named by the second, and returns the result as a string of
     # bytes.  For an EBCDIC code page the result is UTF-EBCDIC, produced by
     # the UTF-MOD algorithm published in TR16; for the ASCII "code page" it
     # is plain UTF-8.  Dies if the code page is unknown.

     my ($ucp, $charset) = @_;

     # ASCII: let perl itself generate the UTF-8
     if ($charset eq $ascii_key) {
         my $utf8 = chr $ucp;
         utf8::upgrade($utf8);
         utf8::encode($utf8);
         return $utf8;
     }

     die "Unknown character set '$charset'"
         unless exists $ebcdic_translations{$charset};

     # Code points below 0xA0 are a single byte, taken straight from the code
     # page's translation table
     return chr $ebcdic_translations{$charset}[$ucp] if $ucp < 0xA0;

     my $i8_to_native = get_I8_2_utf($charset);

     # Find the sequence length.  $upper_bounds[$n] is the first code point
     # that is too large for a sequence of $n + 2 bytes; anything too large
     # for all of them gets the maximum length.
     my @upper_bounds = (0x400, 0x4000, 0x40000, 0x400000, 0x4000000, 0x40000000);
     my $len = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
     for my $n (0 .. $#upper_bounds) {
         if ($ucp < $upper_bounds[$n]) {
             $len = $n + 2;
             last;
         }
     }

     # Generate the continuation bytes from last to first.  Each consumes the
     # low 5 bits of what is left of the code point, ORed with 0xA0 to form
     # the I8 byte, which is then translated to the native byte.
     my $continuation_bytes = "";
     for (2 .. $len) {
         my $i8 = ($ucp & 0x1f) | 0xA0;
         $continuation_bytes = chr($i8_to_native->[$i8]) . $continuation_bytes;
         $ucp >>= 5;
     }

     # What remains of the code point goes into the start byte, alongside the
     # marker bits for this sequence length
     my $start_i8 = ($ucp & _UTF_START_MASK($len)) | _UTF_START_MARK($len);

     return chr($i8_to_native->[$start_i8]) . $continuation_bytes;
 }
 295
 296 1;