regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4
   5 BEGIN { unshift @INC, '.' }
   6
   7 require './regen/regen_lib.pl';
   8 require './regen/charset_translations.pl';
   9
  10 # Generates the EBCDIC translation tables that were formerly hard-coded into
  11 # utfebcdic.h
  12
  13 my $out_fh = open_new('ebcdic_tables.h', '>',
  14         {style => '*', by => $0, });
  15
  16 sub output_table ($$;$) {
  17     my $table_ref = shift;
  18     my $name = shift;
  19
  20     # Tables in hex easier to debug, but don't fit into 80 columns
  21     my $print_in_hex = shift // 1;
  22
  23     die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
  24
  25     print $out_fh "EXTCONST U8 $name\[\] = {\n";
  26
  27     my $column_numbers= "/*_0   _1   _2   _3   _4   _5   _6   _7   _8   _9   _A   _B   _C   _D   _E  _F*/\n";
  28     print $out_fh $column_numbers if $print_in_hex;
  29     for my $i (0 .. 255) {
  30         if ($print_in_hex) {
  31             # No row headings, so will fit in 80 cols.
  32             #printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
  33             printf $out_fh "0x%02X", $table_ref->[$i];
  34         }
  35         else {
  36             printf $out_fh "%4d", $table_ref->[$i];
  37         }
  38         print $out_fh ",", if $i < 255;
  39         #print $out_fh ($i < 255) ? "," : " ";
  40         #printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
  41         print $out_fh "\n" if $i % 16 == 15;
  42     }
  43     print $out_fh $column_numbers if $print_in_hex;
  44     print $out_fh "};\n\n";
  45 }
  46
  47 print $out_fh <<END;
  48
  49 #ifndef H_EBCDIC_TABLES   /* Guard against nested #includes */
  50 #define H_EBCDIC_TABLES   1
  51
  52 /* This file contains definitions for various tables used in EBCDIC handling.
  53  * More info is in utfebcdic.h */
  54 END
  55
  56 my @charsets = get_supported_code_pages();
  57 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
  58 foreach my $charset (@charsets) {
  59     # we process the whole array several times, make a copy
  60     my @a2e = @{get_a2n($charset)};
  61
  62     print $out_fh "\n" . get_conditional_compile_line_start($charset);
  63     print $out_fh "\n";
  64
  65     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
  66     output_table(\@a2e, "PL_a2e");
  67
  68     { # Construct the inverse
  69         my @e2a;
  70         for my $i (0 .. 255) {
  71             $e2a[$a2e[$i]] = $i;
  72         }
  73         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
  74         output_table(\@e2a, "PL_e2a");
  75     }
  76
  77     my @i82utf = @{get_I8_2_utf($charset)};
  78     print $out_fh <<END;
  79 /* (Confusingly named) Index is $charset I8 byte; value is
  80  * $charset UTF-EBCDIC equivalent */
  81 END
  82     output_table(\@i82utf, "PL_utf2e");
  83
  84     { #Construct the inverse
  85         my @utf2i8;
  86         for my $i (0 .. 255) {
  87             $utf2i8[$i82utf[$i]] = $i;
  88         }
  89         print $out_fh <<END;
  90 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
  91  * $charset I8 equivalent */
  92 END
  93         output_table(\@utf2i8, "PL_e2utf");
  94     }
  95
  96     {
  97         my @utf8skip;
  98
  99         # These are invariants or continuation bytes.
 100         for my $i (0 .. 0xBF) {
 101             $utf8skip[$i82utf[$i]] = 1;
 102         }
 103
 104         # These are start bytes;  The skip is the number of consecutive highest
 105         # order 1-bits (up to 7)
 106         for my $i (0xC0 .. 255) {
 107             my $count;
 108             if ($i == 0b11111111) {
 109                 no warnings 'once';
 110                 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
 111             }
 112             elsif (($i & 0b11111110) == 0b11111110) {
 113                 $count= 7;
 114             }
 115             elsif (($i & 0b11111100) == 0b11111100) {
 116                 $count= 6;
 117             }
 118             elsif (($i & 0b11111000) == 0b11111000) {
 119                 $count= 5;
 120             }
 121             elsif (($i & 0b11110000) == 0b11110000) {
 122                 $count= 4;
 123             }
 124             elsif (($i & 0b11100000) == 0b11100000) {
 125                 $count= 3;
 126             }
 127             elsif (($i & 0b11000000) == 0b11000000) {
 128                 $count= 2;
 129             }
 130             else {
 131                 die "Something wrong for UTF8SKIP calculation for $i";
 132             }
 133             $utf8skip[$i82utf[$i]] = $count;
 134         }
 135
 136         print $out_fh <<END;
 137 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
 138  * 1 for continuation.  Adapted from the shadow flags table in tr16.  The
 139  * entries marked 9 in tr16 are continuation bytes and are marked as length 1
 140  * here so that we can recover. */
 141 END
 142         output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
 143                                                      # in hex
 144     }
 145
 146     use feature 'unicode_strings';
 147
 148     {
 149         my @lc;
 150         for my $i (0 .. 255) {
 151             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 152         }
 153         print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
 154         output_table(\@lc, "PL_latin1_lc");
 155     }
 156
 157     {
 158         my @uc;
 159         for my $i (0 .. 255) {
 160             my $uc = uc chr $i;
 161             if (length $uc > 1 || ord $uc > 255) {
 162                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 163             }
 164             $uc[$a2e[$i]] = $a2e[ord $uc];
 165         }
 166         print $out_fh <<END;
 167 /* Index is $charset code point; value is its uppercase equivalent.
 168  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 169  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 170 END
 171         output_table(\@uc, "PL_mod_latin1_uc");
 172     }
 173
 174     { # PL_fold
 175         my @ascii_fold;
 176         for my $i (0 .. 255) {  # Initialise to identity map
 177             $ascii_fold[$i] = $i;
 178         }
 179
 180         # Overwrite the entries that aren't identity
 181         for my $chr ('A' .. 'Z') {
 182             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 183         }
 184         for my $chr ('a' .. 'z') {
 185             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 186         }
 187         print $out_fh <<END;
 188 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 189  * is A-Z; all other code points map to themselves */
 190 END
 191         output_table(\@ascii_fold, "PL_fold");
 192     }
 193
 194     {
 195         my @latin1_fold;
 196         for my $i (0 .. 255) {
 197             my $char = chr $i;
 198             my $lc = lc $char;
 199
 200             # lc and uc adequately proxy for fold-case pairs in this 0-255
 201             # range
 202             my $uc = uc $char;
 203             $uc = $char if length $uc > 1 || ord $uc > 255;
 204             if ($lc ne $char) {
 205                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 206             }
 207             elsif ($uc ne $char) {
 208                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 209             }
 210             else {
 211                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 212             }
 213         }
 214         print $out_fh <<END;
 215 /* Index is $charset code point; value is its other fold-pair equivalent
 216  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 217  * the code point itself */
 218 END
 219         output_table(\@latin1_fold, "PL_fold_latin1");
 220     }
 221
 222     print $out_fh get_conditional_compile_line_end();
 223 }
 224
 225 print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
 226
 227 read_only_bottom_close_and_rename($out_fh);