regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 require 'regen/regen_lib.pl';
   5 require 'regen/charset_translations.pl';
   6
   7 # Generates the EBCDIC translation tables that were formerly hard-coded into
   8 # utfebcdic.h
   9
  10 my $out_fh = open_new('ebcdic_tables.h', '>',
  11         {style => '*', by => $0, });
  12
  13 sub output_table ($$) {
  14     my $table_ref = shift;
  15     my $name = shift;
  16
  17     # Tables in hex easier to debug, but don't fit into 80 columns
  18     my $print_in_hex = 0;
  19
  20     die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
  21
  22     print $out_fh "EXTCONST U8 $name\[\] = {\n";
  23
  24     print $out_fh "/*          _0    _1    _2    _3    _4    _5    _6    _7    _8    _9    _A    _B    _C    _D    _E    _F        */\n" if $print_in_hex;
  25     for my $i (0 .. 255) {
  26         if ($print_in_hex) {
  27             printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
  28             printf $out_fh " 0x%02X", $table_ref->[$i];
  29         }
  30         else {
  31             printf $out_fh "%4d", $table_ref->[$i];
  32         }
  33         printf $out_fh "  /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
  34         print $out_fh ",", if $i < 255;
  35         print $out_fh "\n" if $i % 16 == 15;
  36     }
  37     print $out_fh "/*          _0    _1    _2    _3    _4    _5    _6    _7    _8    _9    _A    _B    _C    _D    _E    _F        */\n" if $print_in_hex;
  38     print $out_fh "};\n\n";
  39 }
  40
  41 print $out_fh <<END;
  42
  43 #ifndef H_EBCDIC_TABLES   /* Guard against nested #includes */
  44 #define H_EBCDIC_TABLES   1
  45
  46 /* This file contains definitions for various tables used in EBCDIC handling.
  47  * More info is in utfebcdic.h */
  48 END
  49
  50 my @charsets = get_supported_code_pages();
  51 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
  52 foreach my $charset (@charsets) {
  53     # we process the whole array several times, make a copy
  54     my @a2e = @{get_a2n($charset)};
  55
  56     print $out_fh "\n" . get_conditional_compile_line_start($charset);
  57     print $out_fh "\n";
  58
  59     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
  60     output_table(\@a2e, "PL_a2e");
  61
  62     { # Construct the inverse
  63         my @e2a;
  64         for my $i (0 .. 255) {
  65             $e2a[$a2e[$i]] = $i;
  66         }
  67         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
  68         output_table(\@e2a, "PL_e2a");
  69     }
  70
  71     my @i82utf = @{get_I8_2_utf($charset)};
  72     print $out_fh <<END;
  73 /* (Confusingly named) Index is $charset I8 byte; value is
  74  * $charset UTF-EBCDIC equivalent */
  75 END
  76     output_table(\@i82utf, "PL_utf2e");
  77
  78     { #Construct the inverse
  79         my @utf2i8;
  80         for my $i (0 .. 255) {
  81             $utf2i8[$i82utf[$i]] = $i;
  82         }
  83         print $out_fh <<END;
  84 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
  85  * $charset I8 equivalent */
  86 END
  87         output_table(\@utf2i8, "PL_e2utf");
  88     }
  89
  90     {
  91         my @utf8skip;
  92
  93         # These are invariants or continuation bytes.
  94         for my $i (0 .. 0xBF) {
  95             $utf8skip[$i82utf[$i]] = 1;
  96         }
  97
  98         # These are start bytes;  The skip is the number of consecutive highest
  99         # order 1-bits (up to 7)
 100         for my $i (0xC0 .. 255) {
 101             my $count;
 102             if (($i & 0b11111110) == 0b11111110) {
 103                 $count= 7;
 104             }
 105             elsif (($i & 0b11111100) == 0b11111100) {
 106                 $count= 6;
 107             }
 108             elsif (($i & 0b11111000) == 0b11111000) {
 109                 $count= 5;
 110             }
 111             elsif (($i & 0b11110000) == 0b11110000) {
 112                 $count= 4;
 113             }
 114             elsif (($i & 0b11100000) == 0b11100000) {
 115                 $count= 3;
 116             }
 117             elsif (($i & 0b11000000) == 0b11000000) {
 118                 $count= 2;
 119             }
 120             else {
 121                 die "Something wrong for UTF8SKIP calculation for $i";
 122             }
 123             $utf8skip[$i82utf[$i]] = $count;
 124         }
 125
 126         print $out_fh <<END;
 127 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
 128  * 1 for continuation.  Adapted from the shadow flags table in tr16.  The
 129  * entries marked 9 in tr16 are continuation bytes and are marked as length 1
 130  * here so that we can recover. */
 131 END
 132         output_table(\@utf8skip, "PL_utf8skip");
 133     }
 134
 135     use feature 'unicode_strings';
 136
 137     {
 138         my @lc;
 139         for my $i (0 .. 255) {
 140             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 141         }
 142         print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
 143         output_table(\@lc, "PL_latin1_lc");
 144     }
 145
 146     {
 147         my @uc;
 148         for my $i (0 .. 255) {
 149             my $uc = uc chr $i;
 150             if (length $uc > 1 || ord $uc > 255) {
 151                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 152             }
 153             $uc[$a2e[$i]] = $a2e[ord $uc];
 154         }
 155         print $out_fh <<END;
 156 /* Index is $charset code point; value is its uppercase equivalent.
 157  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 158  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 159 END
 160         output_table(\@uc, "PL_mod_latin1_uc");
 161     }
 162
 163     { # PL_fold
 164         my @ascii_fold;
 165         for my $i (0 .. 255) {  # Initialise to identity map
 166             $ascii_fold[$i] = $i;
 167         }
 168
 169         # Overwrite the entries that aren't identity
 170         for my $chr ('A' .. 'Z') {
 171             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 172         }
 173         for my $chr ('a' .. 'z') {
 174             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 175         }
 176         print $out_fh <<END;
 177 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 178  * is A-Z; all other code points map to themselves */
 179 END
 180         output_table(\@ascii_fold, "PL_fold");
 181     }
 182
 183     {
 184         my @latin1_fold;
 185         for my $i (0 .. 255) {
 186             my $char = chr $i;
 187             my $lc = lc $char;
 188
 189             # lc and uc adequately proxy for fold-case pairs in this 0-255
 190             # range
 191             my $uc = uc $char;
 192             $uc = $char if length $uc > 1 || ord $uc > 255;
 193             if ($lc ne $char) {
 194                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 195             }
 196             elsif ($uc ne $char) {
 197                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 198             }
 199             else {
 200                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 201             }
 202         }
 203         print $out_fh <<END;
 204 /* Index is $charset code point; value is its other fold-pair equivalent
 205  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 206  * the code point itself */
 207 END
 208         output_table(\@latin1_fold, "PL_fold_latin1");
 209     }
 210
 211     print $out_fh get_conditional_compile_line_end();
 212 }
 213
 214 print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
 215
 216 read_only_bottom_close_and_rename($out_fh);