regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 require 'regen/regen_lib.pl';
   5 require 'regen/charset_translations.pl';
   6
   7 # Generates the EBCDIC translation tables that were formerly hard-coded into
   8 # utfebcdic.h
   9
  10 my $out_fh = open_new('ebcdic_tables.h', '>',
  11         {style => '*', by => $0, });
  12
  13 sub output_table ($$) {
  14     my $table_ref = shift;
  15     my $name = shift;
  16
  17     die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
  18
  19     print $out_fh "EXTCONST U8 $name\[\] = {\n";
  20
  21     for my $i (0 .. 255) {
  22         printf $out_fh "%4d", $table_ref->[$i];
  23         #printf $out_fh " 0x%02X", $table_ref->[$i];
  24         print $out_fh ",", if $i < 255;
  25         print $out_fh "\n" if $i % 16 == 15;
  26     }
  27     print $out_fh "};\n\n";
  28 }
  29
  30 print $out_fh <<END;
  31
  32 #ifndef H_EBCDIC_TABLES   /* Guard against nested #includes */
  33 #define H_EBCDIC_TABLES   1
  34
  35 /* This file contains definitions for various tables used in EBCDIC handling.
  36  * More info is in utfebcdic.h */
  37 END
  38
  39 my @charsets = get_supported_code_pages();
  40 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
  41 foreach my $charset (@charsets) {
  42     # we process the whole array several times, make a copy
  43     my @a2e = @{get_a2n($charset)};
  44
  45     print $out_fh "\n" . get_conditional_compile_line_start($charset);
  46     print $out_fh "\n";
  47
  48     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
  49     output_table(\@a2e, "PL_a2e");
  50
  51     { # Construct the inverse
  52         my @e2a;
  53         for my $i (0 .. 255) {
  54             $e2a[$a2e[$i]] = $i;
  55         }
  56         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
  57         output_table(\@e2a, "PL_e2a");
  58     }
  59
  60     my @i82utf = @{get_I8_2_utf($charset)};
  61     print $out_fh <<END;
  62 /* (Confusingly named) Index is $charset I8 byte; value is
  63  * $charset UTF-EBCDIC equivalent */
  64 END
  65     output_table(\@i82utf, "PL_utf2e");
  66
  67     { #Construct the inverse
  68         my @utf2i8;
  69         for my $i (0 .. 255) {
  70             $utf2i8[$i82utf[$i]] = $i;
  71         }
  72         print $out_fh <<END;
  73 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
  74  * $charset I8 equivalent */
  75 END
  76         output_table(\@utf2i8, "PL_e2utf");
  77     }
  78
  79     {
  80         my @utf8skip;
  81
  82         # These are invariants or continuation bytes.
  83         for my $i (0 .. 0xBF) {
  84             $utf8skip[$i82utf[$i]] = 1;
  85         }
  86
  87         # These are start bytes;  The skip is the number of consecutive highest
  88         # order 1-bits (up to 7)
  89         for my $i (0xC0 .. 255) {
  90             my $count;
  91             if (($i & 0b11111110) == 0b11111110) {
  92                 $count= 7;
  93             }
  94             elsif (($i & 0b11111100) == 0b11111100) {
  95                 $count= 6;
  96             }
  97             elsif (($i & 0b11111000) == 0b11111000) {
  98                 $count= 5;
  99             }
 100             elsif (($i & 0b11110000) == 0b11110000) {
 101                 $count= 4;
 102             }
 103             elsif (($i & 0b11100000) == 0b11100000) {
 104                 $count= 3;
 105             }
 106             elsif (($i & 0b11000000) == 0b11000000) {
 107                 $count= 2;
 108             }
 109             else {
 110                 die "Something wrong for UTF8SKIP calculation for $i";
 111             }
 112             $utf8skip[$i82utf[$i]] = $count;
 113         }
 114
 115         print $out_fh <<END;
 116 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
 117  * 1 for continuation.  Adapted from the shadow flags table in tr16.  The
 118  * entries marked 9 in tr16 are continuation bytes and are marked as length 1
 119  * here so that we can recover. */
 120 END
 121         output_table(\@utf8skip, "PL_utf8skip");
 122     }
 123
 124     use feature 'unicode_strings';
 125
 126     {
 127         my @lc;
 128         for my $i (0 .. 255) {
 129             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 130         }
 131         print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
 132         output_table(\@lc, "PL_latin1_lc");
 133     }
 134
 135     {
 136         my @uc;
 137         for my $i (0 .. 255) {
 138             my $uc = uc chr $i;
 139             if (length $uc > 1 || ord $uc > 255) {
 140                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 141             }
 142             $uc[$a2e[$i]] = $a2e[ord $uc];
 143         }
 144         print $out_fh <<END;
 145 /* Index is $charset code point; value is its uppercase equivalent.
 146  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 147  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 148 END
 149         output_table(\@uc, "PL_mod_latin1_uc");
 150     }
 151
 152     { # PL_fold
 153         my @ascii_fold;
 154         for my $i (0 .. 255) {  # Initialise to identity map
 155             $ascii_fold[$i] = $i;
 156         }
 157
 158         # Overwrite the entries that aren't identity
 159         for my $chr ('A' .. 'Z') {
 160             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 161         }
 162         for my $chr ('a' .. 'z') {
 163             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 164         }
 165         print $out_fh <<END;
 166 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 167  * is A-Z; all other code points map to themselves */
 168 END
 169         output_table(\@ascii_fold, "PL_fold");
 170     }
 171
 172     {
 173         my @latin1_fold;
 174         for my $i (0 .. 255) {
 175             my $char = chr $i;
 176             my $lc = lc $char;
 177
 178             # lc and uc adequately proxy for fold-case pairs in this 0-255
 179             # range
 180             my $uc = uc $char;
 181             $uc = $char if length $uc > 1 || ord $uc > 255;
 182             if ($lc ne $char) {
 183                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 184             }
 185             elsif ($uc ne $char) {
 186                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 187             }
 188             else {
 189                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 190             }
 191         }
 192         print $out_fh <<END;
 193 /* Index is $charset code point; value is its other fold-pair equivalent
 194  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 195  * the code point itself */
 196 END
 197         output_table(\@latin1_fold, "PL_fold_latin1");
 198     }
 199
 200     print $out_fh get_conditional_compile_line_end();
 201 }
 202
 203 print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
 204
 205 read_only_bottom_close_and_rename($out_fh);