4 require 'regen/regen_lib.pl';
5 require 'regen/charset_translations.pl';
7 # Generates the EBCDIC translation tables that were formerly hard-coded into
10 my $out_fh = open_new('ebcdic_tables.h', '>',
11 {style => '*', by => $0, });
13 sub output_table ($$) {
14 my $table_ref = shift;
17 die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
19 print $out_fh "EXTCONST U8 $name\[\] = {\n";
21 for my $i (0 .. 255) {
22 printf $out_fh "%4d", $table_ref->[$i];
23 #printf $out_fh " 0x%02X", $table_ref->[$i];
24 print $out_fh ",", if $i < 255;
25 print $out_fh "\n" if $i % 16 == 15;
27 print $out_fh "};\n\n";
32 #ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
33 #define H_EBCDIC_TABLES 1
35 /* This file contains definitions for various tables used in EBCDIC handling.
36 * More info is in utfebcdic.h */
39 my @charsets = get_supported_code_pages();
40 shift @charsets; # ASCII is the 0th, and we don't deal with that here.
41 foreach my $charset (@charsets) {
42 # we process the whole array several times, make a copy
43 my @a2e = @{get_a2n($charset)};
45 print $out_fh "\n" . get_conditional_compile_line_start($charset);
48 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
49 output_table(\@a2e, "PL_a2e");
51 { # Construct the inverse
53 for my $i (0 .. 255) {
56 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
57 output_table(\@e2a, "PL_e2a");
60 my @i82utf = @{get_I8_2_utf($charset)};
62 /* (Confusingly named) Index is $charset I8 byte; value is
63 * $charset UTF-EBCDIC equivalent */
65 output_table(\@i82utf, "PL_utf2e");
67 { #Construct the inverse
69 for my $i (0 .. 255) {
70 $utf2i8[$i82utf[$i]] = $i;
73 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
74 * $charset I8 equivalent */
76 output_table(\@utf2i8, "PL_e2utf");
82 # These are invariants or continuation bytes.
83 for my $i (0 .. 0xBF) {
84 $utf8skip[$i82utf[$i]] = 1;
87 # These are start bytes; The skip is the number of consecutive highest
88 # order 1-bits (up to 7)
89 for my $i (0xC0 .. 255) {
91 if (($i & 0b11111110) == 0b11111110) {
94 elsif (($i & 0b11111100) == 0b11111100) {
97 elsif (($i & 0b11111000) == 0b11111000) {
100 elsif (($i & 0b11110000) == 0b11110000) {
103 elsif (($i & 0b11100000) == 0b11100000) {
106 elsif (($i & 0b11000000) == 0b11000000) {
110 die "Something wrong for UTF8SKIP calculation for $i";
112 $utf8skip[$i82utf[$i]] = $count;
116 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
117 * 1 for continuation. Adapted from the shadow flags table in tr16. The
118 * entries marked 9 in tr16 are continuation bytes and are marked as length 1
119 * here so that we can recover. */
121 output_table(\@utf8skip, "PL_utf8skip");
124 use feature 'unicode_strings';
128 for my $i (0 .. 255) {
129 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
131 print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
132 output_table(\@lc, "PL_latin1_lc");
137 for my $i (0 .. 255) {
139 if (length $uc > 1 || ord $uc > 255) {
140 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
142 $uc[$a2e[$i]] = $a2e[ord $uc];
145 /* Index is $charset code point; value is its uppercase equivalent.
146 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
147 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
149 output_table(\@uc, "PL_mod_latin1_uc");
154 for my $i (0 .. 255) { # Initialise to identity map
155 $ascii_fold[$i] = $i;
158 # Overwrite the entries that aren't identity
159 for my $chr ('A' .. 'Z') {
160 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
162 for my $chr ('a' .. 'z') {
163 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
166 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
167 * is A-Z; all other code points map to themselves */
169 output_table(\@ascii_fold, "PL_fold");
174 for my $i (0 .. 255) {
178 # lc and uc adequately proxy for fold-case pairs in this 0-255
181 $uc = $char if length $uc > 1 || ord $uc > 255;
183 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
185 elsif ($uc ne $char) {
186 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
189 $latin1_fold[$a2e[$i]] = $a2e[$i];
193 /* Index is $charset code point; value is its other fold-pair equivalent
194 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
195 * the code point itself */
197 output_table(\@latin1_fold, "PL_fold_latin1");
200 print $out_fh get_conditional_compile_line_end();
203 print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
205 read_only_bottom_close_and_rename($out_fh);