Commit | Line | Data |
---|---|---|
4bc3dcfa KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
3d7c117d MB |
4 | |
5 | BEGIN { unshift @INC, '.' } | |
6 | ||
7 | require './regen/regen_lib.pl'; | |
8 | require './regen/charset_translations.pl'; | |
4bc3dcfa KW |
9 | |
10 | # Generates the EBCDIC translation tables that were formerly hard-coded into | |
11 | # utfebcdic.h | |
12 | ||
13 | my $out_fh = open_new('ebcdic_tables.h', '>', | |
14 | {style => '*', by => $0, }); | |
15 | ||
702cfe48 | 16 | sub output_table ($$;$) { |
4bc3dcfa KW |
17 | my $table_ref = shift; |
18 | my $name = shift; | |
19 | ||
43bf0d50 | 20 | # Tables in hex easier to debug, but don't fit into 80 columns |
702cfe48 | 21 | my $print_in_hex = shift // 1; |
43bf0d50 | 22 | |
4bc3dcfa KW |
23 | die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256; |
24 | ||
25 | print $out_fh "EXTCONST U8 $name\[\] = {\n"; | |
26 | ||
702cfe48 KW |
27 | my $column_numbers= "/*_0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/\n"; |
28 | print $out_fh $column_numbers if $print_in_hex; | |
4bc3dcfa | 29 | for my $i (0 .. 255) { |
43bf0d50 | 30 | if ($print_in_hex) { |
702cfe48 KW |
31 | # No row headings, so will fit in 80 cols. |
32 | #printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0; | |
33 | printf $out_fh "0x%02X", $table_ref->[$i]; | |
43bf0d50 KW |
34 | } |
35 | else { | |
36 | printf $out_fh "%4d", $table_ref->[$i]; | |
37 | } | |
4bc3dcfa | 38 | print $out_fh ",", if $i < 255; |
702cfe48 KW |
39 | #print $out_fh ($i < 255) ? "," : " "; |
40 | #printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15; | |
4bc3dcfa KW |
41 | print $out_fh "\n" if $i % 16 == 15; |
42 | } | |
702cfe48 | 43 | print $out_fh $column_numbers if $print_in_hex; |
4bc3dcfa KW |
44 | print $out_fh "};\n\n"; |
45 | } | |
46 | ||
47 | print $out_fh <<END; | |
48 | ||
49 | #ifndef H_EBCDIC_TABLES /* Guard against nested #includes */ | |
50 | #define H_EBCDIC_TABLES 1 | |
51 | ||
52 | /* This file contains definitions for various tables used in EBCDIC handling. | |
53 | * More info is in utfebcdic.h */ | |
54 | END | |
55 | ||
56 | my @charsets = get_supported_code_pages(); | |
57 | shift @charsets; # ASCII is the 0th, and we don't deal with that here. | |
58 | foreach my $charset (@charsets) { | |
c30a0cf2 TC |
59 | # we process the whole array several times, make a copy |
60 | my @a2e = @{get_a2n($charset)}; | |
4bc3dcfa KW |
61 | |
62 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
63 | print $out_fh "\n"; | |
64 | ||
65 | print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n"; | |
66 | output_table(\@a2e, "PL_a2e"); | |
67 | ||
68 | { # Construct the inverse | |
69 | my @e2a; | |
70 | for my $i (0 .. 255) { | |
71 | $e2a[$a2e[$i]] = $i; | |
72 | } | |
73 | print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n"; | |
74 | output_table(\@e2a, "PL_e2a"); | |
75 | } | |
76 | ||
e0dcdb0a | 77 | my @i82utf = @{get_I8_2_utf($charset)}; |
4bc3dcfa KW |
78 | print $out_fh <<END; |
79 | /* (Confusingly named) Index is $charset I8 byte; value is | |
80 | * $charset UTF-EBCDIC equivalent */ | |
81 | END | |
82 | output_table(\@i82utf, "PL_utf2e"); | |
83 | ||
84 | { #Construct the inverse | |
85 | my @utf2i8; | |
86 | for my $i (0 .. 255) { | |
87 | $utf2i8[$i82utf[$i]] = $i; | |
88 | } | |
89 | print $out_fh <<END; | |
90 | /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is | |
91 | * $charset I8 equivalent */ | |
92 | END | |
93 | output_table(\@utf2i8, "PL_e2utf"); | |
94 | } | |
95 | ||
96 | { | |
97 | my @utf8skip; | |
98 | ||
99 | # These are invariants or continuation bytes. | |
100 | for my $i (0 .. 0xBF) { | |
101 | $utf8skip[$i82utf[$i]] = 1; | |
102 | } | |
103 | ||
104 | # These are start bytes; The skip is the number of consecutive highest | |
105 | # order 1-bits (up to 7) | |
106 | for my $i (0xC0 .. 255) { | |
107 | my $count; | |
c0236afe KW |
108 | if ($i == 0b11111111) { |
109 | no warnings 'once'; | |
110 | $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES; | |
111 | } | |
112 | elsif (($i & 0b11111110) == 0b11111110) { | |
4bc3dcfa KW |
113 | $count= 7; |
114 | } | |
115 | elsif (($i & 0b11111100) == 0b11111100) { | |
116 | $count= 6; | |
117 | } | |
118 | elsif (($i & 0b11111000) == 0b11111000) { | |
119 | $count= 5; | |
120 | } | |
121 | elsif (($i & 0b11110000) == 0b11110000) { | |
122 | $count= 4; | |
123 | } | |
124 | elsif (($i & 0b11100000) == 0b11100000) { | |
125 | $count= 3; | |
126 | } | |
127 | elsif (($i & 0b11000000) == 0b11000000) { | |
128 | $count= 2; | |
129 | } | |
130 | else { | |
131 | die "Something wrong for UTF8SKIP calculation for $i"; | |
132 | } | |
133 | $utf8skip[$i82utf[$i]] = $count; | |
134 | } | |
135 | ||
136 | print $out_fh <<END; | |
137 | /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes; | |
138 | * 1 for continuation. Adapted from the shadow flags table in tr16. The | |
139 | * entries marked 9 in tr16 are continuation bytes and are marked as length 1 | |
140 | * here so that we can recover. */ | |
141 | END | |
702cfe48 KW |
142 | output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print |
143 | # in hex | |
4bc3dcfa KW |
144 | } |
145 | ||
146 | use feature 'unicode_strings'; | |
147 | ||
148 | { | |
149 | my @lc; | |
150 | for my $i (0 .. 255) { | |
151 | $lc[$a2e[$i]] = $a2e[ord lc chr $i]; | |
152 | } | |
153 | print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n"; | |
154 | output_table(\@lc, "PL_latin1_lc"); | |
155 | } | |
156 | ||
157 | { | |
158 | my @uc; | |
159 | for my $i (0 .. 255) { | |
160 | my $uc = uc chr $i; | |
161 | if (length $uc > 1 || ord $uc > 255) { | |
162 | $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; | |
163 | } | |
164 | $uc[$a2e[$i]] = $a2e[ord $uc]; | |
165 | } | |
166 | print $out_fh <<END; | |
167 | /* Index is $charset code point; value is its uppercase equivalent. | |
168 | * The 'mod' in the name means that codepoints whose uppercase is above 255 or | |
169 | * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ | |
170 | END | |
171 | output_table(\@uc, "PL_mod_latin1_uc"); | |
172 | } | |
173 | ||
174 | { # PL_fold | |
175 | my @ascii_fold; | |
176 | for my $i (0 .. 255) { # Initialise to identity map | |
177 | $ascii_fold[$i] = $i; | |
178 | } | |
179 | ||
180 | # Overwrite the entries that aren't identity | |
181 | for my $chr ('A' .. 'Z') { | |
182 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr]; | |
183 | } | |
184 | for my $chr ('a' .. 'z') { | |
185 | $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr]; | |
186 | } | |
187 | print $out_fh <<END; | |
188 | /* Index is $charset code point; For A-Z, value is a-z; for a-z, value | |
189 | * is A-Z; all other code points map to themselves */ | |
190 | END | |
191 | output_table(\@ascii_fold, "PL_fold"); | |
192 | } | |
193 | ||
194 | { | |
195 | my @latin1_fold; | |
196 | for my $i (0 .. 255) { | |
197 | my $char = chr $i; | |
198 | my $lc = lc $char; | |
199 | ||
200 | # lc and uc adequately proxy for fold-case pairs in this 0-255 | |
201 | # range | |
202 | my $uc = uc $char; | |
203 | $uc = $char if length $uc > 1 || ord $uc > 255; | |
204 | if ($lc ne $char) { | |
205 | $latin1_fold[$a2e[$i]] = $a2e[ord $lc]; | |
206 | } | |
207 | elsif ($uc ne $char) { | |
208 | $latin1_fold[$a2e[$i]] = $a2e[ord $uc]; | |
209 | } | |
210 | else { | |
211 | $latin1_fold[$a2e[$i]] = $a2e[$i]; | |
212 | } | |
213 | } | |
214 | print $out_fh <<END; | |
215 | /* Index is $charset code point; value is its other fold-pair equivalent | |
216 | * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is | |
217 | * the code point itself */ | |
218 | END | |
219 | output_table(\@latin1_fold, "PL_fold_latin1"); | |
220 | } | |
221 | ||
222 | print $out_fh get_conditional_compile_line_end(); | |
223 | } | |
224 | ||
225 | print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n"; | |
226 | ||
227 | read_only_bottom_close_and_rename($out_fh); |