This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Add dependency
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5require 'regen/charset_translations.pl';
6
7# Generates the EBCDIC translation tables that were formerly hard-coded into
8# utfebcdic.h
9
10my $out_fh = open_new('ebcdic_tables.h', '>',
11 {style => '*', by => $0, });
12
13sub output_table ($$) {
14 my $table_ref = shift;
15 my $name = shift;
16
43bf0d50
KW
17 # Tables in hex easier to debug, but don't fit into 80 columns
18 my $print_in_hex = 0;
19
4bc3dcfa
KW
20 die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
21
22 print $out_fh "EXTCONST U8 $name\[\] = {\n";
23
43bf0d50 24 print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex;
4bc3dcfa 25 for my $i (0 .. 255) {
43bf0d50
KW
26 if ($print_in_hex) {
27 printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
28 printf $out_fh " 0x%02X", $table_ref->[$i];
29 }
30 else {
31 printf $out_fh "%4d", $table_ref->[$i];
32 }
33 printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
4bc3dcfa
KW
34 print $out_fh ",", if $i < 255;
35 print $out_fh "\n" if $i % 16 == 15;
36 }
43bf0d50 37 print $out_fh "/* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F */\n" if $print_in_hex;
4bc3dcfa
KW
38 print $out_fh "};\n\n";
39}
40
41print $out_fh <<END;
42
43#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
44#define H_EBCDIC_TABLES 1
45
46/* This file contains definitions for various tables used in EBCDIC handling.
47 * More info is in utfebcdic.h */
48END
49
50my @charsets = get_supported_code_pages();
51shift @charsets; # ASCII is the 0th, and we don't deal with that here.
52foreach my $charset (@charsets) {
c30a0cf2
TC
53 # we process the whole array several times, make a copy
54 my @a2e = @{get_a2n($charset)};
4bc3dcfa
KW
55
56 print $out_fh "\n" . get_conditional_compile_line_start($charset);
57 print $out_fh "\n";
58
59 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
60 output_table(\@a2e, "PL_a2e");
61
62 { # Construct the inverse
63 my @e2a;
64 for my $i (0 .. 255) {
65 $e2a[$a2e[$i]] = $i;
66 }
67 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
68 output_table(\@e2a, "PL_e2a");
69 }
70
e0dcdb0a 71 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
72 print $out_fh <<END;
73/* (Confusingly named) Index is $charset I8 byte; value is
74 * $charset UTF-EBCDIC equivalent */
75END
76 output_table(\@i82utf, "PL_utf2e");
77
78 { #Construct the inverse
79 my @utf2i8;
80 for my $i (0 .. 255) {
81 $utf2i8[$i82utf[$i]] = $i;
82 }
83 print $out_fh <<END;
84/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
85 * $charset I8 equivalent */
86END
87 output_table(\@utf2i8, "PL_e2utf");
88 }
89
90 {
91 my @utf8skip;
92
93 # These are invariants or continuation bytes.
94 for my $i (0 .. 0xBF) {
95 $utf8skip[$i82utf[$i]] = 1;
96 }
97
98 # These are start bytes; The skip is the number of consecutive highest
99 # order 1-bits (up to 7)
100 for my $i (0xC0 .. 255) {
101 my $count;
102 if (($i & 0b11111110) == 0b11111110) {
103 $count= 7;
104 }
105 elsif (($i & 0b11111100) == 0b11111100) {
106 $count= 6;
107 }
108 elsif (($i & 0b11111000) == 0b11111000) {
109 $count= 5;
110 }
111 elsif (($i & 0b11110000) == 0b11110000) {
112 $count= 4;
113 }
114 elsif (($i & 0b11100000) == 0b11100000) {
115 $count= 3;
116 }
117 elsif (($i & 0b11000000) == 0b11000000) {
118 $count= 2;
119 }
120 else {
121 die "Something wrong for UTF8SKIP calculation for $i";
122 }
123 $utf8skip[$i82utf[$i]] = $count;
124 }
125
126 print $out_fh <<END;
127/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
128 * 1 for continuation. Adapted from the shadow flags table in tr16. The
129 * entries marked 9 in tr16 are continuation bytes and are marked as length 1
130 * here so that we can recover. */
131END
132 output_table(\@utf8skip, "PL_utf8skip");
133 }
134
135 use feature 'unicode_strings';
136
137 {
138 my @lc;
139 for my $i (0 .. 255) {
140 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
141 }
142 print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
143 output_table(\@lc, "PL_latin1_lc");
144 }
145
146 {
147 my @uc;
148 for my $i (0 .. 255) {
149 my $uc = uc chr $i;
150 if (length $uc > 1 || ord $uc > 255) {
151 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
152 }
153 $uc[$a2e[$i]] = $a2e[ord $uc];
154 }
155 print $out_fh <<END;
156/* Index is $charset code point; value is its uppercase equivalent.
157 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
158 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
159END
160 output_table(\@uc, "PL_mod_latin1_uc");
161 }
162
163 { # PL_fold
164 my @ascii_fold;
165 for my $i (0 .. 255) { # Initialise to identity map
166 $ascii_fold[$i] = $i;
167 }
168
169 # Overwrite the entries that aren't identity
170 for my $chr ('A' .. 'Z') {
171 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
172 }
173 for my $chr ('a' .. 'z') {
174 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
175 }
176 print $out_fh <<END;
177/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
178 * is A-Z; all other code points map to themselves */
179END
180 output_table(\@ascii_fold, "PL_fold");
181 }
182
183 {
184 my @latin1_fold;
185 for my $i (0 .. 255) {
186 my $char = chr $i;
187 my $lc = lc $char;
188
189 # lc and uc adequately proxy for fold-case pairs in this 0-255
190 # range
191 my $uc = uc $char;
192 $uc = $char if length $uc > 1 || ord $uc > 255;
193 if ($lc ne $char) {
194 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
195 }
196 elsif ($uc ne $char) {
197 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
198 }
199 else {
200 $latin1_fold[$a2e[$i]] = $a2e[$i];
201 }
202 }
203 print $out_fh <<END;
204/* Index is $charset code point; value is its other fold-pair equivalent
205 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
206 * the code point itself */
207END
208 output_table(\@latin1_fold, "PL_fold_latin1");
209 }
210
211 print $out_fh get_conditional_compile_line_end();
212}
213
214print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
215
216read_only_bottom_close_and_rename($out_fh);