This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Add internal function to abort parsing
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
3d7c117d
MB
4
5BEGIN { unshift @INC, '.' }
6
7require './regen/regen_lib.pl';
8require './regen/charset_translations.pl';
4bc3dcfa
KW
9
10# Generates the EBCDIC translation tables that were formerly hard-coded into
11# utfebcdic.h
12
13my $out_fh = open_new('ebcdic_tables.h', '>',
14 {style => '*', by => $0, });
15
702cfe48 16sub output_table ($$;$) {
4bc3dcfa
KW
17 my $table_ref = shift;
18 my $name = shift;
19
43bf0d50 20 # Tables in hex easier to debug, but don't fit into 80 columns
702cfe48 21 my $print_in_hex = shift // 1;
43bf0d50 22
4bc3dcfa
KW
23 die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
24
25 print $out_fh "EXTCONST U8 $name\[\] = {\n";
26
702cfe48
KW
27 my $column_numbers= "/*_0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/\n";
28 print $out_fh $column_numbers if $print_in_hex;
4bc3dcfa 29 for my $i (0 .. 255) {
43bf0d50 30 if ($print_in_hex) {
702cfe48
KW
31 # No row headings, so will fit in 80 cols.
32 #printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
33 printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50
KW
34 }
35 else {
36 printf $out_fh "%4d", $table_ref->[$i];
37 }
4bc3dcfa 38 print $out_fh ",", if $i < 255;
702cfe48
KW
39 #print $out_fh ($i < 255) ? "," : " ";
40 #printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
4bc3dcfa
KW
41 print $out_fh "\n" if $i % 16 == 15;
42 }
702cfe48 43 print $out_fh $column_numbers if $print_in_hex;
4bc3dcfa
KW
44 print $out_fh "};\n\n";
45}
46
47print $out_fh <<END;
48
49#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
50#define H_EBCDIC_TABLES 1
51
52/* This file contains definitions for various tables used in EBCDIC handling.
53 * More info is in utfebcdic.h */
54END
55
56my @charsets = get_supported_code_pages();
57shift @charsets; # ASCII is the 0th, and we don't deal with that here.
58foreach my $charset (@charsets) {
c30a0cf2
TC
59 # we process the whole array several times, make a copy
60 my @a2e = @{get_a2n($charset)};
4bc3dcfa
KW
61
62 print $out_fh "\n" . get_conditional_compile_line_start($charset);
63 print $out_fh "\n";
64
65 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
66 output_table(\@a2e, "PL_a2e");
67
68 { # Construct the inverse
69 my @e2a;
70 for my $i (0 .. 255) {
71 $e2a[$a2e[$i]] = $i;
72 }
73 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
74 output_table(\@e2a, "PL_e2a");
75 }
76
e0dcdb0a 77 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
78 print $out_fh <<END;
79/* (Confusingly named) Index is $charset I8 byte; value is
80 * $charset UTF-EBCDIC equivalent */
81END
82 output_table(\@i82utf, "PL_utf2e");
83
84 { #Construct the inverse
85 my @utf2i8;
86 for my $i (0 .. 255) {
87 $utf2i8[$i82utf[$i]] = $i;
88 }
89 print $out_fh <<END;
90/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
91 * $charset I8 equivalent */
92END
93 output_table(\@utf2i8, "PL_e2utf");
94 }
95
96 {
97 my @utf8skip;
98
99 # These are invariants or continuation bytes.
100 for my $i (0 .. 0xBF) {
101 $utf8skip[$i82utf[$i]] = 1;
102 }
103
104 # These are start bytes; The skip is the number of consecutive highest
105 # order 1-bits (up to 7)
106 for my $i (0xC0 .. 255) {
107 my $count;
c0236afe
KW
108 if ($i == 0b11111111) {
109 no warnings 'once';
110 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
111 }
112 elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa
KW
113 $count= 7;
114 }
115 elsif (($i & 0b11111100) == 0b11111100) {
116 $count= 6;
117 }
118 elsif (($i & 0b11111000) == 0b11111000) {
119 $count= 5;
120 }
121 elsif (($i & 0b11110000) == 0b11110000) {
122 $count= 4;
123 }
124 elsif (($i & 0b11100000) == 0b11100000) {
125 $count= 3;
126 }
127 elsif (($i & 0b11000000) == 0b11000000) {
128 $count= 2;
129 }
130 else {
131 die "Something wrong for UTF8SKIP calculation for $i";
132 }
133 $utf8skip[$i82utf[$i]] = $count;
134 }
135
136 print $out_fh <<END;
137/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
138 * 1 for continuation. Adapted from the shadow flags table in tr16. The
139 * entries marked 9 in tr16 are continuation bytes and are marked as length 1
140 * here so that we can recover. */
141END
702cfe48
KW
142 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
143 # in hex
4bc3dcfa
KW
144 }
145
146 use feature 'unicode_strings';
147
148 {
149 my @lc;
150 for my $i (0 .. 255) {
151 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
152 }
153 print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
154 output_table(\@lc, "PL_latin1_lc");
155 }
156
157 {
158 my @uc;
159 for my $i (0 .. 255) {
160 my $uc = uc chr $i;
161 if (length $uc > 1 || ord $uc > 255) {
162 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
163 }
164 $uc[$a2e[$i]] = $a2e[ord $uc];
165 }
166 print $out_fh <<END;
167/* Index is $charset code point; value is its uppercase equivalent.
168 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
169 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
170END
171 output_table(\@uc, "PL_mod_latin1_uc");
172 }
173
174 { # PL_fold
175 my @ascii_fold;
176 for my $i (0 .. 255) { # Initialise to identity map
177 $ascii_fold[$i] = $i;
178 }
179
180 # Overwrite the entries that aren't identity
181 for my $chr ('A' .. 'Z') {
182 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
183 }
184 for my $chr ('a' .. 'z') {
185 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
186 }
187 print $out_fh <<END;
188/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
189 * is A-Z; all other code points map to themselves */
190END
191 output_table(\@ascii_fold, "PL_fold");
192 }
193
194 {
195 my @latin1_fold;
196 for my $i (0 .. 255) {
197 my $char = chr $i;
198 my $lc = lc $char;
199
200 # lc and uc adequately proxy for fold-case pairs in this 0-255
201 # range
202 my $uc = uc $char;
203 $uc = $char if length $uc > 1 || ord $uc > 255;
204 if ($lc ne $char) {
205 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
206 }
207 elsif ($uc ne $char) {
208 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
209 }
210 else {
211 $latin1_fold[$a2e[$i]] = $a2e[$i];
212 }
213 }
214 print $out_fh <<END;
215/* Index is $charset code point; value is its other fold-pair equivalent
216 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
217 * the code point itself */
218END
219 output_table(\@latin1_fold, "PL_fold_latin1");
220 }
221
222 print $out_fh get_conditional_compile_line_end();
223}
224
225print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
226
227read_only_bottom_close_and_rename($out_fh);