This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
pat_advanced.t: Update test
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
94e72741 4use integer;
3d7c117d
MB
5
6BEGIN { unshift @INC, '.' }
7
8require './regen/regen_lib.pl';
9require './regen/charset_translations.pl';
4bc3dcfa
KW
10
11# Generates the EBCDIC translation tables that were formerly hard-coded into
12# utfebcdic.h
13
14my $out_fh = open_new('ebcdic_tables.h', '>',
15 {style => '*', by => $0, });
16
94e72741
KW
17sub get_column_headers ($$;$) {
18 my ($row_hdr_len, $field_width, $dfa_columns) = @_;
19 my $format;
20 my $final_column_format;
21 my $num_columns;
22
23 if (defined $dfa_columns) {
24 $num_columns = $dfa_columns;
25
26 # Trailing blank to correspond with commas in the rows below
27 $format = "%${field_width}d ";
28 }
29 else { # Is a regular table
30 $num_columns = 16;
31
32 # Use blanks to separate the fields
33 $format = " " x ( $field_width
34 - 2); # For the '_X'
35 $format .= "_%X "; # Again, trailing blank over the commas below
36 }
37
38 my $header = "/*" . " " x ($row_hdr_len - length "/*");
39
40 # All but the final column
41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
42
43 # Get rid of trailing blank, so that the final column takes up one less
44 # space so that the "*/" doesn't extend past the commas in the rows below
45 chop $header;
46 $header .= sprintf $format, $num_columns - 1;
47
48 # Again, remove trailing blank
49 chop $header;
50
51 return $header . "*/\n";
52}
53
8c3c6496
KW
54sub output_table_start($$$;$) {
55 my ($out_fh, $TYPE, $name, $size) = @_;
635ff1f9 56
8c3c6496
KW
57 $size = "" unless defined $size;
58 my $declaration = "EXTCONST $TYPE $name\[$size\]";
635ff1f9
KW
59 print $out_fh <<EOF;
60# ifndef DOINIT
0a142f46 61 $declaration;
635ff1f9 62# else
0a142f46 63 $declaration = {
635ff1f9
KW
64EOF
65}
66
67sub output_table_end($) {
68 print $out_fh "};\n# endif\n\n";
69}
70
702cfe48 71sub output_table ($$;$) {
4bc3dcfa
KW
72 my $table_ref = shift;
73 my $name = shift;
74
94e72741
KW
75 # 0 => print in decimal
76 # 1 => print in hex (translates code point to code point)
f6521f7c 77 # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
94e72741
KW
78 # The number is how many columns in the part after the code point
79 # portion.
80 #
81 # code point tables in hex areasier to debug, but don't fit into 80
82 # columns
83 my $type = shift // 1;
84
85 my $print_in_hex = $type == 1;
86 my $is_dfa = ($type >= 2) ? $type : 0;
87 my $columns_after_256 = 16;
88
89 die "Requres 256 entries in table $name, got @$table_ref"
90 if ! $is_dfa && @$table_ref != 256;
91 if (! $is_dfa) {
92 die "Requres 256 entries in table $name, got @$table_ref"
93 if @$table_ref != 256;
94 }
95 else {
96 $columns_after_256 = $is_dfa;
97
98 print $out_fh <<'EOF';
99
100/* The table below is adapted from
f6521f7c 101 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
94e72741
KW
102 * See copyright notice at the beginning of this file.
103 */
104
105EOF
106 }
107
108 # Highest number in the table
109 my $max_entry = 0;
110 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
43bf0d50 111
94e72741
KW
112 # We assume that every table has at least one two digit entry, and none
113 # are more than three digit.
114 my $field_width = ($print_in_hex)
115 ? 4
116 : (($max_entry) > 99 ? 3 : 2);
4bc3dcfa 117
94e72741
KW
118 my $row_hdr_length;
119 my $node_number_field_width;
120 my $node_value_field_width;
121
122 # dfa tables have a special header for the rows in the transitions part of
123 # the table. It is longer than the regular one.
124 if ($is_dfa) {
125 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
126 $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
127 $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
128 ? 3 : 2;
129 # The header starts with this template, and adds in the number of
130 # digits needed to represent the maximum node number and its value
131 $row_hdr_length = length("/*N=*/")
132 + $node_number_field_width
133 + $node_value_field_width;
134 }
135 else {
136 $row_hdr_length = length "/*_X*/"; # Template for what the header
137 # looks like
138 }
139
140 # The table may not be representable in 8 bits.
141 my $TYPE = 'U8';
142 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
143
635ff1f9 144 output_table_start $out_fh, $TYPE, $name;
4bc3dcfa 145
94e72741
KW
146 # First the headers for the columns
147 print $out_fh get_column_headers($row_hdr_length, $field_width);
148
149 # Now the table body
150 my $count = @$table_ref;
151 my $last_was_nl = 1;
152
153 # Print each element individually, arranged in rows of columns
154 for my $i (0 .. $count - 1) {
155
156 # Node number for here is -1 until get into the dfa state transitions
157 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
158
159 # Print row header at beginning of each row
160 if ($last_was_nl) {
161 if ($node >= 0) {
162 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
163 $node_value_field_width, $i - 256;
164 }
165 else { # Otherwise is regular row; print its number
166 printf $out_fh "/*%X_", $i / 16;
167
168 # These rows in a dfa table require extra space so columns
169 # will align vertically (because the Ndd=ddd requires extra
170 # space)
171 if ($is_dfa) {
172 print $out_fh " " x ( $node_number_field_width
173 + $node_value_field_width);
174 }
175 print $out_fh "*/";
176 }
177 }
178
43bf0d50 179 if ($print_in_hex) {
702cfe48 180 printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50
KW
181 }
182 else {
94e72741
KW
183 printf $out_fh "%${field_width}d", $table_ref->[$i];
184 }
185
186 print $out_fh ",", if $i < $count -1; # No comma on final entry
187
188 # Add \n if at end of row, which is 16 columns until we get to the
189 # transitions part
190 if ( ($node < 0 && $i % 16 == 15)
191 || ($node >= 0 && ($i -256) % $columns_after_256
192 == $columns_after_256 - 1))
193 {
194 print $out_fh "\n";
195 $last_was_nl = 1;
196 }
197 else {
198 $last_was_nl = 0;
43bf0d50 199 }
4bc3dcfa 200 }
94e72741
KW
201
202 # Print column footer
203 print $out_fh get_column_headers($row_hdr_length, $field_width,
204 ($is_dfa) ? $columns_after_256 : undef);
205
635ff1f9 206 output_table_end($out_fh);
4bc3dcfa
KW
207}
208
3de6d141 209print $out_fh <<'END';
4bc3dcfa 210
6a5bc5ac
KW
211#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */
212#define PERL_EBCDIC_TABLES_H_ 1
4bc3dcfa
KW
213
214/* This file contains definitions for various tables used in EBCDIC handling.
3de6d141
KW
215 * More info is in utfebcdic.h
216 *
217 * Some of the tables are adapted from
f6521f7c 218 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
3de6d141
KW
219 * which requires this copyright notice:
220
221Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
222
223Permission is hereby granted, free of charge, to any person obtaining a copy of
224this software and associated documentation files (the "Software"), to deal in
225the Software without restriction, including without limitation the rights to
226use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
227of the Software, and to permit persons to whom the Software is furnished to do
228so, subject to the following conditions:
229
230The above copyright notice and this permission notice shall be included in all
231copies or substantial portions of the Software.
232
233THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
234IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
235FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
236AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
237LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
238OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
239SOFTWARE.
240
241*/
4bc3dcfa
KW
242END
243
244my @charsets = get_supported_code_pages();
245shift @charsets; # ASCII is the 0th, and we don't deal with that here.
246foreach my $charset (@charsets) {
c30a0cf2
TC
247 # we process the whole array several times, make a copy
248 my @a2e = @{get_a2n($charset)};
b37fc6e8 249 my @e2a;
4bc3dcfa
KW
250
251 print $out_fh "\n" . get_conditional_compile_line_start($charset);
252 print $out_fh "\n";
253
254 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
255 output_table(\@a2e, "PL_a2e");
256
257 { # Construct the inverse
4bc3dcfa
KW
258 for my $i (0 .. 255) {
259 $e2a[$a2e[$i]] = $i;
260 }
261 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
262 output_table(\@e2a, "PL_e2a");
263 }
264
e0dcdb0a 265 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
266 print $out_fh <<END;
267/* (Confusingly named) Index is $charset I8 byte; value is
268 * $charset UTF-EBCDIC equivalent */
269END
270 output_table(\@i82utf, "PL_utf2e");
271
272 { #Construct the inverse
273 my @utf2i8;
274 for my $i (0 .. 255) {
275 $utf2i8[$i82utf[$i]] = $i;
276 }
277 print $out_fh <<END;
278/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
279 * $charset I8 equivalent */
280END
281 output_table(\@utf2i8, "PL_e2utf");
282 }
283
284 {
285 my @utf8skip;
286
287 # These are invariants or continuation bytes.
288 for my $i (0 .. 0xBF) {
289 $utf8skip[$i82utf[$i]] = 1;
290 }
291
292 # These are start bytes; The skip is the number of consecutive highest
293 # order 1-bits (up to 7)
294 for my $i (0xC0 .. 255) {
295 my $count;
c0236afe
KW
296 if ($i == 0b11111111) {
297 no warnings 'once';
298 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
299 }
300 elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa
KW
301 $count= 7;
302 }
303 elsif (($i & 0b11111100) == 0b11111100) {
304 $count= 6;
305 }
306 elsif (($i & 0b11111000) == 0b11111000) {
307 $count= 5;
308 }
309 elsif (($i & 0b11110000) == 0b11110000) {
310 $count= 4;
311 }
312 elsif (($i & 0b11100000) == 0b11100000) {
313 $count= 3;
314 }
315 elsif (($i & 0b11000000) == 0b11000000) {
316 $count= 2;
317 }
318 else {
319 die "Something wrong for UTF8SKIP calculation for $i";
320 }
321 $utf8skip[$i82utf[$i]] = $count;
322 }
323
324 print $out_fh <<END;
4719093e
KW
325/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
326 * (including for overlongs); 1 for continuation. Adapted from the shadow
327 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes
328 * and are marked as length 1 here so that we can recover. */
4bc3dcfa 329END
702cfe48
KW
330 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
331 # in hex
4bc3dcfa
KW
332 }
333
334 use feature 'unicode_strings';
335
336 {
337 my @lc;
338 for my $i (0 .. 255) {
339 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
340 }
94e72741
KW
341 print $out_fh
342 "/* Index is $charset code point; value is its lowercase equivalent */\n";
4bc3dcfa
KW
343 output_table(\@lc, "PL_latin1_lc");
344 }
345
346 {
347 my @uc;
348 for my $i (0 .. 255) {
349 my $uc = uc chr $i;
350 if (length $uc > 1 || ord $uc > 255) {
351 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
352 }
353 $uc[$a2e[$i]] = $a2e[ord $uc];
354 }
355 print $out_fh <<END;
356/* Index is $charset code point; value is its uppercase equivalent.
357 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
358 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
359END
360 output_table(\@uc, "PL_mod_latin1_uc");
361 }
362
363 { # PL_fold
364 my @ascii_fold;
365 for my $i (0 .. 255) { # Initialise to identity map
366 $ascii_fold[$i] = $i;
367 }
368
369 # Overwrite the entries that aren't identity
370 for my $chr ('A' .. 'Z') {
371 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
372 }
373 for my $chr ('a' .. 'z') {
374 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
375 }
376 print $out_fh <<END;
377/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
378 * is A-Z; all other code points map to themselves */
379END
380 output_table(\@ascii_fold, "PL_fold");
381 }
382
383 {
384 my @latin1_fold;
385 for my $i (0 .. 255) {
386 my $char = chr $i;
387 my $lc = lc $char;
388
389 # lc and uc adequately proxy for fold-case pairs in this 0-255
390 # range
391 my $uc = uc $char;
392 $uc = $char if length $uc > 1 || ord $uc > 255;
393 if ($lc ne $char) {
394 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
395 }
396 elsif ($uc ne $char) {
397 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
398 }
399 else {
400 $latin1_fold[$a2e[$i]] = $a2e[$i];
401 }
402 }
403 print $out_fh <<END;
404/* Index is $charset code point; value is its other fold-pair equivalent
405 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
406 * the code point itself */
407END
408 output_table(\@latin1_fold, "PL_fold_latin1");
409 }
410
3de6d141
KW
411 {
412 # This generates the dfa table for perl extended UTF-8, which accepts
413 # surrogates, non-characters, and accepts start bytes up through FE
414 # (start byte FF has to be handled outside this dfa). The class numbers
415 # for start bytes are constrained so that they can be used as a shift
416 # count for masking off the leading one bits
417 #
418 # The classes are
419 # 00-9F 0
420 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC
421 # FE
422 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC
423 # A4-A7 9 Not legal immediately after start bytes F0 F8
424 # A8-AF 10 Not legal immediately after start bytes F0
425 # B0-BF 11
426 # C0-C4 1
427 # C5-DF 2
428 # E0 1
429 # E1-EF 3
430 # F0 12
431 # F1-F7 4
432 # F8 13
433 # F9-FB 5
434 # FC 14
435 # FD 6
436 # FE 15
437 # FF 1
438 #
439 # Here's the I8 for the code points before which overlongs occur:
440 # U+4000: \xF0\xB0\xA0\xA0
441 # U+40000: \xF8\xA8\xA0\xA0\xA0
442 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0
443 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0
444 #
445 # The first part of the table maps bytes to character classes to reduce
446 # the size of the transition table and create bitmasks.
447 #
448 # The second part is a transition table that maps a combination of a
449 # state of the automaton and a character class to a new state. The
450 # numbering of the original nodes is retained, but some have been split
451 # so that there are new nodes. They mean:
452 # N0 The initial state, and final accepting one.
453 # N1 One continuation byte (A0-BF) left. This is transitioned to
454 # immediately when the start byte indicates a two-byte sequence
455 # N2 Two continuation bytes left.
456 # N3 Three continuation bytes left.
457 # N4 Four continuation bytes left.
458 # N5 Five continuation bytes left.
459 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal
460 # (overlong); the other continuations transition to N2
461 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal
462 # (overlong); the other continuations transition to N3
463 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal
464 # (overlong); the other continuations transition to N4
465 # N9 Start byte is FE. Continuation bytes A[01] are illegal
466 # (overlong); the other continuations transition to N5
467 # 1 Reject. All transitions not mentioned above (except the single
468 # byte ones (as they are always legal) are to this state.
469
470 my $NUM_CLASSES = 16;
471 my $N0 = 0;
472 my $N1 = $N0 + $NUM_CLASSES;
473 my $N2 = $N1 + $NUM_CLASSES;
474 my $N3 = $N2 + $NUM_CLASSES;
475 my $N4 = $N3 + $NUM_CLASSES;
476 my $N5 = $N4 + $NUM_CLASSES;
477 my $N6 = $N5 + $NUM_CLASSES;
478 my $N7 = $N6 + $NUM_CLASSES;
479 my $N8 = $N7 + $NUM_CLASSES;
480 my $N9 = $N8 + $NUM_CLASSES;
481 my $N10 = $N9 + $NUM_CLASSES;
482
483 my @perl_extended_utf8_dfa;
484 my @i8 = (
485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
494 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
495 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
496 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
497 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
498 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
499 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
500 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
501 );
502 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
503 push @perl_extended_utf8_dfa, (
504 # Class:
505 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
506 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0
507 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1
508 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2
509 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3
510 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4
511 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5
512
513 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6
514 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7
515 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8
516 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9
517 );
71525f77 518 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
3de6d141
KW
519 $NUM_CLASSES);
520 }
521
123deead
KW
522 {
523 # This generates the dfa table for strict UTF-8, which rejects
524 # surrogates, non-characters, and above Unicode.
525 #
526 # The classes are
527 # 00-9F 0 Always legal at start
528 # A0 10 Not legal immediately after start bytes F0 F8
529 # A1 11 Not legal immediately after start bytes F0 F8,
530 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9
531 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9
532 # A9,AB,AD 14 Not legal immediately after start byte F0
533 # AE 15 Not legal immediately after start byte F0
534 # AF 16 Not legal immediately after start bytes F0
535 # B[0248AC] 17 Not legal immediately after start byte F9
536 # B[1359D] 18 Not legal immediately after start byte F9
537 # B6 19 Not legal immediately after start byte F9
538 # B7 20 Not legal immediately after start byte F9
539 # BE 21 Not legal immediately after start byte F9
540 # BF 22 Not legal immediately after start byte F9
541 # C0-C4 1 (reject, all are overlong)
542 # C5-DF 2 Accepts any legal continuation
543 # E0 1 (reject, all are overlong)
544 # E1-EF 3 Accepts any legal continuation
545 # F0 8 (has overlongs)
546 # F1 6 (has surrogates, non-chars)
547 # F2,F4,F6 4 Accepts any legal continuation
548 # F3,F5,F7 5 (has non-chars)
549 # F8 9 (has overlongs, non-chars)
550 # F9 7 (has non-chars, non-Unicode)
551 # FA-FF 1 (reject, all are non-Unicode)
552 #
553 # Here's the I8 for enough code points so that you can figure out what's
554 # going on:
555 #
556 # U+D800: \xF1\xB6\xA0\xA0
557 # U+DFFF: \xF1\xB7\xBF\xBF
558 # U+FDD0: \xF1\xBF\xAE\xB0
559 # U+FDEF: \xF1\xBF\xAF\xAF
560 # U+FFFE: \xF1\xBF\xBF\xBE
561 # U+1FFFE: \xF3\xBF\xBF\xBE
562 # U+2FFFE: \xF5\xBF\xBF\xBE
563 # U+3FFFE: \xF7\xBF\xBF\xBE
564 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
565 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
566 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
567 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
568 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
569 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
570 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
571 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
572 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
573 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
574 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
575 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
576 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
577 #
578 # The first part of the table maps bytes to character classes to reduce
579 # the size of the transition table and create bitmasks.
580 #
581 # The second part is a transition table that maps a combination of a
582 # state of the automaton and a character class to a new state. The
583 # numbering of the original nodes is retained, but some have been split
584 # so that there are new nodes. They mean:
585 # N0 The initial state, and final accepting one.
586 # N1 One continuation byte (A0-BF) left. This is transitioned to
587 # immediately when the start byte indicates a two-byte sequence
588 # N2 Two continuation bytes left.
589 # N3 Three continuation bytes left.
590 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
591 # (overlong); the other continuations transition to N2
592 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal
593 # (surrogates); BF transitions to N9; the other continuations to
594 # N2
595 # N6 Start byte is F[357]. Continuation byte BF transitions to N12;
596 # other continuations to N2
597 # N5 Start byte is F8. Continuation bytes A[0-7] are illegal
598 # (overlong); continuations A[9BDF] and B[13579BDF] transition to
599 # N14; the other continuations to N3
600 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1
601 # to N14; the other continuation bytes are illegal.
602 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to
603 # state N10; AF to N11; BF to N13; the other continuations to N1.
604 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are
605 # illegal (non-chars); the other continuations are legal
606 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are
607 # illegal (non-chars); the other continuations are legal
608 # N12 Initial sequence is F[357] BF. Continuation bytes BF
609 # transitions to N13; the other continuations to N1
610 # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is
611 # something that can lead to a non-char. Continuation bytes BE
612 # and BF are illegal (non-chars); the other continuations are
613 # legal
614 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
615 # Continuation byte BF transitions to N13; the other
616 # continuations to N2
617 # 1 Reject. All transitions not mentioned above (except the single
618 # byte ones (as they are always legal) are to this state.
619
620 my $NUM_CLASSES = 23;
621 my $N0 = 0;
622 my $N1 = $N0 + $NUM_CLASSES;
623 my $N2 = $N1 + $NUM_CLASSES;
624 my $N3 = $N2 + $NUM_CLASSES;
625 my $N4 = $N3 + $NUM_CLASSES;
626 my $N5 = $N4 + $NUM_CLASSES;
627 my $N6 = $N5 + $NUM_CLASSES;
628 my $N7 = $N6 + $NUM_CLASSES;
629 my $N8 = $N7 + $NUM_CLASSES;
630 my $N9 = $N8 + $NUM_CLASSES;
631 my $N10 = $N9 + $NUM_CLASSES;
632 my $N11 = $N10 + $NUM_CLASSES;
633 my $N12 = $N11 + $NUM_CLASSES;
634 my $N13 = $N12 + $NUM_CLASSES;
635 my $N14 = $N13 + $NUM_CLASSES;
636
637 my @strict_utf8_dfa;
638 my @i8 = (
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
643 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
645 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
649 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
650 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
651 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
652 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
653 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
654 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
655 );
656 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
657 push @strict_utf8_dfa, (
658 # Class:
659 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
660 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0
661 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1
662 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
663 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
664
665 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
666 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5
667 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
668 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
669 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8
670 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
671 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10
672 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11
673 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
674 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13
675 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
676 );
71525f77 677 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
123deead
KW
678 }
679
c5bfbb64
KW
680 {
681 # This generates the dfa table for C9 strict UTF-8, which rejects
682 # surrogates and above Unicode, but allows non-characters,.
683 #
684 # The classes are
685 # 00-9F 0 Always legal at start
686 # A0-A1 9 Not legal immediately after start bytes F0 F8
687 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9
688 # A8-AF 11 Not legal immediately after start bytes F0 F9
689 # B0-B5,B8-BF 12 Not legal immediately after start byte F9
690 # B6,B7 13
691 # C0-C4 1 (reject, all are overlong)
692 # C5-DF 2 Accepts any legal continuation
693 # E0 1 (reject, all are overlong)
694 # E1-EF 3 Accepts any legal continuation
695 # F0 6 (has overlongs)
696 # F1 5 (has surrogates)
697 # F2-F7 4 Accepts any legal continuation
698 # F8 8 (has overlongs)
699 # F9 7 (has non-Unicode)
700 # FA-FF 1 (reject, all are non-Unicode)
701 #
702 # The first part of the table maps bytes to character classes to reduce
703 # the size of the transition table and create bitmasks.
704 #
705 # The second part is a transition table that maps a combination of a
706 # state of the automaton and a character class to a new state. The
707 # numbering of the original nodes is retained, but some have been split
708 # so that there are new nodes. They mean:
709 # N0 The initial state, and final accepting one.
710 # N1 One continuation byte (A0-BF) left. This is transitioned to
711 # immediately when the start byte indicates a two-byte sequence
712 # N2 Two continuation bytes left.
713 # N3 Three continuation bytes left.
714 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
715 # (overlong); the other continuations transition to N2
716 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the
717 # other continuations transition to N2
718 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal
719 # (overlong); the other continuations transition to N3
720 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to
721 # N3; the other continuation bytes are illegal (non-Unicode)
722 # 1 Reject. All transitions not mentioned above (except the single
723 # byte ones (as they are always legal) are to this state.
724
725 my $NUM_CLASSES = 14;
726 my $N0 = 0;
727 my $N1 = $N0 + $NUM_CLASSES;
728 my $N2 = $N1 + $NUM_CLASSES;
729 my $N3 = $N2 + $NUM_CLASSES;
730 my $N4 = $N3 + $NUM_CLASSES;
731 my $N5 = $N4 + $NUM_CLASSES;
732 my $N6 = $N5 + $NUM_CLASSES;
733 my $N7 = $N6 + $NUM_CLASSES;
734
735 my @C9_utf8_dfa;
736 my @i8 = (
737 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
738 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
739 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
740 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
741 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
742 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
743 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
744 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
745 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
747 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
748 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
749 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
750 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
751 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
752 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
753 );
754 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
755 push @C9_utf8_dfa, (
756 # Class:
757 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
758 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0
759 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1
760 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2
761 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3
762
763 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4
764 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5
765 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6
766 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7
767 );
71525f77 768 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
c5bfbb64
KW
769 }
770
b37fc6e8
KW
771 {
772 print $out_fh <<EOF;
773/* This table partitions all the code points of the platform into ranges which
774 * have the property that all the code points in each range have the same
775 * number of bytes in their UTF-EBCDIC representations, and the adjacent
776 * ranges have a different number of bytes.
777 *
778 * Each number in the table begins such a range, which extends up to just
779 * before the following table entry, except the final entry is understood to
780 * extend to the platform's infinity
781 */
782EOF
b37fc6e8
KW
783 # The lengths of the characters between 0 and 255 are either 1 or 2,
784 # with those whose ASCII platform equivalents below 160 being 1, and
785 # the rest being 2.
786 my @list;
787 push @list, 0;
788 my $pushed_range_is_length_1 = 1;
789
790 for my $i (1 .. 0xFF) {
791 my $this_code_point_is_length_1 = ($e2a[$i] < 160);
792 if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
793 push @list, $i;
794 $pushed_range_is_length_1 = $this_code_point_is_length_1;
795 }
796 }
797
798 # Starting at 256, the length is 2.
799 push @list, 0x100 if $pushed_range_is_length_1;
800
801 # These are based on the fundamental properties of UTF-EBCDIC. Each
802 # continuation byte has 5 bits of information. Comments in utf8.h
803 # explain the rest.
804 my $UTF_ACCUMULATION_SHIFT = 5;
805 push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT)));
806 push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
807 push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
808 push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
809 push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
810 push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
811
8c3c6496
KW
812 output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
813 print $out_fh "\t";
814
b37fc6e8
KW
815 print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
816 print $out_fh "\n";
817
818 output_table_end($out_fh);
819 }
820
4bc3dcfa
KW
821 print $out_fh get_conditional_compile_line_end();
822}
823
6a5bc5ac 824print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
4bc3dcfa
KW
825
826read_only_bottom_close_and_rename($out_fh);