This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change docs display for PERL_UNUSED_foo
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
94e72741 4use integer;
3d7c117d
MB
5
6BEGIN { unshift @INC, '.' }
7
8require './regen/regen_lib.pl';
9require './regen/charset_translations.pl';
4bc3dcfa
KW
10
11# Generates the EBCDIC translation tables that were formerly hard-coded into
12# utfebcdic.h
13
14my $out_fh = open_new('ebcdic_tables.h', '>',
15 {style => '*', by => $0, });
16
94e72741
KW
17sub get_column_headers ($$;$) {
18 my ($row_hdr_len, $field_width, $dfa_columns) = @_;
19 my $format;
20 my $final_column_format;
21 my $num_columns;
22
23 if (defined $dfa_columns) {
24 $num_columns = $dfa_columns;
25
26 # Trailing blank to correspond with commas in the rows below
27 $format = "%${field_width}d ";
28 }
29 else { # Is a regular table
30 $num_columns = 16;
31
32 # Use blanks to separate the fields
33 $format = " " x ( $field_width
34 - 2); # For the '_X'
35 $format .= "_%X "; # Again, trailing blank over the commas below
36 }
37
38 my $header = "/*" . " " x ($row_hdr_len - length "/*");
39
40 # All but the final column
41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
42
43 # Get rid of trailing blank, so that the final column takes up one less
44 # space so that the "*/" doesn't extend past the commas in the rows below
45 chop $header;
46 $header .= sprintf $format, $num_columns - 1;
47
48 # Again, remove trailing blank
49 chop $header;
50
51 return $header . "*/\n";
52}
53
8c3c6496
KW
54sub output_table_start($$$;$) {
55 my ($out_fh, $TYPE, $name, $size) = @_;
635ff1f9 56
8c3c6496 57 $size = "" unless defined $size;
f2782df8
KW
58
59 # Anything locale related will be written on
60 my $const = ($name !~ /locale/i) ? 'CONST' : "";
61
62 my $declaration = "EXT$const $TYPE $name\[$size\]";
635ff1f9
KW
63 print $out_fh <<EOF;
64# ifndef DOINIT
0a142f46 65 $declaration;
635ff1f9 66# else
0a142f46 67 $declaration = {
635ff1f9
KW
68EOF
69}
70
71sub output_table_end($) {
72 print $out_fh "};\n# endif\n\n";
73}
74
702cfe48 75sub output_table ($$;$) {
4bc3dcfa
KW
76 my $table_ref = shift;
77 my $name = shift;
78
683f7990
KW
79 my $size = @$table_ref;
80
94e72741
KW
81 # 0 => print in decimal
82 # 1 => print in hex (translates code point to code point)
f6521f7c 83 # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
94e72741
KW
84 # The number is how many columns in the part after the code point
85 # portion.
86 #
87 # code point tables in hex areasier to debug, but don't fit into 80
88 # columns
89 my $type = shift // 1;
90
91 my $print_in_hex = $type == 1;
92 my $is_dfa = ($type >= 2) ? $type : 0;
93 my $columns_after_256 = 16;
94
95 die "Requres 256 entries in table $name, got @$table_ref"
96 if ! $is_dfa && @$table_ref != 256;
97 if (! $is_dfa) {
98 die "Requres 256 entries in table $name, got @$table_ref"
99 if @$table_ref != 256;
100 }
101 else {
102 $columns_after_256 = $is_dfa;
103
104 print $out_fh <<'EOF';
105
106/* The table below is adapted from
f6521f7c 107 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
94e72741
KW
108 * See copyright notice at the beginning of this file.
109 */
110
111EOF
112 }
113
114 # Highest number in the table
115 my $max_entry = 0;
116 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
43bf0d50 117
94e72741
KW
118 # We assume that every table has at least one two digit entry, and none
119 # are more than three digit.
120 my $field_width = ($print_in_hex)
121 ? 4
122 : (($max_entry) > 99 ? 3 : 2);
4bc3dcfa 123
94e72741
KW
124 my $row_hdr_length;
125 my $node_number_field_width;
126 my $node_value_field_width;
127
128 # dfa tables have a special header for the rows in the transitions part of
129 # the table. It is longer than the regular one.
130 if ($is_dfa) {
131 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
132 $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
133 $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
134 ? 3 : 2;
135 # The header starts with this template, and adds in the number of
136 # digits needed to represent the maximum node number and its value
137 $row_hdr_length = length("/*N=*/")
138 + $node_number_field_width
139 + $node_value_field_width;
140 }
141 else {
142 $row_hdr_length = length "/*_X*/"; # Template for what the header
143 # looks like
144 }
145
146 # The table may not be representable in 8 bits.
147 my $TYPE = 'U8';
148 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
149
683f7990 150 output_table_start $out_fh, $TYPE, $name, $size;
4bc3dcfa 151
94e72741
KW
152 # First the headers for the columns
153 print $out_fh get_column_headers($row_hdr_length, $field_width);
154
155 # Now the table body
156 my $count = @$table_ref;
157 my $last_was_nl = 1;
158
159 # Print each element individually, arranged in rows of columns
160 for my $i (0 .. $count - 1) {
161
162 # Node number for here is -1 until get into the dfa state transitions
163 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
164
165 # Print row header at beginning of each row
166 if ($last_was_nl) {
167 if ($node >= 0) {
168 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
169 $node_value_field_width, $i - 256;
170 }
171 else { # Otherwise is regular row; print its number
172 printf $out_fh "/*%X_", $i / 16;
173
174 # These rows in a dfa table require extra space so columns
175 # will align vertically (because the Ndd=ddd requires extra
176 # space)
177 if ($is_dfa) {
178 print $out_fh " " x ( $node_number_field_width
179 + $node_value_field_width);
180 }
181 print $out_fh "*/";
182 }
183 }
184
43bf0d50 185 if ($print_in_hex) {
702cfe48 186 printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50
KW
187 }
188 else {
94e72741
KW
189 printf $out_fh "%${field_width}d", $table_ref->[$i];
190 }
191
192 print $out_fh ",", if $i < $count -1; # No comma on final entry
193
194 # Add \n if at end of row, which is 16 columns until we get to the
195 # transitions part
196 if ( ($node < 0 && $i % 16 == 15)
197 || ($node >= 0 && ($i -256) % $columns_after_256
198 == $columns_after_256 - 1))
199 {
200 print $out_fh "\n";
201 $last_was_nl = 1;
202 }
203 else {
204 $last_was_nl = 0;
43bf0d50 205 }
4bc3dcfa 206 }
94e72741
KW
207
208 # Print column footer
209 print $out_fh get_column_headers($row_hdr_length, $field_width,
210 ($is_dfa) ? $columns_after_256 : undef);
211
635ff1f9 212 output_table_end($out_fh);
4bc3dcfa
KW
213}
214
3de6d141 215print $out_fh <<'END';
4bc3dcfa 216
6a5bc5ac
KW
217#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */
218#define PERL_EBCDIC_TABLES_H_ 1
4bc3dcfa
KW
219
220/* This file contains definitions for various tables used in EBCDIC handling.
3de6d141
KW
221 * More info is in utfebcdic.h
222 *
223 * Some of the tables are adapted from
f6521f7c 224 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
3de6d141
KW
225 * which requires this copyright notice:
226
227Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
228
229Permission is hereby granted, free of charge, to any person obtaining a copy of
230this software and associated documentation files (the "Software"), to deal in
231the Software without restriction, including without limitation the rights to
232use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
233of the Software, and to permit persons to whom the Software is furnished to do
234so, subject to the following conditions:
235
236The above copyright notice and this permission notice shall be included in all
237copies or substantial portions of the Software.
238
239THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
240IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
241FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
242AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
243LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
244OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
245SOFTWARE.
246
247*/
4bc3dcfa
KW
248END
249
250my @charsets = get_supported_code_pages();
251shift @charsets; # ASCII is the 0th, and we don't deal with that here.
252foreach my $charset (@charsets) {
c30a0cf2
TC
253 # we process the whole array several times, make a copy
254 my @a2e = @{get_a2n($charset)};
b37fc6e8 255 my @e2a;
4bc3dcfa
KW
256
257 print $out_fh "\n" . get_conditional_compile_line_start($charset);
258 print $out_fh "\n";
259
260 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
261 output_table(\@a2e, "PL_a2e");
262
263 { # Construct the inverse
4bc3dcfa
KW
264 for my $i (0 .. 255) {
265 $e2a[$a2e[$i]] = $i;
266 }
267 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
268 output_table(\@e2a, "PL_e2a");
269 }
270
e0dcdb0a 271 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
272 print $out_fh <<END;
273/* (Confusingly named) Index is $charset I8 byte; value is
274 * $charset UTF-EBCDIC equivalent */
275END
276 output_table(\@i82utf, "PL_utf2e");
277
278 { #Construct the inverse
279 my @utf2i8;
280 for my $i (0 .. 255) {
281 $utf2i8[$i82utf[$i]] = $i;
282 }
283 print $out_fh <<END;
284/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
285 * $charset I8 equivalent */
286END
287 output_table(\@utf2i8, "PL_e2utf");
288 }
289
290 {
291 my @utf8skip;
292
293 # These are invariants or continuation bytes.
294 for my $i (0 .. 0xBF) {
295 $utf8skip[$i82utf[$i]] = 1;
296 }
297
298 # These are start bytes; The skip is the number of consecutive highest
299 # order 1-bits (up to 7)
300 for my $i (0xC0 .. 255) {
301 my $count;
c0236afe
KW
302 if ($i == 0b11111111) {
303 no warnings 'once';
304 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
305 }
306 elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa
KW
307 $count= 7;
308 }
309 elsif (($i & 0b11111100) == 0b11111100) {
310 $count= 6;
311 }
312 elsif (($i & 0b11111000) == 0b11111000) {
313 $count= 5;
314 }
315 elsif (($i & 0b11110000) == 0b11110000) {
316 $count= 4;
317 }
318 elsif (($i & 0b11100000) == 0b11100000) {
319 $count= 3;
320 }
321 elsif (($i & 0b11000000) == 0b11000000) {
322 $count= 2;
323 }
324 else {
325 die "Something wrong for UTF8SKIP calculation for $i";
326 }
327 $utf8skip[$i82utf[$i]] = $count;
328 }
329
330 print $out_fh <<END;
4719093e
KW
331/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
332 * (including for overlongs); 1 for continuation. Adapted from the shadow
333 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes
334 * and are marked as length 1 here so that we can recover. */
4bc3dcfa 335END
702cfe48
KW
336 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
337 # in hex
4bc3dcfa
KW
338 }
339
340 use feature 'unicode_strings';
341
342 {
343 my @lc;
344 for my $i (0 .. 255) {
345 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
346 }
94e72741
KW
347 print $out_fh
348 "/* Index is $charset code point; value is its lowercase equivalent */\n";
4bc3dcfa
KW
349 output_table(\@lc, "PL_latin1_lc");
350 }
351
352 {
353 my @uc;
354 for my $i (0 .. 255) {
355 my $uc = uc chr $i;
356 if (length $uc > 1 || ord $uc > 255) {
357 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
358 }
359 $uc[$a2e[$i]] = $a2e[ord $uc];
360 }
361 print $out_fh <<END;
362/* Index is $charset code point; value is its uppercase equivalent.
363 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
364 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
365END
366 output_table(\@uc, "PL_mod_latin1_uc");
367 }
368
369 { # PL_fold
370 my @ascii_fold;
371 for my $i (0 .. 255) { # Initialise to identity map
372 $ascii_fold[$i] = $i;
373 }
374
375 # Overwrite the entries that aren't identity
376 for my $chr ('A' .. 'Z') {
377 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
378 }
379 for my $chr ('a' .. 'z') {
380 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
381 }
382 print $out_fh <<END;
383/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
384 * is A-Z; all other code points map to themselves */
385END
386 output_table(\@ascii_fold, "PL_fold");
f2e44f08
KW
387
388 # This table is also the correct folding for the default C locale
389 output_table(\@ascii_fold, "PL_fold_locale");
4bc3dcfa
KW
390 }
391
392 {
393 my @latin1_fold;
394 for my $i (0 .. 255) {
395 my $char = chr $i;
396 my $lc = lc $char;
397
398 # lc and uc adequately proxy for fold-case pairs in this 0-255
399 # range
400 my $uc = uc $char;
401 $uc = $char if length $uc > 1 || ord $uc > 255;
402 if ($lc ne $char) {
403 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
404 }
405 elsif ($uc ne $char) {
406 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
407 }
408 else {
409 $latin1_fold[$a2e[$i]] = $a2e[$i];
410 }
411 }
412 print $out_fh <<END;
413/* Index is $charset code point; value is its other fold-pair equivalent
414 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
415 * the code point itself */
416END
417 output_table(\@latin1_fold, "PL_fold_latin1");
418 }
419
3de6d141
KW
420 {
421 # This generates the dfa table for perl extended UTF-8, which accepts
422 # surrogates, non-characters, and accepts start bytes up through FE
423 # (start byte FF has to be handled outside this dfa). The class numbers
424 # for start bytes are constrained so that they can be used as a shift
425 # count for masking off the leading one bits
426 #
427 # The classes are
428 # 00-9F 0
429 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC
430 # FE
431 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC
432 # A4-A7 9 Not legal immediately after start bytes F0 F8
433 # A8-AF 10 Not legal immediately after start bytes F0
434 # B0-BF 11
435 # C0-C4 1
436 # C5-DF 2
437 # E0 1
438 # E1-EF 3
439 # F0 12
440 # F1-F7 4
441 # F8 13
442 # F9-FB 5
443 # FC 14
444 # FD 6
445 # FE 15
446 # FF 1
447 #
448 # Here's the I8 for the code points before which overlongs occur:
449 # U+4000: \xF0\xB0\xA0\xA0
450 # U+40000: \xF8\xA8\xA0\xA0\xA0
451 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0
452 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0
453 #
454 # The first part of the table maps bytes to character classes to reduce
455 # the size of the transition table and create bitmasks.
456 #
457 # The second part is a transition table that maps a combination of a
458 # state of the automaton and a character class to a new state. The
459 # numbering of the original nodes is retained, but some have been split
460 # so that there are new nodes. They mean:
461 # N0 The initial state, and final accepting one.
462 # N1 One continuation byte (A0-BF) left. This is transitioned to
463 # immediately when the start byte indicates a two-byte sequence
464 # N2 Two continuation bytes left.
465 # N3 Three continuation bytes left.
466 # N4 Four continuation bytes left.
467 # N5 Five continuation bytes left.
468 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal
469 # (overlong); the other continuations transition to N2
470 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal
471 # (overlong); the other continuations transition to N3
472 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal
473 # (overlong); the other continuations transition to N4
474 # N9 Start byte is FE. Continuation bytes A[01] are illegal
475 # (overlong); the other continuations transition to N5
476 # 1 Reject. All transitions not mentioned above (except the single
477 # byte ones (as they are always legal) are to this state.
478
479 my $NUM_CLASSES = 16;
480 my $N0 = 0;
481 my $N1 = $N0 + $NUM_CLASSES;
482 my $N2 = $N1 + $NUM_CLASSES;
483 my $N3 = $N2 + $NUM_CLASSES;
484 my $N4 = $N3 + $NUM_CLASSES;
485 my $N5 = $N4 + $NUM_CLASSES;
486 my $N6 = $N5 + $NUM_CLASSES;
487 my $N7 = $N6 + $NUM_CLASSES;
488 my $N8 = $N7 + $NUM_CLASSES;
489 my $N9 = $N8 + $NUM_CLASSES;
490 my $N10 = $N9 + $NUM_CLASSES;
491
492 my @perl_extended_utf8_dfa;
493 my @i8 = (
c5975126 494 # 0 1 2 3 4 5 6 7 8 9 A B C D E F
3de6d141
KW
495 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
497 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
498 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
500 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
501 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
502 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
503 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
504 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
505 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
506 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
507 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
508 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
509 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
510 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
511 );
512 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
513 push @perl_extended_utf8_dfa, (
514 # Class:
515 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
516 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0
517 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1
518 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2
519 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3
520 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4
521 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5
522
523 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6
524 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7
525 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8
526 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9
527 );
71525f77 528 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
3de6d141
KW
529 $NUM_CLASSES);
530 }
531
123deead
KW
532 {
533 # This generates the dfa table for strict UTF-8, which rejects
534 # surrogates, non-characters, and above Unicode.
535 #
536 # The classes are
537 # 00-9F 0 Always legal at start
538 # A0 10 Not legal immediately after start bytes F0 F8
539 # A1 11 Not legal immediately after start bytes F0 F8,
540 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9
541 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9
542 # A9,AB,AD 14 Not legal immediately after start byte F0
543 # AE 15 Not legal immediately after start byte F0
544 # AF 16 Not legal immediately after start bytes F0
545 # B[0248AC] 17 Not legal immediately after start byte F9
546 # B[1359D] 18 Not legal immediately after start byte F9
547 # B6 19 Not legal immediately after start byte F9
548 # B7 20 Not legal immediately after start byte F9
549 # BE 21 Not legal immediately after start byte F9
550 # BF 22 Not legal immediately after start byte F9
551 # C0-C4 1 (reject, all are overlong)
552 # C5-DF 2 Accepts any legal continuation
553 # E0 1 (reject, all are overlong)
554 # E1-EF 3 Accepts any legal continuation
555 # F0 8 (has overlongs)
556 # F1 6 (has surrogates, non-chars)
557 # F2,F4,F6 4 Accepts any legal continuation
558 # F3,F5,F7 5 (has non-chars)
559 # F8 9 (has overlongs, non-chars)
560 # F9 7 (has non-chars, non-Unicode)
561 # FA-FF 1 (reject, all are non-Unicode)
562 #
563 # Here's the I8 for enough code points so that you can figure out what's
564 # going on:
565 #
566 # U+D800: \xF1\xB6\xA0\xA0
567 # U+DFFF: \xF1\xB7\xBF\xBF
568 # U+FDD0: \xF1\xBF\xAE\xB0
569 # U+FDEF: \xF1\xBF\xAF\xAF
570 # U+FFFE: \xF1\xBF\xBF\xBE
571 # U+1FFFE: \xF3\xBF\xBF\xBE
572 # U+2FFFE: \xF5\xBF\xBF\xBE
573 # U+3FFFE: \xF7\xBF\xBF\xBE
574 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
575 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
576 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
577 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
578 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
579 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
580 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
581 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
582 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
583 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
584 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
585 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
586 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
587 #
588 # The first part of the table maps bytes to character classes to reduce
589 # the size of the transition table and create bitmasks.
590 #
591 # The second part is a transition table that maps a combination of a
592 # state of the automaton and a character class to a new state. The
593 # numbering of the original nodes is retained, but some have been split
594 # so that there are new nodes. They mean:
595 # N0 The initial state, and final accepting one.
596 # N1 One continuation byte (A0-BF) left. This is transitioned to
597 # immediately when the start byte indicates a two-byte sequence
598 # N2 Two continuation bytes left.
599 # N3 Three continuation bytes left.
600 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
601 # (overlong); the other continuations transition to N2
602 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal
603 # (surrogates); BF transitions to N9; the other continuations to
604 # N2
605 # N6 Start byte is F[357]. Continuation byte BF transitions to N12;
606 # other continuations to N2
c5975126 607 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal
123deead
KW
608 # (overlong); continuations A[9BDF] and B[13579BDF] transition to
609 # N14; the other continuations to N3
610 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1
611 # to N14; the other continuation bytes are illegal.
612 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to
613 # state N10; AF to N11; BF to N13; the other continuations to N1.
614 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are
615 # illegal (non-chars); the other continuations are legal
616 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are
617 # illegal (non-chars); the other continuations are legal
618 # N12 Initial sequence is F[357] BF. Continuation bytes BF
619 # transitions to N13; the other continuations to N1
c997da5e
KW
620 # N13 Initial sequence is F[1357] BF BF or F8 x y BF (where x and y
621 # are something that can lead to a non-char. Continuation bytes
622 # BE and BF are illegal (non-chars); the other continuations are
123deead
KW
623 # legal
624 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
c997da5e 625 # Continuation byte BF transitions to N15; the other
123deead 626 # continuations to N2
c997da5e
KW
627 # N15 Initial sequence is F8 A[9BDF] BF; or F8 B[13579BDF] BF; or
628 # F9 A1 BF. Continuation byte BF transitions to N16; the other
629 # continuations to N2
630 # N16 Initial sequence is F8 A[9BDF] BF BF; or F8 B[13579BDF] BF BF;
631 # or F9 A1 BF BF. Continuation bytes BE and BF are illegal
632 # (non-chars); the other continuations are legal
123deead
KW
633 # 1 Reject. All transitions not mentioned above (except the single
634 # byte ones (as they are always legal) are to this state.
635
636 my $NUM_CLASSES = 23;
637 my $N0 = 0;
638 my $N1 = $N0 + $NUM_CLASSES;
639 my $N2 = $N1 + $NUM_CLASSES;
640 my $N3 = $N2 + $NUM_CLASSES;
641 my $N4 = $N3 + $NUM_CLASSES;
642 my $N5 = $N4 + $NUM_CLASSES;
643 my $N6 = $N5 + $NUM_CLASSES;
644 my $N7 = $N6 + $NUM_CLASSES;
645 my $N8 = $N7 + $NUM_CLASSES;
646 my $N9 = $N8 + $NUM_CLASSES;
647 my $N10 = $N9 + $NUM_CLASSES;
648 my $N11 = $N10 + $NUM_CLASSES;
649 my $N12 = $N11 + $NUM_CLASSES;
650 my $N13 = $N12 + $NUM_CLASSES;
651 my $N14 = $N13 + $NUM_CLASSES;
c997da5e 652 my $N15 = $N14 + $NUM_CLASSES;
123deead
KW
653
654 my @strict_utf8_dfa;
655 my @i8 = (
c5975126 656 # 0 1 2 3 4 5 6 7 8 9 A B C D E F
123deead
KW
657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
667 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
668 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
669 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
670 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
671 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
672 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
673 );
674 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
675 push @strict_utf8_dfa, (
676 # Class:
677 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
678 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0
679 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1
680 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
681 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
682
683 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
684 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5
685 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
686 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
687 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8
688 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
689 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10
690 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11
691 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
692 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13
c997da5e
KW
693 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N15, # N14
694 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N15
123deead 695 );
71525f77 696 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
123deead
KW
697 }
698
c5bfbb64
KW
699 {
700 # This generates the dfa table for C9 strict UTF-8, which rejects
701 # surrogates and above Unicode, but allows non-characters,.
702 #
703 # The classes are
704 # 00-9F 0 Always legal at start
705 # A0-A1 9 Not legal immediately after start bytes F0 F8
706 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9
707 # A8-AF 11 Not legal immediately after start bytes F0 F9
708 # B0-B5,B8-BF 12 Not legal immediately after start byte F9
709 # B6,B7 13
710 # C0-C4 1 (reject, all are overlong)
711 # C5-DF 2 Accepts any legal continuation
712 # E0 1 (reject, all are overlong)
713 # E1-EF 3 Accepts any legal continuation
714 # F0 6 (has overlongs)
715 # F1 5 (has surrogates)
716 # F2-F7 4 Accepts any legal continuation
717 # F8 8 (has overlongs)
718 # F9 7 (has non-Unicode)
719 # FA-FF 1 (reject, all are non-Unicode)
720 #
721 # The first part of the table maps bytes to character classes to reduce
722 # the size of the transition table and create bitmasks.
723 #
724 # The second part is a transition table that maps a combination of a
725 # state of the automaton and a character class to a new state. The
726 # numbering of the original nodes is retained, but some have been split
727 # so that there are new nodes. They mean:
728 # N0 The initial state, and final accepting one.
729 # N1 One continuation byte (A0-BF) left. This is transitioned to
730 # immediately when the start byte indicates a two-byte sequence
731 # N2 Two continuation bytes left.
732 # N3 Three continuation bytes left.
733 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
734 # (overlong); the other continuations transition to N2
735 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the
736 # other continuations transition to N2
737 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal
738 # (overlong); the other continuations transition to N3
739 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to
740 # N3; the other continuation bytes are illegal (non-Unicode)
741 # 1 Reject. All transitions not mentioned above (except the single
742 # byte ones (as they are always legal) are to this state.
743
744 my $NUM_CLASSES = 14;
745 my $N0 = 0;
746 my $N1 = $N0 + $NUM_CLASSES;
747 my $N2 = $N1 + $NUM_CLASSES;
748 my $N3 = $N2 + $NUM_CLASSES;
749 my $N4 = $N3 + $NUM_CLASSES;
750 my $N5 = $N4 + $NUM_CLASSES;
751 my $N6 = $N5 + $NUM_CLASSES;
752 my $N7 = $N6 + $NUM_CLASSES;
753
754 my @C9_utf8_dfa;
755 my @i8 = (
c5975126 756 # 0 1 2 3 4 5 6 7 8 9 A B C D E F
c5bfbb64
KW
757 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
758 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
759 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
760 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
761 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
763 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
766 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
767 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
768 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
769 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
770 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
771 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
772 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
773 );
774 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
775 push @C9_utf8_dfa, (
776 # Class:
777 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
778 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0
779 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1
780 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2
781 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3
782
783 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4
784 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5
785 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6
786 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7
787 );
71525f77 788 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
c5bfbb64
KW
789 }
790
4bc3dcfa
KW
791 print $out_fh get_conditional_compile_line_end();
792}
793
6a5bc5ac 794print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
4bc3dcfa
KW
795
796read_only_bottom_close_and_rename($out_fh);