This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
devel/devtools.pl: 'use warnings'
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
94e72741 4use integer;
3d7c117d
MB
5
6BEGIN { unshift @INC, '.' }
7
8require './regen/regen_lib.pl';
9require './regen/charset_translations.pl';
4bc3dcfa
KW
10
11# Generates the EBCDIC translation tables that were formerly hard-coded into
12# utfebcdic.h
13
14my $out_fh = open_new('ebcdic_tables.h', '>',
15 {style => '*', by => $0, });
16
94e72741
KW
17sub get_column_headers ($$;$) {
18 my ($row_hdr_len, $field_width, $dfa_columns) = @_;
19 my $format;
20 my $final_column_format;
21 my $num_columns;
22
23 if (defined $dfa_columns) {
24 $num_columns = $dfa_columns;
25
26 # Trailing blank to correspond with commas in the rows below
27 $format = "%${field_width}d ";
28 }
29 else { # Is a regular table
30 $num_columns = 16;
31
32 # Use blanks to separate the fields
33 $format = " " x ( $field_width
34 - 2); # For the '_X'
35 $format .= "_%X "; # Again, trailing blank over the commas below
36 }
37
38 my $header = "/*" . " " x ($row_hdr_len - length "/*");
39
40 # All but the final column
41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
42
43 # Get rid of trailing blank, so that the final column takes up one less
44 # space so that the "*/" doesn't extend past the commas in the rows below
45 chop $header;
46 $header .= sprintf $format, $num_columns - 1;
47
48 # Again, remove trailing blank
49 chop $header;
50
51 return $header . "*/\n";
52}
53
635ff1f9
KW
54sub output_table_start($$$) {
55 my ($out_fh, $TYPE, $name) = @_;
56
57 my $declaration = "EXTCONST $TYPE $name\[\]";
58 print $out_fh <<EOF;
59# ifndef DOINIT
0a142f46 60 $declaration;
635ff1f9 61# else
0a142f46 62 $declaration = {
635ff1f9
KW
63EOF
64}
65
66sub output_table_end($) {
67 print $out_fh "};\n# endif\n\n";
68}
69
702cfe48 70sub output_table ($$;$) {
4bc3dcfa
KW
71 my $table_ref = shift;
72 my $name = shift;
73
94e72741
KW
74 # 0 => print in decimal
75 # 1 => print in hex (translates code point to code point)
76 # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
77 # The number is how many columns in the part after the code point
78 # portion.
79 #
80 # code point tables in hex areasier to debug, but don't fit into 80
81 # columns
82 my $type = shift // 1;
83
84 my $print_in_hex = $type == 1;
85 my $is_dfa = ($type >= 2) ? $type : 0;
86 my $columns_after_256 = 16;
87
88 die "Requres 256 entries in table $name, got @$table_ref"
89 if ! $is_dfa && @$table_ref != 256;
90 if (! $is_dfa) {
91 die "Requres 256 entries in table $name, got @$table_ref"
92 if @$table_ref != 256;
93 }
94 else {
95 $columns_after_256 = $is_dfa;
96
97 print $out_fh <<'EOF';
98
99/* The table below is adapted from
100 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
101 * See copyright notice at the beginning of this file.
102 */
103
104EOF
105 }
106
107 # Highest number in the table
108 my $max_entry = 0;
109 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
43bf0d50 110
94e72741
KW
111 # We assume that every table has at least one two digit entry, and none
112 # are more than three digit.
113 my $field_width = ($print_in_hex)
114 ? 4
115 : (($max_entry) > 99 ? 3 : 2);
4bc3dcfa 116
94e72741
KW
117 my $row_hdr_length;
118 my $node_number_field_width;
119 my $node_value_field_width;
120
121 # dfa tables have a special header for the rows in the transitions part of
122 # the table. It is longer than the regular one.
123 if ($is_dfa) {
124 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
125 $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
126 $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
127 ? 3 : 2;
128 # The header starts with this template, and adds in the number of
129 # digits needed to represent the maximum node number and its value
130 $row_hdr_length = length("/*N=*/")
131 + $node_number_field_width
132 + $node_value_field_width;
133 }
134 else {
135 $row_hdr_length = length "/*_X*/"; # Template for what the header
136 # looks like
137 }
138
139 # The table may not be representable in 8 bits.
140 my $TYPE = 'U8';
141 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
142
635ff1f9 143 output_table_start $out_fh, $TYPE, $name;
4bc3dcfa 144
94e72741
KW
145 # First the headers for the columns
146 print $out_fh get_column_headers($row_hdr_length, $field_width);
147
148 # Now the table body
149 my $count = @$table_ref;
150 my $last_was_nl = 1;
151
152 # Print each element individually, arranged in rows of columns
153 for my $i (0 .. $count - 1) {
154
155 # Node number for here is -1 until get into the dfa state transitions
156 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
157
158 # Print row header at beginning of each row
159 if ($last_was_nl) {
160 if ($node >= 0) {
161 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
162 $node_value_field_width, $i - 256;
163 }
164 else { # Otherwise is regular row; print its number
165 printf $out_fh "/*%X_", $i / 16;
166
167 # These rows in a dfa table require extra space so columns
168 # will align vertically (because the Ndd=ddd requires extra
169 # space)
170 if ($is_dfa) {
171 print $out_fh " " x ( $node_number_field_width
172 + $node_value_field_width);
173 }
174 print $out_fh "*/";
175 }
176 }
177
43bf0d50 178 if ($print_in_hex) {
702cfe48 179 printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50
KW
180 }
181 else {
94e72741
KW
182 printf $out_fh "%${field_width}d", $table_ref->[$i];
183 }
184
185 print $out_fh ",", if $i < $count -1; # No comma on final entry
186
187 # Add \n if at end of row, which is 16 columns until we get to the
188 # transitions part
189 if ( ($node < 0 && $i % 16 == 15)
190 || ($node >= 0 && ($i -256) % $columns_after_256
191 == $columns_after_256 - 1))
192 {
193 print $out_fh "\n";
194 $last_was_nl = 1;
195 }
196 else {
197 $last_was_nl = 0;
43bf0d50 198 }
4bc3dcfa 199 }
94e72741
KW
200
201 # Print column footer
202 print $out_fh get_column_headers($row_hdr_length, $field_width,
203 ($is_dfa) ? $columns_after_256 : undef);
204
635ff1f9 205 output_table_end($out_fh);
4bc3dcfa
KW
206}
207
3de6d141 208print $out_fh <<'END';
4bc3dcfa 209
6a5bc5ac
KW
210#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */
211#define PERL_EBCDIC_TABLES_H_ 1
4bc3dcfa
KW
212
213/* This file contains definitions for various tables used in EBCDIC handling.
3de6d141
KW
214 * More info is in utfebcdic.h
215 *
216 * Some of the tables are adapted from
217 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
218 * which requires this copyright notice:
219
220Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
221
222Permission is hereby granted, free of charge, to any person obtaining a copy of
223this software and associated documentation files (the "Software"), to deal in
224the Software without restriction, including without limitation the rights to
225use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
226of the Software, and to permit persons to whom the Software is furnished to do
227so, subject to the following conditions:
228
229The above copyright notice and this permission notice shall be included in all
230copies or substantial portions of the Software.
231
232THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
233IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
234FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
235AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
236LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
237OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
238SOFTWARE.
239
240*/
4bc3dcfa
KW
241END
242
243my @charsets = get_supported_code_pages();
244shift @charsets; # ASCII is the 0th, and we don't deal with that here.
245foreach my $charset (@charsets) {
c30a0cf2
TC
246 # we process the whole array several times, make a copy
247 my @a2e = @{get_a2n($charset)};
4bc3dcfa
KW
248
249 print $out_fh "\n" . get_conditional_compile_line_start($charset);
250 print $out_fh "\n";
251
252 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
253 output_table(\@a2e, "PL_a2e");
254
255 { # Construct the inverse
256 my @e2a;
257 for my $i (0 .. 255) {
258 $e2a[$a2e[$i]] = $i;
259 }
260 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
261 output_table(\@e2a, "PL_e2a");
262 }
263
e0dcdb0a 264 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
265 print $out_fh <<END;
266/* (Confusingly named) Index is $charset I8 byte; value is
267 * $charset UTF-EBCDIC equivalent */
268END
269 output_table(\@i82utf, "PL_utf2e");
270
271 { #Construct the inverse
272 my @utf2i8;
273 for my $i (0 .. 255) {
274 $utf2i8[$i82utf[$i]] = $i;
275 }
276 print $out_fh <<END;
277/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
278 * $charset I8 equivalent */
279END
280 output_table(\@utf2i8, "PL_e2utf");
281 }
282
283 {
284 my @utf8skip;
285
286 # These are invariants or continuation bytes.
287 for my $i (0 .. 0xBF) {
288 $utf8skip[$i82utf[$i]] = 1;
289 }
290
291 # These are start bytes; The skip is the number of consecutive highest
292 # order 1-bits (up to 7)
293 for my $i (0xC0 .. 255) {
294 my $count;
c0236afe
KW
295 if ($i == 0b11111111) {
296 no warnings 'once';
297 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
298 }
299 elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa
KW
300 $count= 7;
301 }
302 elsif (($i & 0b11111100) == 0b11111100) {
303 $count= 6;
304 }
305 elsif (($i & 0b11111000) == 0b11111000) {
306 $count= 5;
307 }
308 elsif (($i & 0b11110000) == 0b11110000) {
309 $count= 4;
310 }
311 elsif (($i & 0b11100000) == 0b11100000) {
312 $count= 3;
313 }
314 elsif (($i & 0b11000000) == 0b11000000) {
315 $count= 2;
316 }
317 else {
318 die "Something wrong for UTF8SKIP calculation for $i";
319 }
320 $utf8skip[$i82utf[$i]] = $count;
321 }
322
323 print $out_fh <<END;
4719093e
KW
324/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
325 * (including for overlongs); 1 for continuation. Adapted from the shadow
326 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes
327 * and are marked as length 1 here so that we can recover. */
4bc3dcfa 328END
702cfe48
KW
329 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
330 # in hex
4bc3dcfa
KW
331 }
332
333 use feature 'unicode_strings';
334
335 {
336 my @lc;
337 for my $i (0 .. 255) {
338 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
339 }
94e72741
KW
340 print $out_fh
341 "/* Index is $charset code point; value is its lowercase equivalent */\n";
4bc3dcfa
KW
342 output_table(\@lc, "PL_latin1_lc");
343 }
344
345 {
346 my @uc;
347 for my $i (0 .. 255) {
348 my $uc = uc chr $i;
349 if (length $uc > 1 || ord $uc > 255) {
350 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
351 }
352 $uc[$a2e[$i]] = $a2e[ord $uc];
353 }
354 print $out_fh <<END;
355/* Index is $charset code point; value is its uppercase equivalent.
356 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
357 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
358END
359 output_table(\@uc, "PL_mod_latin1_uc");
360 }
361
362 { # PL_fold
363 my @ascii_fold;
364 for my $i (0 .. 255) { # Initialise to identity map
365 $ascii_fold[$i] = $i;
366 }
367
368 # Overwrite the entries that aren't identity
369 for my $chr ('A' .. 'Z') {
370 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
371 }
372 for my $chr ('a' .. 'z') {
373 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
374 }
375 print $out_fh <<END;
376/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
377 * is A-Z; all other code points map to themselves */
378END
379 output_table(\@ascii_fold, "PL_fold");
380 }
381
382 {
383 my @latin1_fold;
384 for my $i (0 .. 255) {
385 my $char = chr $i;
386 my $lc = lc $char;
387
388 # lc and uc adequately proxy for fold-case pairs in this 0-255
389 # range
390 my $uc = uc $char;
391 $uc = $char if length $uc > 1 || ord $uc > 255;
392 if ($lc ne $char) {
393 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
394 }
395 elsif ($uc ne $char) {
396 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
397 }
398 else {
399 $latin1_fold[$a2e[$i]] = $a2e[$i];
400 }
401 }
402 print $out_fh <<END;
403/* Index is $charset code point; value is its other fold-pair equivalent
404 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
405 * the code point itself */
406END
407 output_table(\@latin1_fold, "PL_fold_latin1");
408 }
409
3de6d141
KW
410 {
411 # This generates the dfa table for perl extended UTF-8, which accepts
412 # surrogates, non-characters, and accepts start bytes up through FE
413 # (start byte FF has to be handled outside this dfa). The class numbers
414 # for start bytes are constrained so that they can be used as a shift
415 # count for masking off the leading one bits
416 #
417 # The classes are
418 # 00-9F 0
419 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC
420 # FE
421 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC
422 # A4-A7 9 Not legal immediately after start bytes F0 F8
423 # A8-AF 10 Not legal immediately after start bytes F0
424 # B0-BF 11
425 # C0-C4 1
426 # C5-DF 2
427 # E0 1
428 # E1-EF 3
429 # F0 12
430 # F1-F7 4
431 # F8 13
432 # F9-FB 5
433 # FC 14
434 # FD 6
435 # FE 15
436 # FF 1
437 #
438 # Here's the I8 for the code points before which overlongs occur:
439 # U+4000: \xF0\xB0\xA0\xA0
440 # U+40000: \xF8\xA8\xA0\xA0\xA0
441 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0
442 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0
443 #
444 # The first part of the table maps bytes to character classes to reduce
445 # the size of the transition table and create bitmasks.
446 #
447 # The second part is a transition table that maps a combination of a
448 # state of the automaton and a character class to a new state. The
449 # numbering of the original nodes is retained, but some have been split
450 # so that there are new nodes. They mean:
451 # N0 The initial state, and final accepting one.
452 # N1 One continuation byte (A0-BF) left. This is transitioned to
453 # immediately when the start byte indicates a two-byte sequence
454 # N2 Two continuation bytes left.
455 # N3 Three continuation bytes left.
456 # N4 Four continuation bytes left.
457 # N5 Five continuation bytes left.
458 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal
459 # (overlong); the other continuations transition to N2
460 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal
461 # (overlong); the other continuations transition to N3
462 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal
463 # (overlong); the other continuations transition to N4
464 # N9 Start byte is FE. Continuation bytes A[01] are illegal
465 # (overlong); the other continuations transition to N5
466 # 1 Reject. All transitions not mentioned above (except the single
467 # byte ones (as they are always legal) are to this state.
468
469 my $NUM_CLASSES = 16;
470 my $N0 = 0;
471 my $N1 = $N0 + $NUM_CLASSES;
472 my $N2 = $N1 + $NUM_CLASSES;
473 my $N3 = $N2 + $NUM_CLASSES;
474 my $N4 = $N3 + $NUM_CLASSES;
475 my $N5 = $N4 + $NUM_CLASSES;
476 my $N6 = $N5 + $NUM_CLASSES;
477 my $N7 = $N6 + $NUM_CLASSES;
478 my $N8 = $N7 + $NUM_CLASSES;
479 my $N9 = $N8 + $NUM_CLASSES;
480 my $N10 = $N9 + $NUM_CLASSES;
481
482 my @perl_extended_utf8_dfa;
483 my @i8 = (
484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
494 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
495 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
496 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
497 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
498 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
499 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
500 );
501 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
502 push @perl_extended_utf8_dfa, (
503 # Class:
504 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
505 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0
506 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1
507 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2
508 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3
509 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4
510 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5
511
512 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6
513 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7
514 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8
515 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9
516 );
71525f77 517 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
3de6d141
KW
518 $NUM_CLASSES);
519 }
520
123deead
KW
521 {
522 # This generates the dfa table for strict UTF-8, which rejects
523 # surrogates, non-characters, and above Unicode.
524 #
525 # The classes are
526 # 00-9F 0 Always legal at start
527 # A0 10 Not legal immediately after start bytes F0 F8
528 # A1 11 Not legal immediately after start bytes F0 F8,
529 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9
530 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9
531 # A9,AB,AD 14 Not legal immediately after start byte F0
532 # AE 15 Not legal immediately after start byte F0
533 # AF 16 Not legal immediately after start bytes F0
534 # B[0248AC] 17 Not legal immediately after start byte F9
535 # B[1359D] 18 Not legal immediately after start byte F9
536 # B6 19 Not legal immediately after start byte F9
537 # B7 20 Not legal immediately after start byte F9
538 # BE 21 Not legal immediately after start byte F9
539 # BF 22 Not legal immediately after start byte F9
540 # C0-C4 1 (reject, all are overlong)
541 # C5-DF 2 Accepts any legal continuation
542 # E0 1 (reject, all are overlong)
543 # E1-EF 3 Accepts any legal continuation
544 # F0 8 (has overlongs)
545 # F1 6 (has surrogates, non-chars)
546 # F2,F4,F6 4 Accepts any legal continuation
547 # F3,F5,F7 5 (has non-chars)
548 # F8 9 (has overlongs, non-chars)
549 # F9 7 (has non-chars, non-Unicode)
550 # FA-FF 1 (reject, all are non-Unicode)
551 #
552 # Here's the I8 for enough code points so that you can figure out what's
553 # going on:
554 #
555 # U+D800: \xF1\xB6\xA0\xA0
556 # U+DFFF: \xF1\xB7\xBF\xBF
557 # U+FDD0: \xF1\xBF\xAE\xB0
558 # U+FDEF: \xF1\xBF\xAF\xAF
559 # U+FFFE: \xF1\xBF\xBF\xBE
560 # U+1FFFE: \xF3\xBF\xBF\xBE
561 # U+2FFFE: \xF5\xBF\xBF\xBE
562 # U+3FFFE: \xF7\xBF\xBF\xBE
563 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
564 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
565 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
566 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
567 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
568 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
569 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
570 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
571 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
572 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
573 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
574 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
575 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
576 #
577 # The first part of the table maps bytes to character classes to reduce
578 # the size of the transition table and create bitmasks.
579 #
580 # The second part is a transition table that maps a combination of a
581 # state of the automaton and a character class to a new state. The
582 # numbering of the original nodes is retained, but some have been split
583 # so that there are new nodes. They mean:
584 # N0 The initial state, and final accepting one.
585 # N1 One continuation byte (A0-BF) left. This is transitioned to
586 # immediately when the start byte indicates a two-byte sequence
587 # N2 Two continuation bytes left.
588 # N3 Three continuation bytes left.
589 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
590 # (overlong); the other continuations transition to N2
591 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal
592 # (surrogates); BF transitions to N9; the other continuations to
593 # N2
594 # N6 Start byte is F[357]. Continuation byte BF transitions to N12;
595 # other continuations to N2
596 # N5 Start byte is F8. Continuation bytes A[0-7] are illegal
597 # (overlong); continuations A[9BDF] and B[13579BDF] transition to
598 # N14; the other continuations to N3
599 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1
600 # to N14; the other continuation bytes are illegal.
601 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to
602 # state N10; AF to N11; BF to N13; the other continuations to N1.
603 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are
604 # illegal (non-chars); the other continuations are legal
605 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are
606 # illegal (non-chars); the other continuations are legal
607 # N12 Initial sequence is F[357] BF. Continuation bytes BF
608 # transitions to N13; the other continuations to N1
609 # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is
610 # something that can lead to a non-char. Continuation bytes BE
611 # and BF are illegal (non-chars); the other continuations are
612 # legal
613 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
614 # Continuation byte BF transitions to N13; the other
615 # continuations to N2
616 # 1 Reject. All transitions not mentioned above (except the single
617 # byte ones (as they are always legal) are to this state.
618
619 my $NUM_CLASSES = 23;
620 my $N0 = 0;
621 my $N1 = $N0 + $NUM_CLASSES;
622 my $N2 = $N1 + $NUM_CLASSES;
623 my $N3 = $N2 + $NUM_CLASSES;
624 my $N4 = $N3 + $NUM_CLASSES;
625 my $N5 = $N4 + $NUM_CLASSES;
626 my $N6 = $N5 + $NUM_CLASSES;
627 my $N7 = $N6 + $NUM_CLASSES;
628 my $N8 = $N7 + $NUM_CLASSES;
629 my $N9 = $N8 + $NUM_CLASSES;
630 my $N10 = $N9 + $NUM_CLASSES;
631 my $N11 = $N10 + $NUM_CLASSES;
632 my $N12 = $N11 + $NUM_CLASSES;
633 my $N13 = $N12 + $NUM_CLASSES;
634 my $N14 = $N13 + $NUM_CLASSES;
635
636 my @strict_utf8_dfa;
637 my @i8 = (
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
643 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
645 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
648 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
649 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
650 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
651 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
652 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
653 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
654 );
655 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
656 push @strict_utf8_dfa, (
657 # Class:
658 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
659 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0
660 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1
661 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
662 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
663
664 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
665 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5
666 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
667 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
668 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8
669 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
670 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10
671 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11
672 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
673 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13
674 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
675 );
71525f77 676 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
123deead
KW
677 }
678
c5bfbb64
KW
679 {
680 # This generates the dfa table for C9 strict UTF-8, which rejects
681 # surrogates and above Unicode, but allows non-characters,.
682 #
683 # The classes are
684 # 00-9F 0 Always legal at start
685 # A0-A1 9 Not legal immediately after start bytes F0 F8
686 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9
687 # A8-AF 11 Not legal immediately after start bytes F0 F9
688 # B0-B5,B8-BF 12 Not legal immediately after start byte F9
689 # B6,B7 13
690 # C0-C4 1 (reject, all are overlong)
691 # C5-DF 2 Accepts any legal continuation
692 # E0 1 (reject, all are overlong)
693 # E1-EF 3 Accepts any legal continuation
694 # F0 6 (has overlongs)
695 # F1 5 (has surrogates)
696 # F2-F7 4 Accepts any legal continuation
697 # F8 8 (has overlongs)
698 # F9 7 (has non-Unicode)
699 # FA-FF 1 (reject, all are non-Unicode)
700 #
701 # The first part of the table maps bytes to character classes to reduce
702 # the size of the transition table and create bitmasks.
703 #
704 # The second part is a transition table that maps a combination of a
705 # state of the automaton and a character class to a new state. The
706 # numbering of the original nodes is retained, but some have been split
707 # so that there are new nodes. They mean:
708 # N0 The initial state, and final accepting one.
709 # N1 One continuation byte (A0-BF) left. This is transitioned to
710 # immediately when the start byte indicates a two-byte sequence
711 # N2 Two continuation bytes left.
712 # N3 Three continuation bytes left.
713 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
714 # (overlong); the other continuations transition to N2
715 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the
716 # other continuations transition to N2
717 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal
718 # (overlong); the other continuations transition to N3
719 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to
720 # N3; the other continuation bytes are illegal (non-Unicode)
721 # 1 Reject. All transitions not mentioned above (except the single
722 # byte ones (as they are always legal) are to this state.
723
724 my $NUM_CLASSES = 14;
725 my $N0 = 0;
726 my $N1 = $N0 + $NUM_CLASSES;
727 my $N2 = $N1 + $NUM_CLASSES;
728 my $N3 = $N2 + $NUM_CLASSES;
729 my $N4 = $N3 + $NUM_CLASSES;
730 my $N5 = $N4 + $NUM_CLASSES;
731 my $N6 = $N5 + $NUM_CLASSES;
732 my $N7 = $N6 + $NUM_CLASSES;
733
734 my @C9_utf8_dfa;
735 my @i8 = (
736 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
737 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
738 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
739 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
740 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
741 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
742 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
743 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
744 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
745 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
746 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
747 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
748 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
749 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
750 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
751 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
752 );
753 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
754 push @C9_utf8_dfa, (
755 # Class:
756 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
757 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0
758 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1
759 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2
760 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3
761
762 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4
763 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5
764 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6
765 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7
766 );
71525f77 767 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
c5bfbb64
KW
768 }
769
4bc3dcfa
KW
770 print $out_fh get_conditional_compile_line_end();
771}
772
6a5bc5ac 773print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
4bc3dcfa
KW
774
775read_only_bottom_close_and_rename($out_fh);