This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
basic.t: Provide descriptions for all unit tests
[perl5.git] / regen / ebcdic.pl
CommitLineData
4bc3dcfa
KW
1use v5.16.0;
2use strict;
3use warnings;
94e72741 4use integer;
3d7c117d
MB
5
6BEGIN { unshift @INC, '.' }
7
8require './regen/regen_lib.pl';
9require './regen/charset_translations.pl';
4bc3dcfa
KW
10
11# Generates the EBCDIC translation tables that were formerly hard-coded into
12# utfebcdic.h
13
14my $out_fh = open_new('ebcdic_tables.h', '>',
15 {style => '*', by => $0, });
16
94e72741
KW
17sub get_column_headers ($$;$) {
18 my ($row_hdr_len, $field_width, $dfa_columns) = @_;
19 my $format;
20 my $final_column_format;
21 my $num_columns;
22
23 if (defined $dfa_columns) {
24 $num_columns = $dfa_columns;
25
26 # Trailing blank to correspond with commas in the rows below
27 $format = "%${field_width}d ";
28 }
29 else { # Is a regular table
30 $num_columns = 16;
31
32 # Use blanks to separate the fields
33 $format = " " x ( $field_width
34 - 2); # For the '_X'
35 $format .= "_%X "; # Again, trailing blank over the commas below
36 }
37
38 my $header = "/*" . " " x ($row_hdr_len - length "/*");
39
40 # All but the final column
41 $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
42
43 # Get rid of trailing blank, so that the final column takes up one less
44 # space so that the "*/" doesn't extend past the commas in the rows below
45 chop $header;
46 $header .= sprintf $format, $num_columns - 1;
47
48 # Again, remove trailing blank
49 chop $header;
50
51 return $header . "*/\n";
52}
53
702cfe48 54sub output_table ($$;$) {
4bc3dcfa
KW
55 my $table_ref = shift;
56 my $name = shift;
57
94e72741
KW
58 # 0 => print in decimal
59 # 1 => print in hex (translates code point to code point)
60 # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
61 # The number is how many columns in the part after the code point
62 # portion.
63 #
64 # code point tables in hex areasier to debug, but don't fit into 80
65 # columns
66 my $type = shift // 1;
67
68 my $print_in_hex = $type == 1;
69 my $is_dfa = ($type >= 2) ? $type : 0;
70 my $columns_after_256 = 16;
71
72 die "Requres 256 entries in table $name, got @$table_ref"
73 if ! $is_dfa && @$table_ref != 256;
74 if (! $is_dfa) {
75 die "Requres 256 entries in table $name, got @$table_ref"
76 if @$table_ref != 256;
77 }
78 else {
79 $columns_after_256 = $is_dfa;
80
81 print $out_fh <<'EOF';
82
83/* The table below is adapted from
84 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
85 * See copyright notice at the beginning of this file.
86 */
87
88EOF
89 }
90
91 # Highest number in the table
92 my $max_entry = 0;
93 $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
43bf0d50 94
94e72741
KW
95 # We assume that every table has at least one two digit entry, and none
96 # are more than three digit.
97 my $field_width = ($print_in_hex)
98 ? 4
99 : (($max_entry) > 99 ? 3 : 2);
4bc3dcfa 100
94e72741
KW
101 my $row_hdr_length;
102 my $node_number_field_width;
103 my $node_value_field_width;
104
105 # dfa tables have a special header for the rows in the transitions part of
106 # the table. It is longer than the regular one.
107 if ($is_dfa) {
108 my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
109 $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
110 $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
111 ? 3 : 2;
112 # The header starts with this template, and adds in the number of
113 # digits needed to represent the maximum node number and its value
114 $row_hdr_length = length("/*N=*/")
115 + $node_number_field_width
116 + $node_value_field_width;
117 }
118 else {
119 $row_hdr_length = length "/*_X*/"; # Template for what the header
120 # looks like
121 }
122
123 # The table may not be representable in 8 bits.
124 my $TYPE = 'U8';
125 $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
126
127 my $declaration = "EXTCONST $TYPE $name\[\]";
c05125c5
KW
128 print $out_fh <<EOF;
129# ifndef DOINIT
130# $declaration;
131# else
132# $declaration = {
133EOF
4bc3dcfa 134
94e72741
KW
135 # First the headers for the columns
136 print $out_fh get_column_headers($row_hdr_length, $field_width);
137
138 # Now the table body
139 my $count = @$table_ref;
140 my $last_was_nl = 1;
141
142 # Print each element individually, arranged in rows of columns
143 for my $i (0 .. $count - 1) {
144
145 # Node number for here is -1 until get into the dfa state transitions
146 my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
147
148 # Print row header at beginning of each row
149 if ($last_was_nl) {
150 if ($node >= 0) {
151 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
152 $node_value_field_width, $i - 256;
153 }
154 else { # Otherwise is regular row; print its number
155 printf $out_fh "/*%X_", $i / 16;
156
157 # These rows in a dfa table require extra space so columns
158 # will align vertically (because the Ndd=ddd requires extra
159 # space)
160 if ($is_dfa) {
161 print $out_fh " " x ( $node_number_field_width
162 + $node_value_field_width);
163 }
164 print $out_fh "*/";
165 }
166 }
167
43bf0d50 168 if ($print_in_hex) {
702cfe48 169 printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50
KW
170 }
171 else {
94e72741
KW
172 printf $out_fh "%${field_width}d", $table_ref->[$i];
173 }
174
175 print $out_fh ",", if $i < $count -1; # No comma on final entry
176
177 # Add \n if at end of row, which is 16 columns until we get to the
178 # transitions part
179 if ( ($node < 0 && $i % 16 == 15)
180 || ($node >= 0 && ($i -256) % $columns_after_256
181 == $columns_after_256 - 1))
182 {
183 print $out_fh "\n";
184 $last_was_nl = 1;
185 }
186 else {
187 $last_was_nl = 0;
43bf0d50 188 }
4bc3dcfa 189 }
94e72741
KW
190
191 # Print column footer
192 print $out_fh get_column_headers($row_hdr_length, $field_width,
193 ($is_dfa) ? $columns_after_256 : undef);
194
c05125c5 195 print $out_fh "};\n# endif\n\n";
4bc3dcfa
KW
196}
197
3de6d141 198print $out_fh <<'END';
4bc3dcfa 199
6a5bc5ac
KW
200#ifndef PERL_EBCDIC_TABLES_H_ /* Guard against nested #includes */
201#define PERL_EBCDIC_TABLES_H_ 1
4bc3dcfa
KW
202
203/* This file contains definitions for various tables used in EBCDIC handling.
3de6d141
KW
204 * More info is in utfebcdic.h
205 *
206 * Some of the tables are adapted from
207 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
208 * which requires this copyright notice:
209
210Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
211
212Permission is hereby granted, free of charge, to any person obtaining a copy of
213this software and associated documentation files (the "Software"), to deal in
214the Software without restriction, including without limitation the rights to
215use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
216of the Software, and to permit persons to whom the Software is furnished to do
217so, subject to the following conditions:
218
219The above copyright notice and this permission notice shall be included in all
220copies or substantial portions of the Software.
221
222THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
223IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
224FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
225AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
226LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
227OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
228SOFTWARE.
229
230*/
4bc3dcfa
KW
231END
232
233my @charsets = get_supported_code_pages();
234shift @charsets; # ASCII is the 0th, and we don't deal with that here.
235foreach my $charset (@charsets) {
c30a0cf2
TC
236 # we process the whole array several times, make a copy
237 my @a2e = @{get_a2n($charset)};
4bc3dcfa
KW
238
239 print $out_fh "\n" . get_conditional_compile_line_start($charset);
240 print $out_fh "\n";
241
242 print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
243 output_table(\@a2e, "PL_a2e");
244
245 { # Construct the inverse
246 my @e2a;
247 for my $i (0 .. 255) {
248 $e2a[$a2e[$i]] = $i;
249 }
250 print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
251 output_table(\@e2a, "PL_e2a");
252 }
253
e0dcdb0a 254 my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa
KW
255 print $out_fh <<END;
256/* (Confusingly named) Index is $charset I8 byte; value is
257 * $charset UTF-EBCDIC equivalent */
258END
259 output_table(\@i82utf, "PL_utf2e");
260
261 { #Construct the inverse
262 my @utf2i8;
263 for my $i (0 .. 255) {
264 $utf2i8[$i82utf[$i]] = $i;
265 }
266 print $out_fh <<END;
267/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
268 * $charset I8 equivalent */
269END
270 output_table(\@utf2i8, "PL_e2utf");
271 }
272
273 {
274 my @utf8skip;
275
276 # These are invariants or continuation bytes.
277 for my $i (0 .. 0xBF) {
278 $utf8skip[$i82utf[$i]] = 1;
279 }
280
281 # These are start bytes; The skip is the number of consecutive highest
282 # order 1-bits (up to 7)
283 for my $i (0xC0 .. 255) {
284 my $count;
c0236afe
KW
285 if ($i == 0b11111111) {
286 no warnings 'once';
287 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
288 }
289 elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa
KW
290 $count= 7;
291 }
292 elsif (($i & 0b11111100) == 0b11111100) {
293 $count= 6;
294 }
295 elsif (($i & 0b11111000) == 0b11111000) {
296 $count= 5;
297 }
298 elsif (($i & 0b11110000) == 0b11110000) {
299 $count= 4;
300 }
301 elsif (($i & 0b11100000) == 0b11100000) {
302 $count= 3;
303 }
304 elsif (($i & 0b11000000) == 0b11000000) {
305 $count= 2;
306 }
307 else {
308 die "Something wrong for UTF8SKIP calculation for $i";
309 }
310 $utf8skip[$i82utf[$i]] = $count;
311 }
312
313 print $out_fh <<END;
4719093e
KW
314/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
315 * (including for overlongs); 1 for continuation. Adapted from the shadow
316 * flags table in tr16. The entries marked 9 in tr16 are continuation bytes
317 * and are marked as length 1 here so that we can recover. */
4bc3dcfa 318END
702cfe48
KW
319 output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
320 # in hex
4bc3dcfa
KW
321 }
322
323 use feature 'unicode_strings';
324
325 {
326 my @lc;
327 for my $i (0 .. 255) {
328 $lc[$a2e[$i]] = $a2e[ord lc chr $i];
329 }
94e72741
KW
330 print $out_fh
331 "/* Index is $charset code point; value is its lowercase equivalent */\n";
4bc3dcfa
KW
332 output_table(\@lc, "PL_latin1_lc");
333 }
334
335 {
336 my @uc;
337 for my $i (0 .. 255) {
338 my $uc = uc chr $i;
339 if (length $uc > 1 || ord $uc > 255) {
340 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
341 }
342 $uc[$a2e[$i]] = $a2e[ord $uc];
343 }
344 print $out_fh <<END;
345/* Index is $charset code point; value is its uppercase equivalent.
346 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
347 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
348END
349 output_table(\@uc, "PL_mod_latin1_uc");
350 }
351
352 { # PL_fold
353 my @ascii_fold;
354 for my $i (0 .. 255) { # Initialise to identity map
355 $ascii_fold[$i] = $i;
356 }
357
358 # Overwrite the entries that aren't identity
359 for my $chr ('A' .. 'Z') {
360 $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
361 }
362 for my $chr ('a' .. 'z') {
363 $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
364 }
365 print $out_fh <<END;
366/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
367 * is A-Z; all other code points map to themselves */
368END
369 output_table(\@ascii_fold, "PL_fold");
370 }
371
372 {
373 my @latin1_fold;
374 for my $i (0 .. 255) {
375 my $char = chr $i;
376 my $lc = lc $char;
377
378 # lc and uc adequately proxy for fold-case pairs in this 0-255
379 # range
380 my $uc = uc $char;
381 $uc = $char if length $uc > 1 || ord $uc > 255;
382 if ($lc ne $char) {
383 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
384 }
385 elsif ($uc ne $char) {
386 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
387 }
388 else {
389 $latin1_fold[$a2e[$i]] = $a2e[$i];
390 }
391 }
392 print $out_fh <<END;
393/* Index is $charset code point; value is its other fold-pair equivalent
394 * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
395 * the code point itself */
396END
397 output_table(\@latin1_fold, "PL_fold_latin1");
398 }
399
3de6d141
KW
400 {
401 # This generates the dfa table for perl extended UTF-8, which accepts
402 # surrogates, non-characters, and accepts start bytes up through FE
403 # (start byte FF has to be handled outside this dfa). The class numbers
404 # for start bytes are constrained so that they can be used as a shift
405 # count for masking off the leading one bits
406 #
407 # The classes are
408 # 00-9F 0
409 # A0-A1 7 Not legal immediately after start bytes F0 F8 FC
410 # FE
411 # A2-A3 8 Not legal immediately after start bytes F0 F8 FC
412 # A4-A7 9 Not legal immediately after start bytes F0 F8
413 # A8-AF 10 Not legal immediately after start bytes F0
414 # B0-BF 11
415 # C0-C4 1
416 # C5-DF 2
417 # E0 1
418 # E1-EF 3
419 # F0 12
420 # F1-F7 4
421 # F8 13
422 # F9-FB 5
423 # FC 14
424 # FD 6
425 # FE 15
426 # FF 1
427 #
428 # Here's the I8 for the code points before which overlongs occur:
429 # U+4000: \xF0\xB0\xA0\xA0
430 # U+40000: \xF8\xA8\xA0\xA0\xA0
431 # U+400000: \xFC\xA4\xA0\xA0\xA0\xA0
432 # U+4000000: \xFE\xA2\xA0\xA0\xA0\xA0\xA0
433 #
434 # The first part of the table maps bytes to character classes to reduce
435 # the size of the transition table and create bitmasks.
436 #
437 # The second part is a transition table that maps a combination of a
438 # state of the automaton and a character class to a new state. The
439 # numbering of the original nodes is retained, but some have been split
440 # so that there are new nodes. They mean:
441 # N0 The initial state, and final accepting one.
442 # N1 One continuation byte (A0-BF) left. This is transitioned to
443 # immediately when the start byte indicates a two-byte sequence
444 # N2 Two continuation bytes left.
445 # N3 Three continuation bytes left.
446 # N4 Four continuation bytes left.
447 # N5 Five continuation bytes left.
448 # N6 Start byte is F0. Continuation bytes A[0-F] are illegal
449 # (overlong); the other continuations transition to N2
450 # N7 Start byte is F8. Continuation bytes A[0-7] are illegal
451 # (overlong); the other continuations transition to N3
452 # N8 Start byte is FC. Continuation bytes A[0-3] are illegal
453 # (overlong); the other continuations transition to N4
454 # N9 Start byte is FE. Continuation bytes A[01] are illegal
455 # (overlong); the other continuations transition to N5
456 # 1 Reject. All transitions not mentioned above (except the single
457 # byte ones (as they are always legal) are to this state.
458
459 my $NUM_CLASSES = 16;
460 my $N0 = 0;
461 my $N1 = $N0 + $NUM_CLASSES;
462 my $N2 = $N1 + $NUM_CLASSES;
463 my $N3 = $N2 + $NUM_CLASSES;
464 my $N4 = $N3 + $NUM_CLASSES;
465 my $N5 = $N4 + $NUM_CLASSES;
466 my $N6 = $N5 + $NUM_CLASSES;
467 my $N7 = $N6 + $NUM_CLASSES;
468 my $N8 = $N7 + $NUM_CLASSES;
469 my $N9 = $N8 + $NUM_CLASSES;
470 my $N10 = $N9 + $NUM_CLASSES;
471
472 my @perl_extended_utf8_dfa;
473 my @i8 = (
474 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
475 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
476 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
484 7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
485 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
486 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
487 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
488 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
489 12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
490 );
491 $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
492 push @perl_extended_utf8_dfa, (
493 # Class:
494 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
495 0, 1,$N1,$N2,$N3,$N4,$N5, 1, 1, 1, 1, 1,$N6,$N7,$N8,$N9, # N0
496 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, # N1
497 1, 1, 1, 1, 1, 1, 1,$N1,$N1,$N1,$N1,$N1, 1, 1, 1, 1, # N2
498 1, 1, 1, 1, 1, 1, 1,$N2,$N2,$N2,$N2,$N2, 1, 1, 1, 1, # N3
499 1, 1, 1, 1, 1, 1, 1,$N3,$N3,$N3,$N3,$N3, 1, 1, 1, 1, # N4
500 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4,$N4,$N4, 1, 1, 1, 1, # N5
501
502 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N2, 1, 1, 1, 1, # N6
503 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,$N3,$N3, 1, 1, 1, 1, # N7
504 1, 1, 1, 1, 1, 1, 1, 1, 1,$N4,$N4,$N4, 1, 1, 1, 1, # N8
505 1, 1, 1, 1, 1, 1, 1, 1,$N5,$N5,$N5,$N5, 1, 1, 1, 1, # N9
506 );
71525f77 507 output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
3de6d141
KW
508 $NUM_CLASSES);
509 }
510
123deead
KW
511 {
512 # This generates the dfa table for strict UTF-8, which rejects
513 # surrogates, non-characters, and above Unicode.
514 #
515 # The classes are
516 # 00-9F 0 Always legal at start
517 # A0 10 Not legal immediately after start bytes F0 F8
518 # A1 11 Not legal immediately after start bytes F0 F8,
519 # A2-A7 12 Not legal immediately after start bytes F0 F8 F9
520 # A8,AA,AC 13 Not legal immediately after start bytes F0 F9
521 # A9,AB,AD 14 Not legal immediately after start byte F0
522 # AE 15 Not legal immediately after start byte F0
523 # AF 16 Not legal immediately after start bytes F0
524 # B[0248AC] 17 Not legal immediately after start byte F9
525 # B[1359D] 18 Not legal immediately after start byte F9
526 # B6 19 Not legal immediately after start byte F9
527 # B7 20 Not legal immediately after start byte F9
528 # BE 21 Not legal immediately after start byte F9
529 # BF 22 Not legal immediately after start byte F9
530 # C0-C4 1 (reject, all are overlong)
531 # C5-DF 2 Accepts any legal continuation
532 # E0 1 (reject, all are overlong)
533 # E1-EF 3 Accepts any legal continuation
534 # F0 8 (has overlongs)
535 # F1 6 (has surrogates, non-chars)
536 # F2,F4,F6 4 Accepts any legal continuation
537 # F3,F5,F7 5 (has non-chars)
538 # F8 9 (has overlongs, non-chars)
539 # F9 7 (has non-chars, non-Unicode)
540 # FA-FF 1 (reject, all are non-Unicode)
541 #
542 # Here's the I8 for enough code points so that you can figure out what's
543 # going on:
544 #
545 # U+D800: \xF1\xB6\xA0\xA0
546 # U+DFFF: \xF1\xB7\xBF\xBF
547 # U+FDD0: \xF1\xBF\xAE\xB0
548 # U+FDEF: \xF1\xBF\xAF\xAF
549 # U+FFFE: \xF1\xBF\xBF\xBE
550 # U+1FFFE: \xF3\xBF\xBF\xBE
551 # U+2FFFE: \xF5\xBF\xBF\xBE
552 # U+3FFFE: \xF7\xBF\xBF\xBE
553 # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
554 # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
555 # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
556 # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
557 # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
558 # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
559 # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
560 # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
561 # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
562 # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
563 # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
564 # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
565 # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
566 #
567 # The first part of the table maps bytes to character classes to reduce
568 # the size of the transition table and create bitmasks.
569 #
570 # The second part is a transition table that maps a combination of a
571 # state of the automaton and a character class to a new state. The
572 # numbering of the original nodes is retained, but some have been split
573 # so that there are new nodes. They mean:
574 # N0 The initial state, and final accepting one.
575 # N1 One continuation byte (A0-BF) left. This is transitioned to
576 # immediately when the start byte indicates a two-byte sequence
577 # N2 Two continuation bytes left.
578 # N3 Three continuation bytes left.
579 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
580 # (overlong); the other continuations transition to N2
581 # N5 Start byte is F1. Continuation bytes B6 and B7 are illegal
582 # (surrogates); BF transitions to N9; the other continuations to
583 # N2
584 # N6 Start byte is F[357]. Continuation byte BF transitions to N12;
585 # other continuations to N2
586 # N5 Start byte is F8. Continuation bytes A[0-7] are illegal
587 # (overlong); continuations A[9BDF] and B[13579BDF] transition to
588 # N14; the other continuations to N3
589 # N8 Start byte is F9. Continuation byte A0 transitions to N3; A1
590 # to N14; the other continuation bytes are illegal.
591 # N9 Initial sequence is F1 BF. Continuation byte AE transitions to
592 # state N10; AF to N11; BF to N13; the other continuations to N1.
593 # N10 Initial sequence is F1 BF AE. Continuation bytes B0-BF are
594 # illegal (non-chars); the other continuations are legal
595 # N11 Initial sequence is F1 BF AF. Continuation bytes A0-AF are
596 # illegal (non-chars); the other continuations are legal
597 # N12 Initial sequence is F[357] BF. Continuation bytes BF
598 # transitions to N13; the other continuations to N1
599 # N13 Initial sequence is F[1357] BF BF or F8 x BF (where x is
600 # something that can lead to a non-char. Continuation bytes BE
601 # and BF are illegal (non-chars); the other continuations are
602 # legal
603 # N14 Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
604 # Continuation byte BF transitions to N13; the other
605 # continuations to N2
606 # 1 Reject. All transitions not mentioned above (except the single
607 # byte ones (as they are always legal) are to this state.
608
609 my $NUM_CLASSES = 23;
610 my $N0 = 0;
611 my $N1 = $N0 + $NUM_CLASSES;
612 my $N2 = $N1 + $NUM_CLASSES;
613 my $N3 = $N2 + $NUM_CLASSES;
614 my $N4 = $N3 + $NUM_CLASSES;
615 my $N5 = $N4 + $NUM_CLASSES;
616 my $N6 = $N5 + $NUM_CLASSES;
617 my $N7 = $N6 + $NUM_CLASSES;
618 my $N8 = $N7 + $NUM_CLASSES;
619 my $N9 = $N8 + $NUM_CLASSES;
620 my $N10 = $N9 + $NUM_CLASSES;
621 my $N11 = $N10 + $NUM_CLASSES;
622 my $N12 = $N11 + $NUM_CLASSES;
623 my $N13 = $N12 + $NUM_CLASSES;
624 my $N14 = $N13 + $NUM_CLASSES;
625
626 my @strict_utf8_dfa;
627 my @i8 = (
628 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
633 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
635 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
636 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
638 10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
639 17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
640 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
641 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
642 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
643 8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
644 );
645 $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
646 push @strict_utf8_dfa, (
647 # Class:
648 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
649 0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N0
650 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # N1
651 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
652 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
653
654 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
655 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, 1, 1, $N2, $N9, # N5
656 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
657 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
658 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N3,$N14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # N8
659 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
660 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, # N10
661 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, # N11
662 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
663 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, # N13
664 1,1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
665 );
71525f77 666 output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
123deead
KW
667 }
668
c5bfbb64
KW
669 {
670 # This generates the dfa table for C9 strict UTF-8, which rejects
671 # surrogates and above Unicode, but allows non-characters,.
672 #
673 # The classes are
674 # 00-9F 0 Always legal at start
675 # A0-A1 9 Not legal immediately after start bytes F0 F8
676 # A2-A7 10 Not legal immediately after start bytes F0 F8 F9
677 # A8-AF 11 Not legal immediately after start bytes F0 F9
678 # B0-B5,B8-BF 12 Not legal immediately after start byte F9
679 # B6,B7 13
680 # C0-C4 1 (reject, all are overlong)
681 # C5-DF 2 Accepts any legal continuation
682 # E0 1 (reject, all are overlong)
683 # E1-EF 3 Accepts any legal continuation
684 # F0 6 (has overlongs)
685 # F1 5 (has surrogates)
686 # F2-F7 4 Accepts any legal continuation
687 # F8 8 (has overlongs)
688 # F9 7 (has non-Unicode)
689 # FA-FF 1 (reject, all are non-Unicode)
690 #
691 # The first part of the table maps bytes to character classes to reduce
692 # the size of the transition table and create bitmasks.
693 #
694 # The second part is a transition table that maps a combination of a
695 # state of the automaton and a character class to a new state. The
696 # numbering of the original nodes is retained, but some have been split
697 # so that there are new nodes. They mean:
698 # N0 The initial state, and final accepting one.
699 # N1 One continuation byte (A0-BF) left. This is transitioned to
700 # immediately when the start byte indicates a two-byte sequence
701 # N2 Two continuation bytes left.
702 # N3 Three continuation bytes left.
703 # N4 Start byte is F0. Continuation bytes A[0-F] are illegal
704 # (overlong); the other continuations transition to N2
705 # N5 Start byte is F1. B6 and B7 are illegal (surrogates); the
706 # other continuations transition to N2
707 # N6 Start byte is F8. Continuation bytes A[0-7] are illegal
708 # (overlong); the other continuations transition to N3
709 # N7 Start byte is F9. Continuation bytes A0 and A1 transition to
710 # N3; the other continuation bytes are illegal (non-Unicode)
711 # 1 Reject. All transitions not mentioned above (except the single
712 # byte ones (as they are always legal) are to this state.
713
714 my $NUM_CLASSES = 14;
715 my $N0 = 0;
716 my $N1 = $N0 + $NUM_CLASSES;
717 my $N2 = $N1 + $NUM_CLASSES;
718 my $N3 = $N2 + $NUM_CLASSES;
719 my $N4 = $N3 + $NUM_CLASSES;
720 my $N5 = $N4 + $NUM_CLASSES;
721 my $N6 = $N5 + $NUM_CLASSES;
722 my $N7 = $N6 + $NUM_CLASSES;
723
724 my @C9_utf8_dfa;
725 my @i8 = (
726 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
734 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
735 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
736 9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
737 12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
738 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
739 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
740 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
741 6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
742 );
743 $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
744 push @C9_utf8_dfa, (
745 # Class:
746 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
747 0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6, 1, 1, 1, 1, 1, # N0
748 1,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, # N1
749 1,1, 1, 1, 1, 1, 1, 1, 1,$N1, $N1, $N1, $N1, $N1, # N2
750 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, $N2, # N3
751
752 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N2, $N2, # N4
753 1,1, 1, 1, 1, 1, 1, 1, 1,$N2, $N2, $N2, $N2, 1, # N5
754 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, $N3, $N3, $N3, # N6
755 1,1, 1, 1, 1, 1, 1, 1, 1,$N3, 1, 1, 1, 1, # N7
756 );
71525f77 757 output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
c5bfbb64
KW
758 }
759
4bc3dcfa
KW
760 print $out_fh get_conditional_compile_line_end();
761}
762
6a5bc5ac 763print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
4bc3dcfa
KW
764
765read_only_bottom_close_and_rename($out_fh);