regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 use integer;
   5
   6 BEGIN { unshift @INC, '.' }
   7
   8 require './regen/regen_lib.pl';
   9 require './regen/charset_translations.pl';
  10
  11 # Generates the EBCDIC translation tables that were formerly hard-coded into
  12 # utfebcdic.h
  13
  14 my $out_fh = open_new('ebcdic_tables.h', '>',
  15         {style => '*', by => $0, });
  16
  17 sub get_column_headers ($$;$) {
  18     my ($row_hdr_len, $field_width, $dfa_columns) = @_;
  19     my $format;
  20     my $final_column_format;
  21     my $num_columns;
  22
  23     if (defined $dfa_columns) {
  24         $num_columns = $dfa_columns;
  25
  26         # Trailing blank to correspond with commas in the rows below
  27         $format = "%${field_width}d ";
  28     }
  29     else {  # Is a regular table
  30         $num_columns = 16;
  31
  32         # Use blanks to separate the fields
  33         $format = " " x ( $field_width
  34                         - 2);               # For the '_X'
  35         $format .= "_%X ";  # Again, trailing blank over the commas below
  36     }
  37
  38     my $header = "/*" . " " x ($row_hdr_len - length "/*");
  39
  40     # All but the final column
  41     $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
  42
  43      # Get rid of trailing blank, so that the final column takes up one less
  44      # space so that the "*/" doesn't extend past the commas in the rows below
  45     chop $header;
  46     $header .= sprintf $format, $num_columns - 1;
  47
  48     # Again, remove trailing blank
  49     chop $header;
  50
  51     return $header . "*/\n";
  52 }
  53
  54 sub output_table_start($$$) {
  55     my ($out_fh, $TYPE, $name) = @_;
  56
  57     my $declaration = "EXTCONST $TYPE $name\[\]";
  58     print $out_fh <<EOF;
  59 #  ifndef DOINIT
  60     $declaration;
  61 #  else
  62     $declaration = {
  63 EOF
  64 }
  65
  66 sub output_table_end($) {
  67     print $out_fh "};\n#  endif\n\n";
  68 }
  69
  70 sub output_table ($$;$) {
  71     my $table_ref = shift;
  72     my $name = shift;
  73
  74     # 0 => print in decimal
  75     # 1 => print in hex (translates code point to code point)
  76     # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  77     #      The number is how many columns in the part after the code point
  78     #      portion.
  79     #
  80     # code point tables in hex areasier to debug, but don't fit into 80
  81     # columns
  82     my $type = shift // 1;
  83
  84     my $print_in_hex = $type == 1;
  85     my $is_dfa = ($type >= 2) ? $type : 0;
  86     my $columns_after_256 = 16;
  87
  88     die "Requres 256 entries in table $name, got @$table_ref"
  89                                 if ! $is_dfa && @$table_ref != 256;
  90     if (! $is_dfa) {
  91         die "Requres 256 entries in table $name, got @$table_ref"
  92                                                         if @$table_ref != 256;
  93     }
  94     else {
  95         $columns_after_256 = $is_dfa;
  96
  97         print $out_fh <<'EOF';
  98
  99 /* The table below is adapted from
 100  *      http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 101  * See copyright notice at the beginning of this file.
 102  */
 103
 104 EOF
 105     }
 106
 107     # Highest number in the table
 108     my $max_entry = 0;
 109     $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
 110
 111     # We assume that every table has at least one two digit entry, and none
 112     # are more than three digit.
 113     my $field_width = ($print_in_hex)
 114                       ? 4
 115                       : (($max_entry) > 99 ? 3 : 2);
 116
 117     my $row_hdr_length;
 118     my $node_number_field_width;
 119     my $node_value_field_width;
 120
 121     # dfa tables have a special header for the rows in the transitions part of
 122     # the table.  It is longer than the regular one.
 123     if ($is_dfa) {
 124         my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
 125         $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
 126         $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
 127                                   ? 3 : 2;
 128         # The header starts with this template, and adds in the number of
 129         # digits needed to represent the maximum node number and its value
 130         $row_hdr_length = length("/*N=*/")
 131                         + $node_number_field_width
 132                         + $node_value_field_width;
 133     }
 134     else {
 135         $row_hdr_length = length "/*_X*/";  # Template for what the header
 136                                             # looks like
 137     }
 138
 139     # The table may not be representable in 8 bits.
 140     my $TYPE = 'U8';
 141     $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
 142
 143     output_table_start $out_fh, $TYPE, $name;
 144
 145     # First the headers for the columns
 146     print $out_fh get_column_headers($row_hdr_length, $field_width);
 147
 148     # Now the table body
 149     my $count = @$table_ref;
 150     my $last_was_nl = 1;
 151
 152     # Print each element individually, arranged in rows of columns
 153     for my $i (0 .. $count - 1) {
 154
 155         # Node number for here is -1 until get into the dfa state transitions
 156         my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
 157
 158         # Print row header at beginning of each row
 159         if ($last_was_nl) {
 160             if ($node >= 0) {
 161                 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
 162                                                $node_value_field_width, $i - 256;
 163             }
 164             else {  # Otherwise is regular row; print its number
 165                 printf $out_fh "/*%X_", $i / 16;
 166
 167                 # These rows in a dfa table require extra space so columns
 168                 # will align vertically (because the Ndd=ddd requires extra
 169                 # space)
 170                 if ($is_dfa) {
 171                     print  $out_fh " " x (  $node_number_field_width
 172                                           + $node_value_field_width);
 173                 }
 174                 print  $out_fh "*/";
 175             }
 176         }
 177
 178         if ($print_in_hex) {
 179             printf $out_fh "0x%02X", $table_ref->[$i];
 180         }
 181         else {
 182             printf $out_fh "%${field_width}d", $table_ref->[$i];
 183         }
 184
 185         print $out_fh ",", if $i < $count -1;   # No comma on final entry
 186
 187         # Add \n if at end of row, which is 16 columns until we get to the
 188         # transitions part
 189         if (   ($node < 0 && $i % 16 == 15)
 190             || ($node >= 0 && ($i -256) % $columns_after_256
 191                                                     == $columns_after_256 - 1))
 192         {
 193             print $out_fh "\n";
 194             $last_was_nl = 1;
 195         }
 196         else {
 197             $last_was_nl = 0;
 198         }
 199     }
 200
 201     # Print column footer
 202     print $out_fh get_column_headers($row_hdr_length, $field_width,
 203                                      ($is_dfa) ? $columns_after_256 : undef);
 204
 205     output_table_end($out_fh);
 206 }
 207
 208 print $out_fh <<'END';
 209
 210 #ifndef PERL_EBCDIC_TABLES_H_   /* Guard against nested #includes */
 211 #define PERL_EBCDIC_TABLES_H_   1
 212
 213 /* This file contains definitions for various tables used in EBCDIC handling.
 214  * More info is in utfebcdic.h
 215  *
 216  * Some of the tables are adapted from
 217  *      http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 218  * which requires this copyright notice:
 219
 220 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 221
 222 Permission is hereby granted, free of charge, to any person obtaining a copy of
 223 this software and associated documentation files (the "Software"), to deal in
 224 the Software without restriction, including without limitation the rights to
 225 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 226 of the Software, and to permit persons to whom the Software is furnished to do
 227 so, subject to the following conditions:
 228
 229 The above copyright notice and this permission notice shall be included in all
 230 copies or substantial portions of the Software.
 231
 232 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 233 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 234 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 235 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 236 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 237 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 238 SOFTWARE.
 239
 240 */
 241 END
 242
 243 my @charsets = get_supported_code_pages();
 244 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
 245 foreach my $charset (@charsets) {
 246     # we process the whole array several times, make a copy
 247     my @a2e = @{get_a2n($charset)};
 248
 249     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 250     print $out_fh "\n";
 251
 252     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
 253     output_table(\@a2e, "PL_a2e");
 254
 255     { # Construct the inverse
 256         my @e2a;
 257         for my $i (0 .. 255) {
 258             $e2a[$a2e[$i]] = $i;
 259         }
 260         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
 261         output_table(\@e2a, "PL_e2a");
 262     }
 263
 264     my @i82utf = @{get_I8_2_utf($charset)};
 265     print $out_fh <<END;
 266 /* (Confusingly named) Index is $charset I8 byte; value is
 267  * $charset UTF-EBCDIC equivalent */
 268 END
 269     output_table(\@i82utf, "PL_utf2e");
 270
 271     { #Construct the inverse
 272         my @utf2i8;
 273         for my $i (0 .. 255) {
 274             $utf2i8[$i82utf[$i]] = $i;
 275         }
 276         print $out_fh <<END;
 277 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 278  * $charset I8 equivalent */
 279 END
 280         output_table(\@utf2i8, "PL_e2utf");
 281     }
 282
 283     {
 284         my @utf8skip;
 285
 286         # These are invariants or continuation bytes.
 287         for my $i (0 .. 0xBF) {
 288             $utf8skip[$i82utf[$i]] = 1;
 289         }
 290
 291         # These are start bytes;  The skip is the number of consecutive highest
 292         # order 1-bits (up to 7)
 293         for my $i (0xC0 .. 255) {
 294             my $count;
 295             if ($i == 0b11111111) {
 296                 no warnings 'once';
 297                 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
 298             }
 299             elsif (($i & 0b11111110) == 0b11111110) {
 300                 $count= 7;
 301             }
 302             elsif (($i & 0b11111100) == 0b11111100) {
 303                 $count= 6;
 304             }
 305             elsif (($i & 0b11111000) == 0b11111000) {
 306                 $count= 5;
 307             }
 308             elsif (($i & 0b11110000) == 0b11110000) {
 309                 $count= 4;
 310             }
 311             elsif (($i & 0b11100000) == 0b11100000) {
 312                 $count= 3;
 313             }
 314             elsif (($i & 0b11000000) == 0b11000000) {
 315                 $count= 2;
 316             }
 317             else {
 318                 die "Something wrong for UTF8SKIP calculation for $i";
 319             }
 320             $utf8skip[$i82utf[$i]] = $count;
 321         }
 322
 323         print $out_fh <<END;
 324 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
 325  * (including for overlongs); 1 for continuation.  Adapted from the shadow
 326  * flags table in tr16.  The entries marked 9 in tr16 are continuation bytes
 327  * and are marked as length 1 here so that we can recover. */
 328 END
 329         output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
 330                                                      # in hex
 331     }
 332
 333     use feature 'unicode_strings';
 334
 335     {
 336         my @lc;
 337         for my $i (0 .. 255) {
 338             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 339         }
 340         print $out_fh
 341         "/* Index is $charset code point; value is its lowercase equivalent */\n";
 342         output_table(\@lc, "PL_latin1_lc");
 343     }
 344
 345     {
 346         my @uc;
 347         for my $i (0 .. 255) {
 348             my $uc = uc chr $i;
 349             if (length $uc > 1 || ord $uc > 255) {
 350                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 351             }
 352             $uc[$a2e[$i]] = $a2e[ord $uc];
 353         }
 354         print $out_fh <<END;
 355 /* Index is $charset code point; value is its uppercase equivalent.
 356  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 357  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 358 END
 359         output_table(\@uc, "PL_mod_latin1_uc");
 360     }
 361
 362     { # PL_fold
 363         my @ascii_fold;
 364         for my $i (0 .. 255) {  # Initialise to identity map
 365             $ascii_fold[$i] = $i;
 366         }
 367
 368         # Overwrite the entries that aren't identity
 369         for my $chr ('A' .. 'Z') {
 370             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 371         }
 372         for my $chr ('a' .. 'z') {
 373             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 374         }
 375         print $out_fh <<END;
 376 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 377  * is A-Z; all other code points map to themselves */
 378 END
 379         output_table(\@ascii_fold, "PL_fold");
 380     }
 381
 382     {
 383         my @latin1_fold;
 384         for my $i (0 .. 255) {
 385             my $char = chr $i;
 386             my $lc = lc $char;
 387
 388             # lc and uc adequately proxy for fold-case pairs in this 0-255
 389             # range
 390             my $uc = uc $char;
 391             $uc = $char if length $uc > 1 || ord $uc > 255;
 392             if ($lc ne $char) {
 393                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 394             }
 395             elsif ($uc ne $char) {
 396                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 397             }
 398             else {
 399                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 400             }
 401         }
 402         print $out_fh <<END;
 403 /* Index is $charset code point; value is its other fold-pair equivalent
 404  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 405  * the code point itself */
 406 END
 407         output_table(\@latin1_fold, "PL_fold_latin1");
 408     }
 409
 410     {
 411       # This generates the dfa table for perl extended UTF-8, which accepts
 412       # surrogates, non-characters, and accepts start bytes up through FE
 413       # (start byte FF has to be handled outside this dfa).  The class numbers
 414       # for start bytes are constrained so that they can be used as a shift
 415       # count for masking off the leading one bits
 416       #
 417       # The classes are
 418       #   00-9F           0
 419       #   A0-A1           7   Not legal immediately after start bytes F0 F8 FC
 420       #                       FE
 421       #   A2-A3           8   Not legal immediately after start bytes F0 F8 FC
 422       #   A4-A7           9   Not legal immediately after start bytes F0 F8
 423       #   A8-AF          10   Not legal immediately after start bytes F0
 424       #   B0-BF          11
 425       #   C0-C4           1
 426       #   C5-DF           2
 427       #   E0              1
 428       #   E1-EF           3
 429       #   F0             12
 430       #   F1-F7           4
 431       #   F8             13
 432       #   F9-FB           5
 433       #   FC             14
 434       #   FD              6
 435       #   FE             15
 436       #   FF              1
 437       #
 438       # Here's the I8 for the code points before which overlongs occur:
 439       # U+4000:     \xF0\xB0\xA0\xA0
 440       # U+40000:    \xF8\xA8\xA0\xA0\xA0
 441       # U+400000:   \xFC\xA4\xA0\xA0\xA0\xA0
 442       # U+4000000:  \xFE\xA2\xA0\xA0\xA0\xA0\xA0
 443       #
 444       # The first part of the table maps bytes to character classes to reduce
 445       # the size of the transition table and create bitmasks.
 446       #
 447       # The second part is a transition table that maps a combination of a
 448       # state of the automaton and a character class to a new state.  The
 449       # numbering of the original nodes is retained, but some have been split
 450       # so that there are new nodes.  They mean:
 451       # N0     The initial state, and final accepting one.
 452       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 453       #        immediately when the start byte indicates a two-byte sequence
 454       # N2     Two continuation bytes left.
 455       # N3     Three continuation bytes left.
 456       # N4     Four continuation bytes left.
 457       # N5     Five continuation bytes left.
 458       # N6     Start byte is F0.  Continuation bytes A[0-F] are illegal
 459       #        (overlong); the other continuations transition to N2
 460       # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
 461       #        (overlong); the other continuations transition to N3
 462       # N8     Start byte is FC.  Continuation bytes A[0-3] are illegal
 463       #        (overlong); the other continuations transition to N4
 464       # N9     Start byte is FE.  Continuation bytes A[01] are illegal
 465       #        (overlong); the other continuations transition to N5
 466       # 1      Reject.  All transitions not mentioned above (except the single
 467       #        byte ones (as they are always legal) are to this state.
 468
 469         my $NUM_CLASSES = 16;
 470         my $N0 = 0;
 471         my $N1 =  $N0 + $NUM_CLASSES;
 472         my $N2 =  $N1 + $NUM_CLASSES;
 473         my $N3 =  $N2 + $NUM_CLASSES;
 474         my $N4 =  $N3 + $NUM_CLASSES;
 475         my $N5 =  $N4 + $NUM_CLASSES;
 476         my $N6 =  $N5 + $NUM_CLASSES;
 477         my $N7 =  $N6 + $NUM_CLASSES;
 478         my $N8 =  $N7 + $NUM_CLASSES;
 479         my $N9 =  $N8 + $NUM_CLASSES;
 480         my $N10 = $N9 + $NUM_CLASSES;
 481
 482         my @perl_extended_utf8_dfa;
 483         my @i8 = (
 484                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 485                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 486                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 487                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 488                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 489                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 490                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 491                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 492                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 493                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 494                   7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
 495                  11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
 496                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 497                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 498                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 499                  12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
 500                 );
 501         $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 502         push @perl_extended_utf8_dfa, (
 503           # Class:
 504           # 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 505             0,  1,$N1,$N2,$N3,$N4,$N5,  1,  1,  1,  1,  1,$N6,$N7,$N8,$N9, # N0
 506             1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1, # N1
 507             1,  1,  1,  1,  1,  1,  1,$N1,$N1,$N1,$N1,$N1,  1,  1,  1,  1, # N2
 508             1,  1,  1,  1,  1,  1,  1,$N2,$N2,$N2,$N2,$N2,  1,  1,  1,  1, # N3
 509             1,  1,  1,  1,  1,  1,  1,$N3,$N3,$N3,$N3,$N3,  1,  1,  1,  1, # N4
 510             1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,$N4,$N4,  1,  1,  1,  1, # N5
 511
 512             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N2,  1,  1,  1,  1, # N6
 513             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N3,$N3,  1,  1,  1,  1, # N7
 514             1,  1,  1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,  1,  1,  1,  1, # N8
 515             1,  1,  1,  1,  1,  1,  1,  1,$N5,$N5,$N5,$N5,  1,  1,  1,  1, # N9
 516         );
 517         output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
 518                                                                    $NUM_CLASSES);
 519     }
 520
 521     {
 522       # This generates the dfa table for strict UTF-8, which rejects
 523       # surrogates, non-characters, and above Unicode.
 524       #
 525       # The classes are
 526       #   00-9F           0   Always legal at start
 527       #   A0             10   Not legal immediately after start bytes F0 F8
 528       #   A1             11   Not legal immediately after start bytes F0 F8,
 529       #   A2-A7          12   Not legal immediately after start bytes F0 F8 F9
 530       #   A8,AA,AC       13   Not legal immediately after start bytes F0 F9
 531       #   A9,AB,AD       14   Not legal immediately after start byte F0
 532       #   AE             15   Not legal immediately after start byte F0
 533       #   AF             16   Not legal immediately after start bytes F0
 534       #   B[0248AC]      17   Not legal immediately after start byte F9
 535       #   B[1359D]       18   Not legal immediately after start byte F9
 536       #   B6             19   Not legal immediately after start byte F9
 537       #   B7             20   Not legal immediately after start byte F9
 538       #   BE             21   Not legal immediately after start byte F9
 539       #   BF             22   Not legal immediately after start byte F9
 540       #   C0-C4           1   (reject, all are overlong)
 541       #   C5-DF           2   Accepts any legal continuation
 542       #   E0              1   (reject, all are overlong)
 543       #   E1-EF           3   Accepts any legal continuation
 544       #   F0              8   (has overlongs)
 545       #   F1              6   (has surrogates, non-chars)
 546       #   F2,F4,F6        4   Accepts any legal continuation
 547       #   F3,F5,F7        5   (has non-chars)
 548       #   F8              9   (has overlongs, non-chars)
 549       #   F9              7   (has non-chars, non-Unicode)
 550       #   FA-FF           1   (reject, all are non-Unicode)
 551       #
 552       # Here's the I8 for enough code points so that you can figure out what's
 553       # going on:
 554       #
 555       # U+D800: \xF1\xB6\xA0\xA0
 556       # U+DFFF: \xF1\xB7\xBF\xBF
 557       # U+FDD0: \xF1\xBF\xAE\xB0
 558       # U+FDEF: \xF1\xBF\xAF\xAF
 559       # U+FFFE: \xF1\xBF\xBF\xBE
 560       # U+1FFFE: \xF3\xBF\xBF\xBE
 561       # U+2FFFE: \xF5\xBF\xBF\xBE
 562       # U+3FFFE: \xF7\xBF\xBF\xBE
 563       # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
 564       # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
 565       # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
 566       # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
 567       # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
 568       # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
 569       # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
 570       # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
 571       # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
 572       # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
 573       # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
 574       # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
 575       # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
 576       #
 577       # The first part of the table maps bytes to character classes to reduce
 578       # the size of the transition table and create bitmasks.
 579       #
 580       # The second part is a transition table that maps a combination of a
 581       # state of the automaton and a character class to a new state.  The
 582       # numbering of the original nodes is retained, but some have been split
 583       # so that there are new nodes.  They mean:
 584       # N0     The initial state, and final accepting one.
 585       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 586       #        immediately when the start byte indicates a two-byte sequence
 587       # N2     Two continuation bytes left.
 588       # N3     Three continuation bytes left.
 589       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 590       #        (overlong); the other continuations transition to N2
 591       # N5     Start byte is F1.  Continuation bytes B6 and B7 are illegal
 592       #        (surrogates); BF transitions to N9; the other continuations to
 593       #        N2
 594       # N6     Start byte is F[357].  Continuation byte BF transitions to N12;
 595       #        other continuations to N2
 596       # N5     Start byte is F8.  Continuation bytes A[0-7] are illegal
 597       #        (overlong); continuations A[9BDF] and B[13579BDF] transition to
 598       #        N14; the other continuations to N3
 599       # N8     Start byte is F9.  Continuation byte A0 transitions to N3; A1
 600       #        to N14; the other continuation bytes are illegal.
 601       # N9     Initial sequence is F1 BF.  Continuation byte AE transitions to
 602       #        state N10; AF to N11; BF to N13; the other continuations to N1.
 603       # N10    Initial sequence is F1 BF AE.  Continuation bytes B0-BF are
 604       #        illegal (non-chars); the other continuations are legal
 605       # N11    Initial sequence is F1 BF AF.  Continuation bytes A0-AF are
 606       #        illegal (non-chars); the other continuations are legal
 607       # N12    Initial sequence is F[357] BF.  Continuation bytes BF
 608       #        transitions to N13; the other continuations to N1
 609       # N13    Initial sequence is F[1357] BF BF or F8 x BF (where x is
 610       #        something that can lead to a non-char.  Continuation bytes BE
 611       #        and BF are illegal (non-chars); the other continuations are
 612       #        legal
 613       # N14    Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
 614       #        Continuation byte BF transitions to N13; the other
 615       #        continuations to N2
 616       # 1      Reject.  All transitions not mentioned above (except the single
 617       #        byte ones (as they are always legal) are to this state.
 618
 619         my $NUM_CLASSES = 23;
 620         my $N0 = 0;
 621         my $N1 =  $N0 + $NUM_CLASSES;
 622         my $N2 =  $N1 + $NUM_CLASSES;
 623         my $N3 =  $N2 + $NUM_CLASSES;
 624         my $N4 =  $N3 + $NUM_CLASSES;
 625         my $N5 =  $N4 + $NUM_CLASSES;
 626         my $N6 =  $N5 + $NUM_CLASSES;
 627         my $N7 =  $N6 + $NUM_CLASSES;
 628         my $N8 =  $N7 + $NUM_CLASSES;
 629         my $N9 =  $N8 + $NUM_CLASSES;
 630         my $N10 = $N9 + $NUM_CLASSES;
 631         my $N11 = $N10 + $NUM_CLASSES;
 632         my $N12 = $N11 + $NUM_CLASSES;
 633         my $N13 = $N12 + $NUM_CLASSES;
 634         my $N14 = $N13 + $NUM_CLASSES;
 635
 636         my @strict_utf8_dfa;
 637         my @i8 = (
 638                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 639                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 640                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 641                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 642                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 643                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 644                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 645                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 646                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 647                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 648                  10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
 649                  17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
 650                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 651                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 652                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 653                   8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 654                 );
 655         $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 656         push @strict_utf8_dfa, (
 657           # Class:
 658           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22
 659             0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N0
 660             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, # N1
 661             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
 662             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
 663
 664             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
 665             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,   1,   1, $N2, $N9, # N5
 666             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
 667             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
 668             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N3,$N14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N8
 669             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
 670             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1, # N10
 671             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0, # N11
 672             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
 673             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1, # N13
 674             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
 675         );
 676         output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
 677     }
 678
 679     {
 680       # This generates the dfa table for C9 strict UTF-8, which rejects
 681       # surrogates and above Unicode, but allows non-characters,.
 682       #
 683       # The classes are
 684       #   00-9F           0   Always legal at start
 685       #   A0-A1           9   Not legal immediately after start bytes F0 F8
 686       #   A2-A7          10   Not legal immediately after start bytes F0 F8 F9
 687       #   A8-AF          11   Not legal immediately after start bytes F0 F9
 688       #   B0-B5,B8-BF    12   Not legal immediately after start byte F9
 689       #   B6,B7          13
 690       #   C0-C4           1   (reject, all are overlong)
 691       #   C5-DF           2   Accepts any legal continuation
 692       #   E0              1   (reject, all are overlong)
 693       #   E1-EF           3   Accepts any legal continuation
 694       #   F0              6   (has overlongs)
 695       #   F1              5   (has surrogates)
 696       #   F2-F7           4   Accepts any legal continuation
 697       #   F8              8   (has overlongs)
 698       #   F9              7   (has non-Unicode)
 699       #   FA-FF           1   (reject, all are non-Unicode)
 700       #
 701       # The first part of the table maps bytes to character classes to reduce
 702       # the size of the transition table and create bitmasks.
 703       #
 704       # The second part is a transition table that maps a combination of a
 705       # state of the automaton and a character class to a new state.  The
 706       # numbering of the original nodes is retained, but some have been split
 707       # so that there are new nodes.  They mean:
 708       # N0     The initial state, and final accepting one.
 709       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 710       #        immediately when the start byte indicates a two-byte sequence
 711       # N2     Two continuation bytes left.
 712       # N3     Three continuation bytes left.
 713       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 714       #        (overlong); the other continuations transition to N2
 715       # N5     Start byte is F1.  B6 and B7 are illegal (surrogates); the
 716       #        other continuations transition to N2
 717       # N6     Start byte is F8.  Continuation bytes A[0-7] are illegal
 718       #        (overlong); the other continuations transition to N3
 719       # N7     Start byte is F9.  Continuation bytes A0 and A1 transition to
 720       #        N3; the other continuation bytes are illegal (non-Unicode)
 721       # 1      Reject.  All transitions not mentioned above (except the single
 722       #        byte ones (as they are always legal) are to this state.
 723
 724         my $NUM_CLASSES = 14;
 725         my $N0 = 0;
 726         my $N1 =  $N0 + $NUM_CLASSES;
 727         my $N2 =  $N1 + $NUM_CLASSES;
 728         my $N3 =  $N2 + $NUM_CLASSES;
 729         my $N4 =  $N3 + $NUM_CLASSES;
 730         my $N5 =  $N4 + $NUM_CLASSES;
 731         my $N6 =  $N5 + $NUM_CLASSES;
 732         my $N7 =  $N6 + $NUM_CLASSES;
 733
 734         my @C9_utf8_dfa;
 735         my @i8 = (
 736                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 737                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 738                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 739                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 740                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 741                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 742                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 743                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 744                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 745                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 746                   9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
 747                  12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
 748                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 749                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 750                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 751                   6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 752                 );
 753         $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 754         push @C9_utf8_dfa, (
 755           # Class:
 756           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13
 757             0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6,  1,   1,   1,   1,   1, # N0
 758             1,1,  1,  1,  1,  1,  1,  1,  1,  0,   0,   0,   0,   0, # N1
 759             1,1,  1,  1,  1,  1,  1,  1,  1,$N1, $N1, $N1, $N1, $N1, # N2
 760             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2, $N2, # N3
 761
 762             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1, $N2, $N2, # N4
 763             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2,   1, # N5
 764             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1, $N3, $N3, $N3, # N6
 765             1,1,  1,  1,  1,  1,  1,  1,  1,$N3,   1,   1,   1,   1, # N7
 766         );
 767         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
 768     }
 769
 770     print $out_fh get_conditional_compile_line_end();
 771 }
 772
 773 print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
 774
 775 read_only_bottom_close_and_rename($out_fh);