regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 use integer;
   5
   6 BEGIN { unshift @INC, '.' }
   7
   8 require './regen/regen_lib.pl';
   9 require './regen/charset_translations.pl';
  10
  11 # Generates the EBCDIC translation tables that were formerly hard-coded into
  12 # utfebcdic.h
  13
  14 my $out_fh = open_new('ebcdic_tables.h', '>',
  15         {style => '*', by => $0, });
  16
  17 sub get_column_headers ($$;$) {
  18     my ($row_hdr_len, $field_width, $dfa_columns) = @_;
  19     my $format;
  20     my $final_column_format;
  21     my $num_columns;
  22
  23     if (defined $dfa_columns) {
  24         $num_columns = $dfa_columns;
  25
  26         # Trailing blank to correspond with commas in the rows below
  27         $format = "%${field_width}d ";
  28     }
  29     else {  # Is a regular table
  30         $num_columns = 16;
  31
  32         # Use blanks to separate the fields
  33         $format = " " x ( $field_width
  34                         - 2);               # For the '_X'
  35         $format .= "_%X ";  # Again, trailing blank over the commas below
  36     }
  37
  38     my $header = "/*" . " " x ($row_hdr_len - length "/*");
  39
  40     # All but the final column
  41     $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
  42
  43      # Get rid of trailing blank, so that the final column takes up one less
  44      # space so that the "*/" doesn't extend past the commas in the rows below
  45     chop $header;
  46     $header .= sprintf $format, $num_columns - 1;
  47
  48     # Again, remove trailing blank
  49     chop $header;
  50
  51     return $header . "*/\n";
  52 }
  53
  54 sub output_table_start($$$;$) {
  55     my ($out_fh, $TYPE, $name, $size) = @_;
  56
  57     $size = "" unless defined $size;
  58
  59     # Anything locale related will be written on
  60     my $const = ($name !~ /locale/i) ? 'CONST' : "";
  61
  62     my $declaration = "EXT$const $TYPE $name\[$size\]";
  63     print $out_fh <<EOF;
  64 #  ifndef DOINIT
  65     $declaration;
  66 #  else
  67     $declaration = {
  68 EOF
  69 }
  70
  71 sub output_table_end($) {
  72     print $out_fh "};\n#  endif\n\n";
  73 }
  74
  75 sub output_table ($$;$) {
  76     my $table_ref = shift;
  77     my $name = shift;
  78
  79     my $size = @$table_ref;
  80
  81     # 0 => print in decimal
  82     # 1 => print in hex (translates code point to code point)
  83     # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  84     #      The number is how many columns in the part after the code point
  85     #      portion.
  86     #
  87     # code point tables in hex areasier to debug, but don't fit into 80
  88     # columns
  89     my $type = shift // 1;
  90
  91     my $print_in_hex = $type == 1;
  92     my $is_dfa = ($type >= 2) ? $type : 0;
  93     my $columns_after_256 = 16;
  94
  95     die "Requres 256 entries in table $name, got @$table_ref"
  96                                 if ! $is_dfa && @$table_ref != 256;
  97     if (! $is_dfa) {
  98         die "Requres 256 entries in table $name, got @$table_ref"
  99                                                         if @$table_ref != 256;
 100     }
 101     else {
 102         $columns_after_256 = $is_dfa;
 103
 104         print $out_fh <<'EOF';
 105
 106 /* The table below is adapted from
 107  *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 108  * See copyright notice at the beginning of this file.
 109  */
 110
 111 EOF
 112     }
 113
 114     # Highest number in the table
 115     my $max_entry = 0;
 116     $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
 117
 118     # We assume that every table has at least one two digit entry, and none
 119     # are more than three digit.
 120     my $field_width = ($print_in_hex)
 121                       ? 4
 122                       : (($max_entry) > 99 ? 3 : 2);
 123
 124     my $row_hdr_length;
 125     my $node_number_field_width;
 126     my $node_value_field_width;
 127
 128     # dfa tables have a special header for the rows in the transitions part of
 129     # the table.  It is longer than the regular one.
 130     if ($is_dfa) {
 131         my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
 132         $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
 133         $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
 134                                   ? 3 : 2;
 135         # The header starts with this template, and adds in the number of
 136         # digits needed to represent the maximum node number and its value
 137         $row_hdr_length = length("/*N=*/")
 138                         + $node_number_field_width
 139                         + $node_value_field_width;
 140     }
 141     else {
 142         $row_hdr_length = length "/*_X*/";  # Template for what the header
 143                                             # looks like
 144     }
 145
 146     # The table may not be representable in 8 bits.
 147     my $TYPE = 'U8';
 148     $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
 149
 150     output_table_start $out_fh, $TYPE, $name, $size;
 151
 152     # First the headers for the columns
 153     print $out_fh get_column_headers($row_hdr_length, $field_width);
 154
 155     # Now the table body
 156     my $count = @$table_ref;
 157     my $last_was_nl = 1;
 158
 159     # Print each element individually, arranged in rows of columns
 160     for my $i (0 .. $count - 1) {
 161
 162         # Node number for here is -1 until get into the dfa state transitions
 163         my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
 164
 165         # Print row header at beginning of each row
 166         if ($last_was_nl) {
 167             if ($node >= 0) {
 168                 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
 169                                                $node_value_field_width, $i - 256;
 170             }
 171             else {  # Otherwise is regular row; print its number
 172                 printf $out_fh "/*%X_", $i / 16;
 173
 174                 # These rows in a dfa table require extra space so columns
 175                 # will align vertically (because the Ndd=ddd requires extra
 176                 # space)
 177                 if ($is_dfa) {
 178                     print  $out_fh " " x (  $node_number_field_width
 179                                           + $node_value_field_width);
 180                 }
 181                 print  $out_fh "*/";
 182             }
 183         }
 184
 185         if ($print_in_hex) {
 186             printf $out_fh "0x%02X", $table_ref->[$i];
 187         }
 188         else {
 189             printf $out_fh "%${field_width}d", $table_ref->[$i];
 190         }
 191
 192         print $out_fh ",", if $i < $count -1;   # No comma on final entry
 193
 194         # Add \n if at end of row, which is 16 columns until we get to the
 195         # transitions part
 196         if (   ($node < 0 && $i % 16 == 15)
 197             || ($node >= 0 && ($i -256) % $columns_after_256
 198                                                     == $columns_after_256 - 1))
 199         {
 200             print $out_fh "\n";
 201             $last_was_nl = 1;
 202         }
 203         else {
 204             $last_was_nl = 0;
 205         }
 206     }
 207
 208     # Print column footer
 209     print $out_fh get_column_headers($row_hdr_length, $field_width,
 210                                      ($is_dfa) ? $columns_after_256 : undef);
 211
 212     output_table_end($out_fh);
 213 }
 214
 215 print $out_fh <<'END';
 216
 217 #ifndef PERL_EBCDIC_TABLES_H_   /* Guard against nested #includes */
 218 #define PERL_EBCDIC_TABLES_H_   1
 219
 220 /* This file contains definitions for various tables used in EBCDIC handling.
 221  * More info is in utfebcdic.h
 222  *
 223  * Some of the tables are adapted from
 224  *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 225  * which requires this copyright notice:
 226
 227 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 228
 229 Permission is hereby granted, free of charge, to any person obtaining a copy of
 230 this software and associated documentation files (the "Software"), to deal in
 231 the Software without restriction, including without limitation the rights to
 232 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 233 of the Software, and to permit persons to whom the Software is furnished to do
 234 so, subject to the following conditions:
 235
 236 The above copyright notice and this permission notice shall be included in all
 237 copies or substantial portions of the Software.
 238
 239 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 240 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 241 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 242 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 243 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 244 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 245 SOFTWARE.
 246
 247 */
 248 END
 249
 250 my @charsets = get_supported_code_pages();
 251 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
 252 foreach my $charset (@charsets) {
 253     # we process the whole array several times, make a copy
 254     my @a2e = @{get_a2n($charset)};
 255     my @e2a;
 256
 257     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 258     print $out_fh "\n";
 259
 260     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
 261     output_table(\@a2e, "PL_a2e");
 262
 263     { # Construct the inverse
 264         for my $i (0 .. 255) {
 265             $e2a[$a2e[$i]] = $i;
 266         }
 267         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
 268         output_table(\@e2a, "PL_e2a");
 269     }
 270
 271     my @i82utf = @{get_I8_2_utf($charset)};
 272     print $out_fh <<END;
 273 /* (Confusingly named) Index is $charset I8 byte; value is
 274  * $charset UTF-EBCDIC equivalent */
 275 END
 276     output_table(\@i82utf, "PL_utf2e");
 277
 278     { #Construct the inverse
 279         my @utf2i8;
 280         for my $i (0 .. 255) {
 281             $utf2i8[$i82utf[$i]] = $i;
 282         }
 283         print $out_fh <<END;
 284 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 285  * $charset I8 equivalent */
 286 END
 287         output_table(\@utf2i8, "PL_e2utf");
 288     }
 289
 290     {
 291         my @utf8skip;
 292
 293         # These are invariants or continuation bytes.
 294         for my $i (0 .. 0xBF) {
 295             $utf8skip[$i82utf[$i]] = 1;
 296         }
 297
 298         # These are start bytes;  The skip is the number of consecutive highest
 299         # order 1-bits (up to 7)
 300         for my $i (0xC0 .. 255) {
 301             my $count;
 302             if ($i == 0b11111111) {
 303                 no warnings 'once';
 304                 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
 305             }
 306             elsif (($i & 0b11111110) == 0b11111110) {
 307                 $count= 7;
 308             }
 309             elsif (($i & 0b11111100) == 0b11111100) {
 310                 $count= 6;
 311             }
 312             elsif (($i & 0b11111000) == 0b11111000) {
 313                 $count= 5;
 314             }
 315             elsif (($i & 0b11110000) == 0b11110000) {
 316                 $count= 4;
 317             }
 318             elsif (($i & 0b11100000) == 0b11100000) {
 319                 $count= 3;
 320             }
 321             elsif (($i & 0b11000000) == 0b11000000) {
 322                 $count= 2;
 323             }
 324             else {
 325                 die "Something wrong for UTF8SKIP calculation for $i";
 326             }
 327             $utf8skip[$i82utf[$i]] = $count;
 328         }
 329
 330         print $out_fh <<END;
 331 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
 332  * (including for overlongs); 1 for continuation.  Adapted from the shadow
 333  * flags table in tr16.  The entries marked 9 in tr16 are continuation bytes
 334  * and are marked as length 1 here so that we can recover. */
 335 END
 336         output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
 337                                                      # in hex
 338     }
 339
 340     use feature 'unicode_strings';
 341
 342     {
 343         my @lc;
 344         for my $i (0 .. 255) {
 345             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 346         }
 347         print $out_fh
 348         "/* Index is $charset code point; value is its lowercase equivalent */\n";
 349         output_table(\@lc, "PL_latin1_lc");
 350     }
 351
 352     {
 353         my @uc;
 354         for my $i (0 .. 255) {
 355             my $uc = uc chr $i;
 356             if (length $uc > 1 || ord $uc > 255) {
 357                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 358             }
 359             $uc[$a2e[$i]] = $a2e[ord $uc];
 360         }
 361         print $out_fh <<END;
 362 /* Index is $charset code point; value is its uppercase equivalent.
 363  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 364  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 365 END
 366         output_table(\@uc, "PL_mod_latin1_uc");
 367     }
 368
 369     { # PL_fold
 370         my @ascii_fold;
 371         for my $i (0 .. 255) {  # Initialise to identity map
 372             $ascii_fold[$i] = $i;
 373         }
 374
 375         # Overwrite the entries that aren't identity
 376         for my $chr ('A' .. 'Z') {
 377             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 378         }
 379         for my $chr ('a' .. 'z') {
 380             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 381         }
 382         print $out_fh <<END;
 383 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 384  * is A-Z; all other code points map to themselves */
 385 END
 386         output_table(\@ascii_fold, "PL_fold");
 387
 388         # This table is also the correct folding for the default C locale
 389         output_table(\@ascii_fold, "PL_fold_locale");
 390     }
 391
 392     {
 393         my @latin1_fold;
 394         for my $i (0 .. 255) {
 395             my $char = chr $i;
 396             my $lc = lc $char;
 397
 398             # lc and uc adequately proxy for fold-case pairs in this 0-255
 399             # range
 400             my $uc = uc $char;
 401             $uc = $char if length $uc > 1 || ord $uc > 255;
 402             if ($lc ne $char) {
 403                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 404             }
 405             elsif ($uc ne $char) {
 406                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 407             }
 408             else {
 409                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 410             }
 411         }
 412         print $out_fh <<END;
 413 /* Index is $charset code point; value is its other fold-pair equivalent
 414  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 415  * the code point itself */
 416 END
 417         output_table(\@latin1_fold, "PL_fold_latin1");
 418     }
 419
 420     {
 421       # This generates the dfa table for perl extended UTF-8, which accepts
 422       # surrogates, non-characters, and accepts start bytes up through FE
 423       # (start byte FF has to be handled outside this dfa).  The class numbers
 424       # for start bytes are constrained so that they can be used as a shift
 425       # count for masking off the leading one bits
 426       #
 427       # The classes are
 428       #   00-9F           0
 429       #   A0-A1           7   Not legal immediately after start bytes F0 F8 FC
 430       #                       FE
 431       #   A2-A3           8   Not legal immediately after start bytes F0 F8 FC
 432       #   A4-A7           9   Not legal immediately after start bytes F0 F8
 433       #   A8-AF          10   Not legal immediately after start bytes F0
 434       #   B0-BF          11
 435       #   C0-C4           1
 436       #   C5-DF           2
 437       #   E0              1
 438       #   E1-EF           3
 439       #   F0             12
 440       #   F1-F7           4
 441       #   F8             13
 442       #   F9-FB           5
 443       #   FC             14
 444       #   FD              6
 445       #   FE             15
 446       #   FF              1
 447       #
 448       # Here's the I8 for the code points before which overlongs occur:
 449       # U+4000:     \xF0\xB0\xA0\xA0
 450       # U+40000:    \xF8\xA8\xA0\xA0\xA0
 451       # U+400000:   \xFC\xA4\xA0\xA0\xA0\xA0
 452       # U+4000000:  \xFE\xA2\xA0\xA0\xA0\xA0\xA0
 453       #
 454       # The first part of the table maps bytes to character classes to reduce
 455       # the size of the transition table and create bitmasks.
 456       #
 457       # The second part is a transition table that maps a combination of a
 458       # state of the automaton and a character class to a new state.  The
 459       # numbering of the original nodes is retained, but some have been split
 460       # so that there are new nodes.  They mean:
 461       # N0     The initial state, and final accepting one.
 462       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 463       #        immediately when the start byte indicates a two-byte sequence
 464       # N2     Two continuation bytes left.
 465       # N3     Three continuation bytes left.
 466       # N4     Four continuation bytes left.
 467       # N5     Five continuation bytes left.
 468       # N6     Start byte is F0.  Continuation bytes A[0-F] are illegal
 469       #        (overlong); the other continuations transition to N2
 470       # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
 471       #        (overlong); the other continuations transition to N3
 472       # N8     Start byte is FC.  Continuation bytes A[0-3] are illegal
 473       #        (overlong); the other continuations transition to N4
 474       # N9     Start byte is FE.  Continuation bytes A[01] are illegal
 475       #        (overlong); the other continuations transition to N5
 476       # 1      Reject.  All transitions not mentioned above (except the single
 477       #        byte ones (as they are always legal) are to this state.
 478
 479         my $NUM_CLASSES = 16;
 480         my $N0 = 0;
 481         my $N1 =  $N0 + $NUM_CLASSES;
 482         my $N2 =  $N1 + $NUM_CLASSES;
 483         my $N3 =  $N2 + $NUM_CLASSES;
 484         my $N4 =  $N3 + $NUM_CLASSES;
 485         my $N5 =  $N4 + $NUM_CLASSES;
 486         my $N6 =  $N5 + $NUM_CLASSES;
 487         my $N7 =  $N6 + $NUM_CLASSES;
 488         my $N8 =  $N7 + $NUM_CLASSES;
 489         my $N9 =  $N8 + $NUM_CLASSES;
 490         my $N10 = $N9 + $NUM_CLASSES;
 491
 492         my @perl_extended_utf8_dfa;
 493         my @i8 = (
 494                 # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
 495                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 496                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 497                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 498                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 499                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 500                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 501                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 502                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 503                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 504                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 505                   7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
 506                  11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
 507                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 508                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 509                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 510                  12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
 511                 );
 512         $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 513         push @perl_extended_utf8_dfa, (
 514           # Class:
 515           # 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 516             0,  1,$N1,$N2,$N3,$N4,$N5,  1,  1,  1,  1,  1,$N6,$N7,$N8,$N9, # N0
 517             1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1, # N1
 518             1,  1,  1,  1,  1,  1,  1,$N1,$N1,$N1,$N1,$N1,  1,  1,  1,  1, # N2
 519             1,  1,  1,  1,  1,  1,  1,$N2,$N2,$N2,$N2,$N2,  1,  1,  1,  1, # N3
 520             1,  1,  1,  1,  1,  1,  1,$N3,$N3,$N3,$N3,$N3,  1,  1,  1,  1, # N4
 521             1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,$N4,$N4,  1,  1,  1,  1, # N5
 522
 523             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N2,  1,  1,  1,  1, # N6
 524             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N3,$N3,  1,  1,  1,  1, # N7
 525             1,  1,  1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,  1,  1,  1,  1, # N8
 526             1,  1,  1,  1,  1,  1,  1,  1,$N5,$N5,$N5,$N5,  1,  1,  1,  1, # N9
 527         );
 528         output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
 529                                                                    $NUM_CLASSES);
 530     }
 531
 532     {
 533       # This generates the dfa table for strict UTF-8, which rejects
 534       # surrogates, non-characters, and above Unicode.
 535       #
 536       # The classes are
 537       #   00-9F           0   Always legal at start
 538       #   A0             10   Not legal immediately after start bytes F0 F8
 539       #   A1             11   Not legal immediately after start bytes F0 F8,
 540       #   A2-A7          12   Not legal immediately after start bytes F0 F8 F9
 541       #   A8,AA,AC       13   Not legal immediately after start bytes F0 F9
 542       #   A9,AB,AD       14   Not legal immediately after start byte F0
 543       #   AE             15   Not legal immediately after start byte F0
 544       #   AF             16   Not legal immediately after start bytes F0
 545       #   B[0248AC]      17   Not legal immediately after start byte F9
 546       #   B[1359D]       18   Not legal immediately after start byte F9
 547       #   B6             19   Not legal immediately after start byte F9
 548       #   B7             20   Not legal immediately after start byte F9
 549       #   BE             21   Not legal immediately after start byte F9
 550       #   BF             22   Not legal immediately after start byte F9
 551       #   C0-C4           1   (reject, all are overlong)
 552       #   C5-DF           2   Accepts any legal continuation
 553       #   E0              1   (reject, all are overlong)
 554       #   E1-EF           3   Accepts any legal continuation
 555       #   F0              8   (has overlongs)
 556       #   F1              6   (has surrogates, non-chars)
 557       #   F2,F4,F6        4   Accepts any legal continuation
 558       #   F3,F5,F7        5   (has non-chars)
 559       #   F8              9   (has overlongs, non-chars)
 560       #   F9              7   (has non-chars, non-Unicode)
 561       #   FA-FF           1   (reject, all are non-Unicode)
 562       #
 563       # Here's the I8 for enough code points so that you can figure out what's
 564       # going on:
 565       #
 566       # U+D800: \xF1\xB6\xA0\xA0
 567       # U+DFFF: \xF1\xB7\xBF\xBF
 568       # U+FDD0: \xF1\xBF\xAE\xB0
 569       # U+FDEF: \xF1\xBF\xAF\xAF
 570       # U+FFFE: \xF1\xBF\xBF\xBE
 571       # U+1FFFE: \xF3\xBF\xBF\xBE
 572       # U+2FFFE: \xF5\xBF\xBF\xBE
 573       # U+3FFFE: \xF7\xBF\xBF\xBE
 574       # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
 575       # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
 576       # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
 577       # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
 578       # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
 579       # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
 580       # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
 581       # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
 582       # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
 583       # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
 584       # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
 585       # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
 586       # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
 587       #
 588       # The first part of the table maps bytes to character classes to reduce
 589       # the size of the transition table and create bitmasks.
 590       #
 591       # The second part is a transition table that maps a combination of a
 592       # state of the automaton and a character class to a new state.  The
 593       # numbering of the original nodes is retained, but some have been split
 594       # so that there are new nodes.  They mean:
 595       # N0     The initial state, and final accepting one.
 596       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 597       #        immediately when the start byte indicates a two-byte sequence
 598       # N2     Two continuation bytes left.
 599       # N3     Three continuation bytes left.
 600       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 601       #        (overlong); the other continuations transition to N2
 602       # N5     Start byte is F1.  Continuation bytes B6 and B7 are illegal
 603       #        (surrogates); BF transitions to N9; the other continuations to
 604       #        N2
 605       # N6     Start byte is F[357].  Continuation byte BF transitions to N12;
 606       #        other continuations to N2
 607       # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
 608       #        (overlong); continuations A[9BDF] and B[13579BDF] transition to
 609       #        N14; the other continuations to N3
 610       # N8     Start byte is F9.  Continuation byte A0 transitions to N3; A1
 611       #        to N14; the other continuation bytes are illegal.
 612       # N9     Initial sequence is F1 BF.  Continuation byte AE transitions to
 613       #        state N10; AF to N11; BF to N13; the other continuations to N1.
 614       # N10    Initial sequence is F1 BF AE.  Continuation bytes B0-BF are
 615       #        illegal (non-chars); the other continuations are legal
 616       # N11    Initial sequence is F1 BF AF.  Continuation bytes A0-AF are
 617       #        illegal (non-chars); the other continuations are legal
 618       # N12    Initial sequence is F[357] BF.  Continuation bytes BF
 619       #        transitions to N13; the other continuations to N1
 620       # N13    Initial sequence is F[1357] BF BF or F8 x y BF (where x and y
 621       #        are something that can lead to a non-char.  Continuation bytes
 622       #        BE and BF are illegal (non-chars); the other continuations are
 623       #        legal
 624       # N14    Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
 625       #        Continuation byte BF transitions to N15; the other
 626       #        continuations to N2
 627       # N15    Initial sequence is F8 A[9BDF] BF; or F8 B[13579BDF] BF; or
 628       #        F9 A1 BF.  Continuation byte BF transitions to N16; the other
 629       #        continuations to N2
 630       # N16    Initial sequence is F8 A[9BDF] BF BF; or F8 B[13579BDF] BF BF;
 631       #        or F9 A1 BF BF.  Continuation bytes BE and BF are illegal
 632       #        (non-chars); the other continuations are legal
 633       # 1      Reject.  All transitions not mentioned above (except the single
 634       #        byte ones (as they are always legal) are to this state.
 635
 636         my $NUM_CLASSES = 23;
 637         my $N0 = 0;
 638         my $N1 =  $N0 + $NUM_CLASSES;
 639         my $N2 =  $N1 + $NUM_CLASSES;
 640         my $N3 =  $N2 + $NUM_CLASSES;
 641         my $N4 =  $N3 + $NUM_CLASSES;
 642         my $N5 =  $N4 + $NUM_CLASSES;
 643         my $N6 =  $N5 + $NUM_CLASSES;
 644         my $N7 =  $N6 + $NUM_CLASSES;
 645         my $N8 =  $N7 + $NUM_CLASSES;
 646         my $N9 =  $N8 + $NUM_CLASSES;
 647         my $N10 = $N9 + $NUM_CLASSES;
 648         my $N11 = $N10 + $NUM_CLASSES;
 649         my $N12 = $N11 + $NUM_CLASSES;
 650         my $N13 = $N12 + $NUM_CLASSES;
 651         my $N14 = $N13 + $NUM_CLASSES;
 652         my $N15 = $N14 + $NUM_CLASSES;
 653
 654         my @strict_utf8_dfa;
 655         my @i8 = (
 656                 # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
 657                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 658                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 659                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 660                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 661                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 662                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 663                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 664                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 665                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 666                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 667                  10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
 668                  17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
 669                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 670                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 671                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 672                   8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 673                 );
 674         $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 675         push @strict_utf8_dfa, (
 676           # Class:
 677           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22
 678             0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N0
 679             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, # N1
 680             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
 681             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
 682
 683             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
 684             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,   1,   1, $N2, $N9, # N5
 685             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
 686             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
 687             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N3,$N14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N8
 688             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
 689             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1, # N10
 690             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0, # N11
 691             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
 692             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1, # N13
 693             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N15, # N14
 694             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N15
 695         );
 696         output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
 697     }
 698
 699     {
 700       # This generates the dfa table for C9 strict UTF-8, which rejects
 701       # surrogates and above Unicode, but allows non-characters,.
 702       #
 703       # The classes are
 704       #   00-9F           0   Always legal at start
 705       #   A0-A1           9   Not legal immediately after start bytes F0 F8
 706       #   A2-A7          10   Not legal immediately after start bytes F0 F8 F9
 707       #   A8-AF          11   Not legal immediately after start bytes F0 F9
 708       #   B0-B5,B8-BF    12   Not legal immediately after start byte F9
 709       #   B6,B7          13
 710       #   C0-C4           1   (reject, all are overlong)
 711       #   C5-DF           2   Accepts any legal continuation
 712       #   E0              1   (reject, all are overlong)
 713       #   E1-EF           3   Accepts any legal continuation
 714       #   F0              6   (has overlongs)
 715       #   F1              5   (has surrogates)
 716       #   F2-F7           4   Accepts any legal continuation
 717       #   F8              8   (has overlongs)
 718       #   F9              7   (has non-Unicode)
 719       #   FA-FF           1   (reject, all are non-Unicode)
 720       #
 721       # The first part of the table maps bytes to character classes to reduce
 722       # the size of the transition table and create bitmasks.
 723       #
 724       # The second part is a transition table that maps a combination of a
 725       # state of the automaton and a character class to a new state.  The
 726       # numbering of the original nodes is retained, but some have been split
 727       # so that there are new nodes.  They mean:
 728       # N0     The initial state, and final accepting one.
 729       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 730       #        immediately when the start byte indicates a two-byte sequence
 731       # N2     Two continuation bytes left.
 732       # N3     Three continuation bytes left.
 733       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 734       #        (overlong); the other continuations transition to N2
 735       # N5     Start byte is F1.  B6 and B7 are illegal (surrogates); the
 736       #        other continuations transition to N2
 737       # N6     Start byte is F8.  Continuation bytes A[0-7] are illegal
 738       #        (overlong); the other continuations transition to N3
 739       # N7     Start byte is F9.  Continuation bytes A0 and A1 transition to
 740       #        N3; the other continuation bytes are illegal (non-Unicode)
 741       # 1      Reject.  All transitions not mentioned above (except the single
 742       #        byte ones (as they are always legal) are to this state.
 743
 744         my $NUM_CLASSES = 14;
 745         my $N0 = 0;
 746         my $N1 =  $N0 + $NUM_CLASSES;
 747         my $N2 =  $N1 + $NUM_CLASSES;
 748         my $N3 =  $N2 + $NUM_CLASSES;
 749         my $N4 =  $N3 + $NUM_CLASSES;
 750         my $N5 =  $N4 + $NUM_CLASSES;
 751         my $N6 =  $N5 + $NUM_CLASSES;
 752         my $N7 =  $N6 + $NUM_CLASSES;
 753
 754         my @C9_utf8_dfa;
 755         my @i8 = (
 756                 # 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
 757                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 758                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 759                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 760                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 761                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 762                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 763                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 764                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 765                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 766                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 767                   9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
 768                  12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
 769                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 770                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 771                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 772                   6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 773                 );
 774         $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 775         push @C9_utf8_dfa, (
 776           # Class:
 777           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13
 778             0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6,  1,   1,   1,   1,   1, # N0
 779             1,1,  1,  1,  1,  1,  1,  1,  1,  0,   0,   0,   0,   0, # N1
 780             1,1,  1,  1,  1,  1,  1,  1,  1,$N1, $N1, $N1, $N1, $N1, # N2
 781             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2, $N2, # N3
 782
 783             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1, $N2, $N2, # N4
 784             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2,   1, # N5
 785             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1, $N3, $N3, $N3, # N6
 786             1,1,  1,  1,  1,  1,  1,  1,  1,$N3,   1,   1,   1,   1, # N7
 787         );
 788         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
 789     }
 790
 791     print $out_fh get_conditional_compile_line_end();
 792 }
 793
 794 print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
 795
 796 read_only_bottom_close_and_rename($out_fh);