regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 use integer;
   5
   6 BEGIN { unshift @INC, '.' }
   7
   8 require './regen/regen_lib.pl';
   9 require './regen/charset_translations.pl';
  10
  11 # Generates the EBCDIC translation tables that were formerly hard-coded into
  12 # utfebcdic.h
  13
  14 my $out_fh = open_new('ebcdic_tables.h', '>',
  15         {style => '*', by => $0, });
  16
  17 sub get_column_headers ($$;$) {
  18     my ($row_hdr_len, $field_width, $dfa_columns) = @_;
  19     my $format;
  20     my $final_column_format;
  21     my $num_columns;
  22
  23     if (defined $dfa_columns) {
  24         $num_columns = $dfa_columns;
  25
  26         # Trailing blank to correspond with commas in the rows below
  27         $format = "%${field_width}d ";
  28     }
  29     else {  # Is a regular table
  30         $num_columns = 16;
  31
  32         # Use blanks to separate the fields
  33         $format = " " x ( $field_width
  34                         - 2);               # For the '_X'
  35         $format .= "_%X ";  # Again, trailing blank over the commas below
  36     }
  37
  38     my $header = "/*" . " " x ($row_hdr_len - length "/*");
  39
  40     # All but the final column
  41     $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
  42
  43      # Get rid of trailing blank, so that the final column takes up one less
  44      # space so that the "*/" doesn't extend past the commas in the rows below
  45     chop $header;
  46     $header .= sprintf $format, $num_columns - 1;
  47
  48     # Again, remove trailing blank
  49     chop $header;
  50
  51     return $header . "*/\n";
  52 }
  53
  54 sub output_table ($$;$) {
  55     my $table_ref = shift;
  56     my $name = shift;
  57
  58     # 0 => print in decimal
  59     # 1 => print in hex (translates code point to code point)
  60     # >= 2 => is a dfa table, like http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  61     #      The number is how many columns in the part after the code point
  62     #      portion.
  63     #
  64     # code point tables in hex areasier to debug, but don't fit into 80
  65     # columns
  66     my $type = shift // 1;
  67
  68     my $print_in_hex = $type == 1;
  69     my $is_dfa = ($type >= 2) ? $type : 0;
  70     my $columns_after_256 = 16;
  71
  72     die "Requres 256 entries in table $name, got @$table_ref"
  73                                 if ! $is_dfa && @$table_ref != 256;
  74     if (! $is_dfa) {
  75         die "Requres 256 entries in table $name, got @$table_ref"
  76                                                         if @$table_ref != 256;
  77     }
  78     else {
  79         $columns_after_256 = $is_dfa;
  80
  81         print $out_fh <<'EOF';
  82
  83 /* The table below is adapted from
  84  *      http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  85  * See copyright notice at the beginning of this file.
  86  */
  87
  88 EOF
  89     }
  90
  91     # Highest number in the table
  92     my $max_entry = 0;
  93     $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
  94
  95     # We assume that every table has at least one two digit entry, and none
  96     # are more than three digit.
  97     my $field_width = ($print_in_hex)
  98                       ? 4
  99                       : (($max_entry) > 99 ? 3 : 2);
 100
 101     my $row_hdr_length;
 102     my $node_number_field_width;
 103     my $node_value_field_width;
 104
 105     # dfa tables have a special header for the rows in the transitions part of
 106     # the table.  It is longer than the regular one.
 107     if ($is_dfa) {
 108         my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
 109         $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
 110         $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
 111                                   ? 3 : 2;
 112         # The header starts with this template, and adds in the number of
 113         # digits needed to represent the maximum node number and its value
 114         $row_hdr_length = length("/*N=*/")
 115                         + $node_number_field_width
 116                         + $node_value_field_width;
 117     }
 118     else {
 119         $row_hdr_length = length "/*_X*/";  # Template for what the header
 120                                             # looks like
 121     }
 122
 123     # The table may not be representable in 8 bits.
 124     my $TYPE = 'U8';
 125     $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
 126
 127     my $declaration = "EXTCONST $TYPE $name\[\]";
 128     print $out_fh <<EOF;
 129 #  ifndef DOINIT
 130 #    $declaration;
 131 #  else
 132 #    $declaration = {
 133 EOF
 134
 135     # First the headers for the columns
 136     print $out_fh get_column_headers($row_hdr_length, $field_width);
 137
 138     # Now the table body
 139     my $count = @$table_ref;
 140     my $last_was_nl = 1;
 141
 142     # Print each element individually, arranged in rows of columns
 143     for my $i (0 .. $count - 1) {
 144
 145         # Node number for here is -1 until get into the dfa state transitions
 146         my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
 147
 148         # Print row header at beginning of each row
 149         if ($last_was_nl) {
 150             if ($node >= 0) {
 151                 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
 152                                                $node_value_field_width, $i - 256;
 153             }
 154             else {  # Otherwise is regular row; print its number
 155                 printf $out_fh "/*%X_", $i / 16;
 156
 157                 # These rows in a dfa table require extra space so columns
 158                 # will align vertically (because the Ndd=ddd requires extra
 159                 # space)
 160                 if ($is_dfa) {
 161                     print  $out_fh " " x (  $node_number_field_width
 162                                           + $node_value_field_width);
 163                 }
 164                 print  $out_fh "*/";
 165             }
 166         }
 167
 168         if ($print_in_hex) {
 169             printf $out_fh "0x%02X", $table_ref->[$i];
 170         }
 171         else {
 172             printf $out_fh "%${field_width}d", $table_ref->[$i];
 173         }
 174
 175         print $out_fh ",", if $i < $count -1;   # No comma on final entry
 176
 177         # Add \n if at end of row, which is 16 columns until we get to the
 178         # transitions part
 179         if (   ($node < 0 && $i % 16 == 15)
 180             || ($node >= 0 && ($i -256) % $columns_after_256
 181                                                     == $columns_after_256 - 1))
 182         {
 183             print $out_fh "\n";
 184             $last_was_nl = 1;
 185         }
 186         else {
 187             $last_was_nl = 0;
 188         }
 189     }
 190
 191     # Print column footer
 192     print $out_fh get_column_headers($row_hdr_length, $field_width,
 193                                      ($is_dfa) ? $columns_after_256 : undef);
 194
 195     print $out_fh "};\n#  endif\n\n";
 196 }
 197
 198 print $out_fh <<'END';
 199
 200 #ifndef PERL_EBCDIC_TABLES_H_   /* Guard against nested #includes */
 201 #define PERL_EBCDIC_TABLES_H_   1
 202
 203 /* This file contains definitions for various tables used in EBCDIC handling.
 204  * More info is in utfebcdic.h
 205  *
 206  * Some of the tables are adapted from
 207  *      http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 208  * which requires this copyright notice:
 209
 210 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 211
 212 Permission is hereby granted, free of charge, to any person obtaining a copy of
 213 this software and associated documentation files (the "Software"), to deal in
 214 the Software without restriction, including without limitation the rights to
 215 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 216 of the Software, and to permit persons to whom the Software is furnished to do
 217 so, subject to the following conditions:
 218
 219 The above copyright notice and this permission notice shall be included in all
 220 copies or substantial portions of the Software.
 221
 222 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 223 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 224 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 225 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 226 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 227 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 228 SOFTWARE.
 229
 230 */
 231 END
 232
 233 my @charsets = get_supported_code_pages();
 234 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
 235 foreach my $charset (@charsets) {
 236     # we process the whole array several times, make a copy
 237     my @a2e = @{get_a2n($charset)};
 238
 239     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 240     print $out_fh "\n";
 241
 242     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
 243     output_table(\@a2e, "PL_a2e");
 244
 245     { # Construct the inverse
 246         my @e2a;
 247         for my $i (0 .. 255) {
 248             $e2a[$a2e[$i]] = $i;
 249         }
 250         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
 251         output_table(\@e2a, "PL_e2a");
 252     }
 253
 254     my @i82utf = @{get_I8_2_utf($charset)};
 255     print $out_fh <<END;
 256 /* (Confusingly named) Index is $charset I8 byte; value is
 257  * $charset UTF-EBCDIC equivalent */
 258 END
 259     output_table(\@i82utf, "PL_utf2e");
 260
 261     { #Construct the inverse
 262         my @utf2i8;
 263         for my $i (0 .. 255) {
 264             $utf2i8[$i82utf[$i]] = $i;
 265         }
 266         print $out_fh <<END;
 267 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 268  * $charset I8 equivalent */
 269 END
 270         output_table(\@utf2i8, "PL_e2utf");
 271     }
 272
 273     {
 274         my @utf8skip;
 275
 276         # These are invariants or continuation bytes.
 277         for my $i (0 .. 0xBF) {
 278             $utf8skip[$i82utf[$i]] = 1;
 279         }
 280
 281         # These are start bytes;  The skip is the number of consecutive highest
 282         # order 1-bits (up to 7)
 283         for my $i (0xC0 .. 255) {
 284             my $count;
 285             if ($i == 0b11111111) {
 286                 no warnings 'once';
 287                 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
 288             }
 289             elsif (($i & 0b11111110) == 0b11111110) {
 290                 $count= 7;
 291             }
 292             elsif (($i & 0b11111100) == 0b11111100) {
 293                 $count= 6;
 294             }
 295             elsif (($i & 0b11111000) == 0b11111000) {
 296                 $count= 5;
 297             }
 298             elsif (($i & 0b11110000) == 0b11110000) {
 299                 $count= 4;
 300             }
 301             elsif (($i & 0b11100000) == 0b11100000) {
 302                 $count= 3;
 303             }
 304             elsif (($i & 0b11000000) == 0b11000000) {
 305                 $count= 2;
 306             }
 307             else {
 308                 die "Something wrong for UTF8SKIP calculation for $i";
 309             }
 310             $utf8skip[$i82utf[$i]] = $count;
 311         }
 312
 313         print $out_fh <<END;
 314 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
 315  * (including for overlongs); 1 for continuation.  Adapted from the shadow
 316  * flags table in tr16.  The entries marked 9 in tr16 are continuation bytes
 317  * and are marked as length 1 here so that we can recover. */
 318 END
 319         output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
 320                                                      # in hex
 321     }
 322
 323     use feature 'unicode_strings';
 324
 325     {
 326         my @lc;
 327         for my $i (0 .. 255) {
 328             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 329         }
 330         print $out_fh
 331         "/* Index is $charset code point; value is its lowercase equivalent */\n";
 332         output_table(\@lc, "PL_latin1_lc");
 333     }
 334
 335     {
 336         my @uc;
 337         for my $i (0 .. 255) {
 338             my $uc = uc chr $i;
 339             if (length $uc > 1 || ord $uc > 255) {
 340                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 341             }
 342             $uc[$a2e[$i]] = $a2e[ord $uc];
 343         }
 344         print $out_fh <<END;
 345 /* Index is $charset code point; value is its uppercase equivalent.
 346  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 347  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 348 END
 349         output_table(\@uc, "PL_mod_latin1_uc");
 350     }
 351
 352     { # PL_fold
 353         my @ascii_fold;
 354         for my $i (0 .. 255) {  # Initialise to identity map
 355             $ascii_fold[$i] = $i;
 356         }
 357
 358         # Overwrite the entries that aren't identity
 359         for my $chr ('A' .. 'Z') {
 360             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 361         }
 362         for my $chr ('a' .. 'z') {
 363             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 364         }
 365         print $out_fh <<END;
 366 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 367  * is A-Z; all other code points map to themselves */
 368 END
 369         output_table(\@ascii_fold, "PL_fold");
 370     }
 371
 372     {
 373         my @latin1_fold;
 374         for my $i (0 .. 255) {
 375             my $char = chr $i;
 376             my $lc = lc $char;
 377
 378             # lc and uc adequately proxy for fold-case pairs in this 0-255
 379             # range
 380             my $uc = uc $char;
 381             $uc = $char if length $uc > 1 || ord $uc > 255;
 382             if ($lc ne $char) {
 383                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 384             }
 385             elsif ($uc ne $char) {
 386                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 387             }
 388             else {
 389                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 390             }
 391         }
 392         print $out_fh <<END;
 393 /* Index is $charset code point; value is its other fold-pair equivalent
 394  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 395  * the code point itself */
 396 END
 397         output_table(\@latin1_fold, "PL_fold_latin1");
 398     }
 399
 400     {
 401       # This generates the dfa table for perl extended UTF-8, which accepts
 402       # surrogates, non-characters, and accepts start bytes up through FE
 403       # (start byte FF has to be handled outside this dfa).  The class numbers
 404       # for start bytes are constrained so that they can be used as a shift
 405       # count for masking off the leading one bits
 406       #
 407       # The classes are
 408       #   00-9F           0
 409       #   A0-A1           7   Not legal immediately after start bytes F0 F8 FC
 410       #                       FE
 411       #   A2-A3           8   Not legal immediately after start bytes F0 F8 FC
 412       #   A4-A7           9   Not legal immediately after start bytes F0 F8
 413       #   A8-AF          10   Not legal immediately after start bytes F0
 414       #   B0-BF          11
 415       #   C0-C4           1
 416       #   C5-DF           2
 417       #   E0              1
 418       #   E1-EF           3
 419       #   F0             12
 420       #   F1-F7           4
 421       #   F8             13
 422       #   F9-FB           5
 423       #   FC             14
 424       #   FD              6
 425       #   FE             15
 426       #   FF              1
 427       #
 428       # Here's the I8 for the code points before which overlongs occur:
 429       # U+4000:     \xF0\xB0\xA0\xA0
 430       # U+40000:    \xF8\xA8\xA0\xA0\xA0
 431       # U+400000:   \xFC\xA4\xA0\xA0\xA0\xA0
 432       # U+4000000:  \xFE\xA2\xA0\xA0\xA0\xA0\xA0
 433       #
 434       # The first part of the table maps bytes to character classes to reduce
 435       # the size of the transition table and create bitmasks.
 436       #
 437       # The second part is a transition table that maps a combination of a
 438       # state of the automaton and a character class to a new state.  The
 439       # numbering of the original nodes is retained, but some have been split
 440       # so that there are new nodes.  They mean:
 441       # N0     The initial state, and final accepting one.
 442       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 443       #        immediately when the start byte indicates a two-byte sequence
 444       # N2     Two continuation bytes left.
 445       # N3     Three continuation bytes left.
 446       # N4     Four continuation bytes left.
 447       # N5     Five continuation bytes left.
 448       # N6     Start byte is F0.  Continuation bytes A[0-F] are illegal
 449       #        (overlong); the other continuations transition to N2
 450       # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
 451       #        (overlong); the other continuations transition to N3
 452       # N8     Start byte is FC.  Continuation bytes A[0-3] are illegal
 453       #        (overlong); the other continuations transition to N4
 454       # N9     Start byte is FE.  Continuation bytes A[01] are illegal
 455       #        (overlong); the other continuations transition to N5
 456       # 1      Reject.  All transitions not mentioned above (except the single
 457       #        byte ones (as they are always legal) are to this state.
 458
 459         my $NUM_CLASSES = 16;
 460         my $N0 = 0;
 461         my $N1 =  $N0 + $NUM_CLASSES;
 462         my $N2 =  $N1 + $NUM_CLASSES;
 463         my $N3 =  $N2 + $NUM_CLASSES;
 464         my $N4 =  $N3 + $NUM_CLASSES;
 465         my $N5 =  $N4 + $NUM_CLASSES;
 466         my $N6 =  $N5 + $NUM_CLASSES;
 467         my $N7 =  $N6 + $NUM_CLASSES;
 468         my $N8 =  $N7 + $NUM_CLASSES;
 469         my $N9 =  $N8 + $NUM_CLASSES;
 470         my $N10 = $N9 + $NUM_CLASSES;
 471
 472         my @perl_extended_utf8_dfa;
 473         my @i8 = (
 474                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 475                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 476                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 477                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 478                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 479                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 480                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 481                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 482                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 483                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 484                   7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
 485                  11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
 486                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 487                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 488                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 489                  12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
 490                 );
 491         $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 492         push @perl_extended_utf8_dfa, (
 493           # Class:
 494           # 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 495             0,  1,$N1,$N2,$N3,$N4,$N5,  1,  1,  1,  1,  1,$N6,$N7,$N8,$N9, # N0
 496             1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1, # N1
 497             1,  1,  1,  1,  1,  1,  1,$N1,$N1,$N1,$N1,$N1,  1,  1,  1,  1, # N2
 498             1,  1,  1,  1,  1,  1,  1,$N2,$N2,$N2,$N2,$N2,  1,  1,  1,  1, # N3
 499             1,  1,  1,  1,  1,  1,  1,$N3,$N3,$N3,$N3,$N3,  1,  1,  1,  1, # N4
 500             1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,$N4,$N4,  1,  1,  1,  1, # N5
 501
 502             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N2,  1,  1,  1,  1, # N6
 503             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N3,$N3,  1,  1,  1,  1, # N7
 504             1,  1,  1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,  1,  1,  1,  1, # N8
 505             1,  1,  1,  1,  1,  1,  1,  1,$N5,$N5,$N5,$N5,  1,  1,  1,  1, # N9
 506         );
 507         output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
 508                                                                    $NUM_CLASSES);
 509     }
 510
 511     {
 512       # This generates the dfa table for strict UTF-8, which rejects
 513       # surrogates, non-characters, and above Unicode.
 514       #
 515       # The classes are
 516       #   00-9F           0   Always legal at start
 517       #   A0             10   Not legal immediately after start bytes F0 F8
 518       #   A1             11   Not legal immediately after start bytes F0 F8,
 519       #   A2-A7          12   Not legal immediately after start bytes F0 F8 F9
 520       #   A8,AA,AC       13   Not legal immediately after start bytes F0 F9
 521       #   A9,AB,AD       14   Not legal immediately after start byte F0
 522       #   AE             15   Not legal immediately after start byte F0
 523       #   AF             16   Not legal immediately after start bytes F0
 524       #   B[0248AC]      17   Not legal immediately after start byte F9
 525       #   B[1359D]       18   Not legal immediately after start byte F9
 526       #   B6             19   Not legal immediately after start byte F9
 527       #   B7             20   Not legal immediately after start byte F9
 528       #   BE             21   Not legal immediately after start byte F9
 529       #   BF             22   Not legal immediately after start byte F9
 530       #   C0-C4           1   (reject, all are overlong)
 531       #   C5-DF           2   Accepts any legal continuation
 532       #   E0              1   (reject, all are overlong)
 533       #   E1-EF           3   Accepts any legal continuation
 534       #   F0              8   (has overlongs)
 535       #   F1              6   (has surrogates, non-chars)
 536       #   F2,F4,F6        4   Accepts any legal continuation
 537       #   F3,F5,F7        5   (has non-chars)
 538       #   F8              9   (has overlongs, non-chars)
 539       #   F9              7   (has non-chars, non-Unicode)
 540       #   FA-FF           1   (reject, all are non-Unicode)
 541       #
 542       # Here's the I8 for enough code points so that you can figure out what's
 543       # going on:
 544       #
 545       # U+D800: \xF1\xB6\xA0\xA0
 546       # U+DFFF: \xF1\xB7\xBF\xBF
 547       # U+FDD0: \xF1\xBF\xAE\xB0
 548       # U+FDEF: \xF1\xBF\xAF\xAF
 549       # U+FFFE: \xF1\xBF\xBF\xBE
 550       # U+1FFFE: \xF3\xBF\xBF\xBE
 551       # U+2FFFE: \xF5\xBF\xBF\xBE
 552       # U+3FFFE: \xF7\xBF\xBF\xBE
 553       # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
 554       # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
 555       # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
 556       # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
 557       # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
 558       # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
 559       # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
 560       # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
 561       # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
 562       # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
 563       # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
 564       # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
 565       # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
 566       #
 567       # The first part of the table maps bytes to character classes to reduce
 568       # the size of the transition table and create bitmasks.
 569       #
 570       # The second part is a transition table that maps a combination of a
 571       # state of the automaton and a character class to a new state.  The
 572       # numbering of the original nodes is retained, but some have been split
 573       # so that there are new nodes.  They mean:
 574       # N0     The initial state, and final accepting one.
 575       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 576       #        immediately when the start byte indicates a two-byte sequence
 577       # N2     Two continuation bytes left.
 578       # N3     Three continuation bytes left.
 579       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 580       #        (overlong); the other continuations transition to N2
 581       # N5     Start byte is F1.  Continuation bytes B6 and B7 are illegal
 582       #        (surrogates); BF transitions to N9; the other continuations to
 583       #        N2
 584       # N6     Start byte is F[357].  Continuation byte BF transitions to N12;
 585       #        other continuations to N2
 586       # N5     Start byte is F8.  Continuation bytes A[0-7] are illegal
 587       #        (overlong); continuations A[9BDF] and B[13579BDF] transition to
 588       #        N14; the other continuations to N3
 589       # N8     Start byte is F9.  Continuation byte A0 transitions to N3; A1
 590       #        to N14; the other continuation bytes are illegal.
 591       # N9     Initial sequence is F1 BF.  Continuation byte AE transitions to
 592       #        state N10; AF to N11; BF to N13; the other continuations to N1.
 593       # N10    Initial sequence is F1 BF AE.  Continuation bytes B0-BF are
 594       #        illegal (non-chars); the other continuations are legal
 595       # N11    Initial sequence is F1 BF AF.  Continuation bytes A0-AF are
 596       #        illegal (non-chars); the other continuations are legal
 597       # N12    Initial sequence is F[357] BF.  Continuation bytes BF
 598       #        transitions to N13; the other continuations to N1
 599       # N13    Initial sequence is F[1357] BF BF or F8 x BF (where x is
 600       #        something that can lead to a non-char.  Continuation bytes BE
 601       #        and BF are illegal (non-chars); the other continuations are
 602       #        legal
 603       # N14    Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
 604       #        Continuation byte BF transitions to N13; the other
 605       #        continuations to N2
 606       # 1      Reject.  All transitions not mentioned above (except the single
 607       #        byte ones (as they are always legal) are to this state.
 608
 609         my $NUM_CLASSES = 23;
 610         my $N0 = 0;
 611         my $N1 =  $N0 + $NUM_CLASSES;
 612         my $N2 =  $N1 + $NUM_CLASSES;
 613         my $N3 =  $N2 + $NUM_CLASSES;
 614         my $N4 =  $N3 + $NUM_CLASSES;
 615         my $N5 =  $N4 + $NUM_CLASSES;
 616         my $N6 =  $N5 + $NUM_CLASSES;
 617         my $N7 =  $N6 + $NUM_CLASSES;
 618         my $N8 =  $N7 + $NUM_CLASSES;
 619         my $N9 =  $N8 + $NUM_CLASSES;
 620         my $N10 = $N9 + $NUM_CLASSES;
 621         my $N11 = $N10 + $NUM_CLASSES;
 622         my $N12 = $N11 + $NUM_CLASSES;
 623         my $N13 = $N12 + $NUM_CLASSES;
 624         my $N14 = $N13 + $NUM_CLASSES;
 625
 626         my @strict_utf8_dfa;
 627         my @i8 = (
 628                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 629                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 630                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 631                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 632                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 633                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 634                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 635                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 636                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 637                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 638                  10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
 639                  17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
 640                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 641                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 642                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 643                   8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 644                 );
 645         $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 646         push @strict_utf8_dfa, (
 647           # Class:
 648           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22
 649             0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N0
 650             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, # N1
 651             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
 652             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
 653
 654             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
 655             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,   1,   1, $N2, $N9, # N5
 656             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
 657             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
 658             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N3,$N14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N8
 659             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
 660             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1, # N10
 661             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0, # N11
 662             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
 663             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1, # N13
 664             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
 665         );
 666         output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
 667     }
 668
 669     {
 670       # This generates the dfa table for C9 strict UTF-8, which rejects
 671       # surrogates and above Unicode, but allows non-characters,.
 672       #
 673       # The classes are
 674       #   00-9F           0   Always legal at start
 675       #   A0-A1           9   Not legal immediately after start bytes F0 F8
 676       #   A2-A7          10   Not legal immediately after start bytes F0 F8 F9
 677       #   A8-AF          11   Not legal immediately after start bytes F0 F9
 678       #   B0-B5,B8-BF    12   Not legal immediately after start byte F9
 679       #   B6,B7          13
 680       #   C0-C4           1   (reject, all are overlong)
 681       #   C5-DF           2   Accepts any legal continuation
 682       #   E0              1   (reject, all are overlong)
 683       #   E1-EF           3   Accepts any legal continuation
 684       #   F0              6   (has overlongs)
 685       #   F1              5   (has surrogates)
 686       #   F2-F7           4   Accepts any legal continuation
 687       #   F8              8   (has overlongs)
 688       #   F9              7   (has non-Unicode)
 689       #   FA-FF           1   (reject, all are non-Unicode)
 690       #
 691       # The first part of the table maps bytes to character classes to reduce
 692       # the size of the transition table and create bitmasks.
 693       #
 694       # The second part is a transition table that maps a combination of a
 695       # state of the automaton and a character class to a new state.  The
 696       # numbering of the original nodes is retained, but some have been split
 697       # so that there are new nodes.  They mean:
 698       # N0     The initial state, and final accepting one.
 699       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 700       #        immediately when the start byte indicates a two-byte sequence
 701       # N2     Two continuation bytes left.
 702       # N3     Three continuation bytes left.
 703       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 704       #        (overlong); the other continuations transition to N2
 705       # N5     Start byte is F1.  B6 and B7 are illegal (surrogates); the
 706       #        other continuations transition to N2
 707       # N6     Start byte is F8.  Continuation bytes A[0-7] are illegal
 708       #        (overlong); the other continuations transition to N3
 709       # N7     Start byte is F9.  Continuation bytes A0 and A1 transition to
 710       #        N3; the other continuation bytes are illegal (non-Unicode)
 711       # 1      Reject.  All transitions not mentioned above (except the single
 712       #        byte ones (as they are always legal) are to this state.
 713
 714         my $NUM_CLASSES = 14;
 715         my $N0 = 0;
 716         my $N1 =  $N0 + $NUM_CLASSES;
 717         my $N2 =  $N1 + $NUM_CLASSES;
 718         my $N3 =  $N2 + $NUM_CLASSES;
 719         my $N4 =  $N3 + $NUM_CLASSES;
 720         my $N5 =  $N4 + $NUM_CLASSES;
 721         my $N6 =  $N5 + $NUM_CLASSES;
 722         my $N7 =  $N6 + $NUM_CLASSES;
 723
 724         my @C9_utf8_dfa;
 725         my @i8 = (
 726                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 727                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 728                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 729                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 730                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 731                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 732                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 733                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 734                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 735                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 736                   9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
 737                  12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
 738                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 739                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 740                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 741                   6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 742                 );
 743         $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 744         push @C9_utf8_dfa, (
 745           # Class:
 746           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13
 747             0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6,  1,   1,   1,   1,   1, # N0
 748             1,1,  1,  1,  1,  1,  1,  1,  1,  0,   0,   0,   0,   0, # N1
 749             1,1,  1,  1,  1,  1,  1,  1,  1,$N1, $N1, $N1, $N1, $N1, # N2
 750             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2, $N2, # N3
 751
 752             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1, $N2, $N2, # N4
 753             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2,   1, # N5
 754             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1, $N3, $N3, $N3, # N6
 755             1,1,  1,  1,  1,  1,  1,  1,  1,$N3,   1,   1,   1,   1, # N7
 756         );
 757         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
 758     }
 759
 760     print $out_fh get_conditional_compile_line_end();
 761 }
 762
 763 print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
 764
 765 read_only_bottom_close_and_rename($out_fh);