regen/ebcdic.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 use integer;
   5
   6 BEGIN { unshift @INC, '.' }
   7
   8 require './regen/regen_lib.pl';
   9 require './regen/charset_translations.pl';
  10
  11 # Generates the EBCDIC translation tables that were formerly hard-coded into
  12 # utfebcdic.h
  13
  14 my $out_fh = open_new('ebcdic_tables.h', '>',
  15         {style => '*', by => $0, });
  16
  17 sub get_column_headers ($$;$) {
  18     my ($row_hdr_len, $field_width, $dfa_columns) = @_;
  19     my $format;
  20     my $final_column_format;
  21     my $num_columns;
  22
  23     if (defined $dfa_columns) {
  24         $num_columns = $dfa_columns;
  25
  26         # Trailing blank to correspond with commas in the rows below
  27         $format = "%${field_width}d ";
  28     }
  29     else {  # Is a regular table
  30         $num_columns = 16;
  31
  32         # Use blanks to separate the fields
  33         $format = " " x ( $field_width
  34                         - 2);               # For the '_X'
  35         $format .= "_%X ";  # Again, trailing blank over the commas below
  36     }
  37
  38     my $header = "/*" . " " x ($row_hdr_len - length "/*");
  39
  40     # All but the final column
  41     $header .= sprintf($format, $_) for 0 .. $num_columns - 2;
  42
  43      # Get rid of trailing blank, so that the final column takes up one less
  44      # space so that the "*/" doesn't extend past the commas in the rows below
  45     chop $header;
  46     $header .= sprintf $format, $num_columns - 1;
  47
  48     # Again, remove trailing blank
  49     chop $header;
  50
  51     return $header . "*/\n";
  52 }
  53
  54 sub output_table_start($$$;$) {
  55     my ($out_fh, $TYPE, $name, $size) = @_;
  56
  57     $size = "" unless defined $size;
  58     my $declaration = "EXTCONST $TYPE $name\[$size\]";
  59     print $out_fh <<EOF;
  60 #  ifndef DOINIT
  61     $declaration;
  62 #  else
  63     $declaration = {
  64 EOF
  65 }
  66
  67 sub output_table_end($) {
  68     print $out_fh "};\n#  endif\n\n";
  69 }
  70
  71 sub output_table ($$;$) {
  72     my $table_ref = shift;
  73     my $name = shift;
  74
  75     # 0 => print in decimal
  76     # 1 => print in hex (translates code point to code point)
  77     # >= 2 => is a dfa table, like https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  78     #      The number is how many columns in the part after the code point
  79     #      portion.
  80     #
  81     # code point tables in hex areasier to debug, but don't fit into 80
  82     # columns
  83     my $type = shift // 1;
  84
  85     my $print_in_hex = $type == 1;
  86     my $is_dfa = ($type >= 2) ? $type : 0;
  87     my $columns_after_256 = 16;
  88
  89     die "Requres 256 entries in table $name, got @$table_ref"
  90                                 if ! $is_dfa && @$table_ref != 256;
  91     if (! $is_dfa) {
  92         die "Requres 256 entries in table $name, got @$table_ref"
  93                                                         if @$table_ref != 256;
  94     }
  95     else {
  96         $columns_after_256 = $is_dfa;
  97
  98         print $out_fh <<'EOF';
  99
 100 /* The table below is adapted from
 101  *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 102  * See copyright notice at the beginning of this file.
 103  */
 104
 105 EOF
 106     }
 107
 108     # Highest number in the table
 109     my $max_entry = 0;
 110     $max_entry = map { $_ > $max_entry ? $_ : $max_entry } @$table_ref;
 111
 112     # We assume that every table has at least one two digit entry, and none
 113     # are more than three digit.
 114     my $field_width = ($print_in_hex)
 115                       ? 4
 116                       : (($max_entry) > 99 ? 3 : 2);
 117
 118     my $row_hdr_length;
 119     my $node_number_field_width;
 120     my $node_value_field_width;
 121
 122     # dfa tables have a special header for the rows in the transitions part of
 123     # the table.  It is longer than the regular one.
 124     if ($is_dfa) {
 125         my $max_node_number = ($max_entry - 256) / $columns_after_256 - 1;
 126         $node_number_field_width = ($max_node_number > 9) ? 2 : 1;
 127         $node_value_field_width = ($max_node_number * $columns_after_256 > 99)
 128                                   ? 3 : 2;
 129         # The header starts with this template, and adds in the number of
 130         # digits needed to represent the maximum node number and its value
 131         $row_hdr_length = length("/*N=*/")
 132                         + $node_number_field_width
 133                         + $node_value_field_width;
 134     }
 135     else {
 136         $row_hdr_length = length "/*_X*/";  # Template for what the header
 137                                             # looks like
 138     }
 139
 140     # The table may not be representable in 8 bits.
 141     my $TYPE = 'U8';
 142     $TYPE = 'U16' if grep { $_ > 255 } @$table_ref;
 143
 144     output_table_start $out_fh, $TYPE, $name;
 145
 146     # First the headers for the columns
 147     print $out_fh get_column_headers($row_hdr_length, $field_width);
 148
 149     # Now the table body
 150     my $count = @$table_ref;
 151     my $last_was_nl = 1;
 152
 153     # Print each element individually, arranged in rows of columns
 154     for my $i (0 .. $count - 1) {
 155
 156         # Node number for here is -1 until get into the dfa state transitions
 157         my $node = ($i < 256) ? -1 : ($i - 256) / $columns_after_256;
 158
 159         # Print row header at beginning of each row
 160         if ($last_was_nl) {
 161             if ($node >= 0) {
 162                 printf $out_fh "/*N%-*d=%*d*/", $node_number_field_width, $node,
 163                                                $node_value_field_width, $i - 256;
 164             }
 165             else {  # Otherwise is regular row; print its number
 166                 printf $out_fh "/*%X_", $i / 16;
 167
 168                 # These rows in a dfa table require extra space so columns
 169                 # will align vertically (because the Ndd=ddd requires extra
 170                 # space)
 171                 if ($is_dfa) {
 172                     print  $out_fh " " x (  $node_number_field_width
 173                                           + $node_value_field_width);
 174                 }
 175                 print  $out_fh "*/";
 176             }
 177         }
 178
 179         if ($print_in_hex) {
 180             printf $out_fh "0x%02X", $table_ref->[$i];
 181         }
 182         else {
 183             printf $out_fh "%${field_width}d", $table_ref->[$i];
 184         }
 185
 186         print $out_fh ",", if $i < $count -1;   # No comma on final entry
 187
 188         # Add \n if at end of row, which is 16 columns until we get to the
 189         # transitions part
 190         if (   ($node < 0 && $i % 16 == 15)
 191             || ($node >= 0 && ($i -256) % $columns_after_256
 192                                                     == $columns_after_256 - 1))
 193         {
 194             print $out_fh "\n";
 195             $last_was_nl = 1;
 196         }
 197         else {
 198             $last_was_nl = 0;
 199         }
 200     }
 201
 202     # Print column footer
 203     print $out_fh get_column_headers($row_hdr_length, $field_width,
 204                                      ($is_dfa) ? $columns_after_256 : undef);
 205
 206     output_table_end($out_fh);
 207 }
 208
 209 print $out_fh <<'END';
 210
 211 #ifndef PERL_EBCDIC_TABLES_H_   /* Guard against nested #includes */
 212 #define PERL_EBCDIC_TABLES_H_   1
 213
 214 /* This file contains definitions for various tables used in EBCDIC handling.
 215  * More info is in utfebcdic.h
 216  *
 217  * Some of the tables are adapted from
 218  *      https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 219  * which requires this copyright notice:
 220
 221 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 222
 223 Permission is hereby granted, free of charge, to any person obtaining a copy of
 224 this software and associated documentation files (the "Software"), to deal in
 225 the Software without restriction, including without limitation the rights to
 226 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 227 of the Software, and to permit persons to whom the Software is furnished to do
 228 so, subject to the following conditions:
 229
 230 The above copyright notice and this permission notice shall be included in all
 231 copies or substantial portions of the Software.
 232
 233 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 234 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 235 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 236 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 237 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 238 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 239 SOFTWARE.
 240
 241 */
 242 END
 243
 244 my @charsets = get_supported_code_pages();
 245 shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
 246 foreach my $charset (@charsets) {
 247     # we process the whole array several times, make a copy
 248     my @a2e = @{get_a2n($charset)};
 249     my @e2a;
 250
 251     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 252     print $out_fh "\n";
 253
 254     print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
 255     output_table(\@a2e, "PL_a2e");
 256
 257     { # Construct the inverse
 258         for my $i (0 .. 255) {
 259             $e2a[$a2e[$i]] = $i;
 260         }
 261         print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
 262         output_table(\@e2a, "PL_e2a");
 263     }
 264
 265     my @i82utf = @{get_I8_2_utf($charset)};
 266     print $out_fh <<END;
 267 /* (Confusingly named) Index is $charset I8 byte; value is
 268  * $charset UTF-EBCDIC equivalent */
 269 END
 270     output_table(\@i82utf, "PL_utf2e");
 271
 272     { #Construct the inverse
 273         my @utf2i8;
 274         for my $i (0 .. 255) {
 275             $utf2i8[$i82utf[$i]] = $i;
 276         }
 277         print $out_fh <<END;
 278 /* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 279  * $charset I8 equivalent */
 280 END
 281         output_table(\@utf2i8, "PL_e2utf");
 282     }
 283
 284     {
 285         my @utf8skip;
 286
 287         # These are invariants or continuation bytes.
 288         for my $i (0 .. 0xBF) {
 289             $utf8skip[$i82utf[$i]] = 1;
 290         }
 291
 292         # These are start bytes;  The skip is the number of consecutive highest
 293         # order 1-bits (up to 7)
 294         for my $i (0xC0 .. 255) {
 295             my $count;
 296             if ($i == 0b11111111) {
 297                 no warnings 'once';
 298                 $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
 299             }
 300             elsif (($i & 0b11111110) == 0b11111110) {
 301                 $count= 7;
 302             }
 303             elsif (($i & 0b11111100) == 0b11111100) {
 304                 $count= 6;
 305             }
 306             elsif (($i & 0b11111000) == 0b11111000) {
 307                 $count= 5;
 308             }
 309             elsif (($i & 0b11110000) == 0b11110000) {
 310                 $count= 4;
 311             }
 312             elsif (($i & 0b11100000) == 0b11100000) {
 313                 $count= 3;
 314             }
 315             elsif (($i & 0b11000000) == 0b11000000) {
 316                 $count= 2;
 317             }
 318             else {
 319                 die "Something wrong for UTF8SKIP calculation for $i";
 320             }
 321             $utf8skip[$i82utf[$i]] = $count;
 322         }
 323
 324         print $out_fh <<END;
 325 /* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes
 326  * (including for overlongs); 1 for continuation.  Adapted from the shadow
 327  * flags table in tr16.  The entries marked 9 in tr16 are continuation bytes
 328  * and are marked as length 1 here so that we can recover. */
 329 END
 330         output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
 331                                                      # in hex
 332     }
 333
 334     use feature 'unicode_strings';
 335
 336     {
 337         my @lc;
 338         for my $i (0 .. 255) {
 339             $lc[$a2e[$i]] = $a2e[ord lc chr $i];
 340         }
 341         print $out_fh
 342         "/* Index is $charset code point; value is its lowercase equivalent */\n";
 343         output_table(\@lc, "PL_latin1_lc");
 344     }
 345
 346     {
 347         my @uc;
 348         for my $i (0 .. 255) {
 349             my $uc = uc chr $i;
 350             if (length $uc > 1 || ord $uc > 255) {
 351                 $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
 352             }
 353             $uc[$a2e[$i]] = $a2e[ord $uc];
 354         }
 355         print $out_fh <<END;
 356 /* Index is $charset code point; value is its uppercase equivalent.
 357  * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 358  * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
 359 END
 360         output_table(\@uc, "PL_mod_latin1_uc");
 361     }
 362
 363     { # PL_fold
 364         my @ascii_fold;
 365         for my $i (0 .. 255) {  # Initialise to identity map
 366             $ascii_fold[$i] = $i;
 367         }
 368
 369         # Overwrite the entries that aren't identity
 370         for my $chr ('A' .. 'Z') {
 371             $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
 372         }
 373         for my $chr ('a' .. 'z') {
 374             $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
 375         }
 376         print $out_fh <<END;
 377 /* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 378  * is A-Z; all other code points map to themselves */
 379 END
 380         output_table(\@ascii_fold, "PL_fold");
 381     }
 382
 383     {
 384         my @latin1_fold;
 385         for my $i (0 .. 255) {
 386             my $char = chr $i;
 387             my $lc = lc $char;
 388
 389             # lc and uc adequately proxy for fold-case pairs in this 0-255
 390             # range
 391             my $uc = uc $char;
 392             $uc = $char if length $uc > 1 || ord $uc > 255;
 393             if ($lc ne $char) {
 394                 $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
 395             }
 396             elsif ($uc ne $char) {
 397                 $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
 398             }
 399             else {
 400                 $latin1_fold[$a2e[$i]] = $a2e[$i];
 401             }
 402         }
 403         print $out_fh <<END;
 404 /* Index is $charset code point; value is its other fold-pair equivalent
 405  * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 406  * the code point itself */
 407 END
 408         output_table(\@latin1_fold, "PL_fold_latin1");
 409     }
 410
 411     {
 412       # This generates the dfa table for perl extended UTF-8, which accepts
 413       # surrogates, non-characters, and accepts start bytes up through FE
 414       # (start byte FF has to be handled outside this dfa).  The class numbers
 415       # for start bytes are constrained so that they can be used as a shift
 416       # count for masking off the leading one bits
 417       #
 418       # The classes are
 419       #   00-9F           0
 420       #   A0-A1           7   Not legal immediately after start bytes F0 F8 FC
 421       #                       FE
 422       #   A2-A3           8   Not legal immediately after start bytes F0 F8 FC
 423       #   A4-A7           9   Not legal immediately after start bytes F0 F8
 424       #   A8-AF          10   Not legal immediately after start bytes F0
 425       #   B0-BF          11
 426       #   C0-C4           1
 427       #   C5-DF           2
 428       #   E0              1
 429       #   E1-EF           3
 430       #   F0             12
 431       #   F1-F7           4
 432       #   F8             13
 433       #   F9-FB           5
 434       #   FC             14
 435       #   FD              6
 436       #   FE             15
 437       #   FF              1
 438       #
 439       # Here's the I8 for the code points before which overlongs occur:
 440       # U+4000:     \xF0\xB0\xA0\xA0
 441       # U+40000:    \xF8\xA8\xA0\xA0\xA0
 442       # U+400000:   \xFC\xA4\xA0\xA0\xA0\xA0
 443       # U+4000000:  \xFE\xA2\xA0\xA0\xA0\xA0\xA0
 444       #
 445       # The first part of the table maps bytes to character classes to reduce
 446       # the size of the transition table and create bitmasks.
 447       #
 448       # The second part is a transition table that maps a combination of a
 449       # state of the automaton and a character class to a new state.  The
 450       # numbering of the original nodes is retained, but some have been split
 451       # so that there are new nodes.  They mean:
 452       # N0     The initial state, and final accepting one.
 453       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 454       #        immediately when the start byte indicates a two-byte sequence
 455       # N2     Two continuation bytes left.
 456       # N3     Three continuation bytes left.
 457       # N4     Four continuation bytes left.
 458       # N5     Five continuation bytes left.
 459       # N6     Start byte is F0.  Continuation bytes A[0-F] are illegal
 460       #        (overlong); the other continuations transition to N2
 461       # N7     Start byte is F8.  Continuation bytes A[0-7] are illegal
 462       #        (overlong); the other continuations transition to N3
 463       # N8     Start byte is FC.  Continuation bytes A[0-3] are illegal
 464       #        (overlong); the other continuations transition to N4
 465       # N9     Start byte is FE.  Continuation bytes A[01] are illegal
 466       #        (overlong); the other continuations transition to N5
 467       # 1      Reject.  All transitions not mentioned above (except the single
 468       #        byte ones (as they are always legal) are to this state.
 469
 470         my $NUM_CLASSES = 16;
 471         my $N0 = 0;
 472         my $N1 =  $N0 + $NUM_CLASSES;
 473         my $N2 =  $N1 + $NUM_CLASSES;
 474         my $N3 =  $N2 + $NUM_CLASSES;
 475         my $N4 =  $N3 + $NUM_CLASSES;
 476         my $N5 =  $N4 + $NUM_CLASSES;
 477         my $N6 =  $N5 + $NUM_CLASSES;
 478         my $N7 =  $N6 + $NUM_CLASSES;
 479         my $N8 =  $N7 + $NUM_CLASSES;
 480         my $N9 =  $N8 + $NUM_CLASSES;
 481         my $N10 = $N9 + $NUM_CLASSES;
 482
 483         my @perl_extended_utf8_dfa;
 484         my @i8 = (
 485                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 486                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 487                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 488                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 489                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 490                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 491                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 492                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 493                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 494                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 495                   7, 7, 8, 8, 9, 9, 9, 9,10,10,10,10,10,10,10,10, # A0-AF
 496                  11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, # B0-BF
 497                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 498                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 499                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 500                  12, 4, 4, 4, 4, 4, 4, 4,13, 5, 5, 5,14, 6,15, 1, # F0-FF
 501                 );
 502         $perl_extended_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 503         push @perl_extended_utf8_dfa, (
 504           # Class:
 505           # 0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 506             0,  1,$N1,$N2,$N3,$N4,$N5,  1,  1,  1,  1,  1,$N6,$N7,$N8,$N9, # N0
 507             1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  1,  1,  1,  1, # N1
 508             1,  1,  1,  1,  1,  1,  1,$N1,$N1,$N1,$N1,$N1,  1,  1,  1,  1, # N2
 509             1,  1,  1,  1,  1,  1,  1,$N2,$N2,$N2,$N2,$N2,  1,  1,  1,  1, # N3
 510             1,  1,  1,  1,  1,  1,  1,$N3,$N3,$N3,$N3,$N3,  1,  1,  1,  1, # N4
 511             1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,$N4,$N4,  1,  1,  1,  1, # N5
 512
 513             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N2,  1,  1,  1,  1, # N6
 514             1,  1,  1,  1,  1,  1,  1,  1,  1,  1,$N3,$N3,  1,  1,  1,  1, # N7
 515             1,  1,  1,  1,  1,  1,  1,  1,  1,$N4,$N4,$N4,  1,  1,  1,  1, # N8
 516             1,  1,  1,  1,  1,  1,  1,  1,$N5,$N5,$N5,$N5,  1,  1,  1,  1, # N9
 517         );
 518         output_table(\@perl_extended_utf8_dfa, "PL_extended_utf8_dfa_tab",
 519                                                                    $NUM_CLASSES);
 520     }
 521
 522     {
 523       # This generates the dfa table for strict UTF-8, which rejects
 524       # surrogates, non-characters, and above Unicode.
 525       #
 526       # The classes are
 527       #   00-9F           0   Always legal at start
 528       #   A0             10   Not legal immediately after start bytes F0 F8
 529       #   A1             11   Not legal immediately after start bytes F0 F8,
 530       #   A2-A7          12   Not legal immediately after start bytes F0 F8 F9
 531       #   A8,AA,AC       13   Not legal immediately after start bytes F0 F9
 532       #   A9,AB,AD       14   Not legal immediately after start byte F0
 533       #   AE             15   Not legal immediately after start byte F0
 534       #   AF             16   Not legal immediately after start bytes F0
 535       #   B[0248AC]      17   Not legal immediately after start byte F9
 536       #   B[1359D]       18   Not legal immediately after start byte F9
 537       #   B6             19   Not legal immediately after start byte F9
 538       #   B7             20   Not legal immediately after start byte F9
 539       #   BE             21   Not legal immediately after start byte F9
 540       #   BF             22   Not legal immediately after start byte F9
 541       #   C0-C4           1   (reject, all are overlong)
 542       #   C5-DF           2   Accepts any legal continuation
 543       #   E0              1   (reject, all are overlong)
 544       #   E1-EF           3   Accepts any legal continuation
 545       #   F0              8   (has overlongs)
 546       #   F1              6   (has surrogates, non-chars)
 547       #   F2,F4,F6        4   Accepts any legal continuation
 548       #   F3,F5,F7        5   (has non-chars)
 549       #   F8              9   (has overlongs, non-chars)
 550       #   F9              7   (has non-chars, non-Unicode)
 551       #   FA-FF           1   (reject, all are non-Unicode)
 552       #
 553       # Here's the I8 for enough code points so that you can figure out what's
 554       # going on:
 555       #
 556       # U+D800: \xF1\xB6\xA0\xA0
 557       # U+DFFF: \xF1\xB7\xBF\xBF
 558       # U+FDD0: \xF1\xBF\xAE\xB0
 559       # U+FDEF: \xF1\xBF\xAF\xAF
 560       # U+FFFE: \xF1\xBF\xBF\xBE
 561       # U+1FFFE: \xF3\xBF\xBF\xBE
 562       # U+2FFFE: \xF5\xBF\xBF\xBE
 563       # U+3FFFE: \xF7\xBF\xBF\xBE
 564       # U+4FFFE: \xF8\xA9\xBF\xBF\xBE
 565       # U+5FFFE: \xF8\xAB\xBF\xBF\xBE
 566       # U+6FFFE: \xF8\xAD\xBF\xBF\xBE
 567       # U+7FFFE: \xF8\xAF\xBF\xBF\xBE
 568       # U+8FFFE: \xF8\xB1\xBF\xBF\xBE
 569       # U+9FFFE: \xF8\xB3\xBF\xBF\xBE
 570       # U+AFFFE: \xF8\xB5\xBF\xBF\xBE
 571       # U+BFFFE: \xF8\xB7\xBF\xBF\xBE
 572       # U+CFFFE: \xF8\xB9\xBF\xBF\xBE
 573       # U+DFFFE: \xF8\xBB\xBF\xBF\xBE
 574       # U+EFFFE: \xF8\xBD\xBF\xBF\xBE
 575       # U+FFFFE: \xF8\xBF\xBF\xBF\xBE
 576       # U+10FFFE: \xF9\xA1\xBF\xBF\xBE
 577       #
 578       # The first part of the table maps bytes to character classes to reduce
 579       # the size of the transition table and create bitmasks.
 580       #
 581       # The second part is a transition table that maps a combination of a
 582       # state of the automaton and a character class to a new state.  The
 583       # numbering of the original nodes is retained, but some have been split
 584       # so that there are new nodes.  They mean:
 585       # N0     The initial state, and final accepting one.
 586       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 587       #        immediately when the start byte indicates a two-byte sequence
 588       # N2     Two continuation bytes left.
 589       # N3     Three continuation bytes left.
 590       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 591       #        (overlong); the other continuations transition to N2
 592       # N5     Start byte is F1.  Continuation bytes B6 and B7 are illegal
 593       #        (surrogates); BF transitions to N9; the other continuations to
 594       #        N2
 595       # N6     Start byte is F[357].  Continuation byte BF transitions to N12;
 596       #        other continuations to N2
 597       # N5     Start byte is F8.  Continuation bytes A[0-7] are illegal
 598       #        (overlong); continuations A[9BDF] and B[13579BDF] transition to
 599       #        N14; the other continuations to N3
 600       # N8     Start byte is F9.  Continuation byte A0 transitions to N3; A1
 601       #        to N14; the other continuation bytes are illegal.
 602       # N9     Initial sequence is F1 BF.  Continuation byte AE transitions to
 603       #        state N10; AF to N11; BF to N13; the other continuations to N1.
 604       # N10    Initial sequence is F1 BF AE.  Continuation bytes B0-BF are
 605       #        illegal (non-chars); the other continuations are legal
 606       # N11    Initial sequence is F1 BF AF.  Continuation bytes A0-AF are
 607       #        illegal (non-chars); the other continuations are legal
 608       # N12    Initial sequence is F[357] BF.  Continuation bytes BF
 609       #        transitions to N13; the other continuations to N1
 610       # N13    Initial sequence is F[1357] BF BF or F8 x BF (where x is
 611       #        something that can lead to a non-char.  Continuation bytes BE
 612       #        and BF are illegal (non-chars); the other continuations are
 613       #        legal
 614       # N14    Initial sequence is F8 A[9BDF]; or F8 B[13579BDF]; or F9 A1.
 615       #        Continuation byte BF transitions to N13; the other
 616       #        continuations to N2
 617       # 1      Reject.  All transitions not mentioned above (except the single
 618       #        byte ones (as they are always legal) are to this state.
 619
 620         my $NUM_CLASSES = 23;
 621         my $N0 = 0;
 622         my $N1 =  $N0 + $NUM_CLASSES;
 623         my $N2 =  $N1 + $NUM_CLASSES;
 624         my $N3 =  $N2 + $NUM_CLASSES;
 625         my $N4 =  $N3 + $NUM_CLASSES;
 626         my $N5 =  $N4 + $NUM_CLASSES;
 627         my $N6 =  $N5 + $NUM_CLASSES;
 628         my $N7 =  $N6 + $NUM_CLASSES;
 629         my $N8 =  $N7 + $NUM_CLASSES;
 630         my $N9 =  $N8 + $NUM_CLASSES;
 631         my $N10 = $N9 + $NUM_CLASSES;
 632         my $N11 = $N10 + $NUM_CLASSES;
 633         my $N12 = $N11 + $NUM_CLASSES;
 634         my $N13 = $N12 + $NUM_CLASSES;
 635         my $N14 = $N13 + $NUM_CLASSES;
 636
 637         my @strict_utf8_dfa;
 638         my @i8 = (
 639                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 640                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 641                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 642                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 643                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 644                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 645                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 646                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 647                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 648                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 649                  10,11,12,12,12,12,12,12,13,14,13,14,13,14,15,16, # A0-AF
 650                  17,18,17,18,17,18,19,20,17,18,17,18,17,18,21,22, # B0-BF
 651                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 652                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 653                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 654                   8, 6, 4, 5, 4, 5, 4, 5, 9, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 655                 );
 656         $strict_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 657         push @strict_utf8_dfa, (
 658           # Class:
 659           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22
 660             0,1,$N1,$N2,$N3,$N6,$N5,$N8,$N4,$N7,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N0
 661             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, # N1
 662             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, # N2
 663             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, # N3
 664
 665             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1, $N2, $N2, $N2, $N2, $N2, $N2, # N4
 666             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,   1,   1, $N2, $N9, # N5
 667             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N12, # N6
 668             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, $N3,$N14, # N7
 669             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N3,$N14,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1, # N8
 670             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1,$N10,$N11, $N1, $N1, $N1, $N1, $N1,$N13, # N9
 671             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1, # N10
 672             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1,   1,   1,   1,   1,   1,   0,   0,   0,   0,   0,   0, # N11
 673             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1, $N1,$N13, # N12
 674             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1, # N13
 675             1,1,  1,  1,  1,  1,  1,  1,  1,  1, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2, $N2,$N13, # N14
 676         );
 677         output_table(\@strict_utf8_dfa, "PL_strict_utf8_dfa_tab", $NUM_CLASSES);
 678     }
 679
 680     {
 681       # This generates the dfa table for C9 strict UTF-8, which rejects
 682       # surrogates and above Unicode, but allows non-characters,.
 683       #
 684       # The classes are
 685       #   00-9F           0   Always legal at start
 686       #   A0-A1           9   Not legal immediately after start bytes F0 F8
 687       #   A2-A7          10   Not legal immediately after start bytes F0 F8 F9
 688       #   A8-AF          11   Not legal immediately after start bytes F0 F9
 689       #   B0-B5,B8-BF    12   Not legal immediately after start byte F9
 690       #   B6,B7          13
 691       #   C0-C4           1   (reject, all are overlong)
 692       #   C5-DF           2   Accepts any legal continuation
 693       #   E0              1   (reject, all are overlong)
 694       #   E1-EF           3   Accepts any legal continuation
 695       #   F0              6   (has overlongs)
 696       #   F1              5   (has surrogates)
 697       #   F2-F7           4   Accepts any legal continuation
 698       #   F8              8   (has overlongs)
 699       #   F9              7   (has non-Unicode)
 700       #   FA-FF           1   (reject, all are non-Unicode)
 701       #
 702       # The first part of the table maps bytes to character classes to reduce
 703       # the size of the transition table and create bitmasks.
 704       #
 705       # The second part is a transition table that maps a combination of a
 706       # state of the automaton and a character class to a new state.  The
 707       # numbering of the original nodes is retained, but some have been split
 708       # so that there are new nodes.  They mean:
 709       # N0     The initial state, and final accepting one.
 710       # N1     One continuation byte (A0-BF) left.  This is transitioned to
 711       #        immediately when the start byte indicates a two-byte sequence
 712       # N2     Two continuation bytes left.
 713       # N3     Three continuation bytes left.
 714       # N4     Start byte is F0.  Continuation bytes A[0-F] are illegal
 715       #        (overlong); the other continuations transition to N2
 716       # N5     Start byte is F1.  B6 and B7 are illegal (surrogates); the
 717       #        other continuations transition to N2
 718       # N6     Start byte is F8.  Continuation bytes A[0-7] are illegal
 719       #        (overlong); the other continuations transition to N3
 720       # N7     Start byte is F9.  Continuation bytes A0 and A1 transition to
 721       #        N3; the other continuation bytes are illegal (non-Unicode)
 722       # 1      Reject.  All transitions not mentioned above (except the single
 723       #        byte ones (as they are always legal) are to this state.
 724
 725         my $NUM_CLASSES = 14;
 726         my $N0 = 0;
 727         my $N1 =  $N0 + $NUM_CLASSES;
 728         my $N2 =  $N1 + $NUM_CLASSES;
 729         my $N3 =  $N2 + $NUM_CLASSES;
 730         my $N4 =  $N3 + $NUM_CLASSES;
 731         my $N5 =  $N4 + $NUM_CLASSES;
 732         my $N6 =  $N5 + $NUM_CLASSES;
 733         my $N7 =  $N6 + $NUM_CLASSES;
 734
 735         my @C9_utf8_dfa;
 736         my @i8 = (
 737                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0F
 738                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1F
 739                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2F
 740                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3F
 741                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4F
 742                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5F
 743                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6F
 744                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7F
 745                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
 746                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 90-9F
 747                   9, 9,10,10,10,10,10,10,11,11,11,11,11,11,11,11, # A0-AF
 748                  12,12,12,12,12,12,13,13,12,12,12,12,12,12,12,12, # B0-BF
 749                   1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-CF
 750                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
 751                   1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
 752                   6, 5, 4, 4, 4, 4, 4, 4, 8, 7, 1, 1, 1, 1, 1, 1, # F0-FF
 753                 );
 754         $C9_utf8_dfa[$i82utf[$_]] = $i8[$_] for (0 .. 255);
 755         push @C9_utf8_dfa, (
 756           # Class:
 757           # 0 1   2   3   4   5   6   7   8   9   10   11   12   13
 758             0,1,$N1,$N2,$N3,$N5,$N4,$N7,$N6,  1,   1,   1,   1,   1, # N0
 759             1,1,  1,  1,  1,  1,  1,  1,  1,  0,   0,   0,   0,   0, # N1
 760             1,1,  1,  1,  1,  1,  1,  1,  1,$N1, $N1, $N1, $N1, $N1, # N2
 761             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2, $N2, # N3
 762
 763             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1,   1, $N2, $N2, # N4
 764             1,1,  1,  1,  1,  1,  1,  1,  1,$N2, $N2, $N2, $N2,   1, # N5
 765             1,1,  1,  1,  1,  1,  1,  1,  1,  1,   1, $N3, $N3, $N3, # N6
 766             1,1,  1,  1,  1,  1,  1,  1,  1,$N3,   1,   1,   1,   1, # N7
 767         );
 768         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
 769     }
 770
 771     {
 772         print $out_fh <<EOF;
 773 /* This table partitions all the code points of the platform into ranges which
 774  * have the property that all the code points in each range have the same
 775  * number of bytes in their UTF-EBCDIC representations, and the adjacent
 776  * ranges have a different number of bytes.
 777  *
 778  * Each number in the table begins such a range, which extends up to just
 779  * before the following table entry, except the final entry is understood to
 780  * extend to the platform's infinity
 781  */
 782 EOF
 783         # The lengths of the characters between 0 and 255 are either 1 or 2,
 784         # with those whose ASCII platform equivalents below 160 being 1, and
 785         # the rest being 2.
 786         my @list;
 787         push @list, 0;
 788         my $pushed_range_is_length_1 = 1;
 789
 790         for my $i (1 .. 0xFF) {
 791             my $this_code_point_is_length_1 = ($e2a[$i] < 160);
 792             if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
 793                 push @list, $i;
 794                 $pushed_range_is_length_1 = $this_code_point_is_length_1;
 795             }
 796         }
 797
 798         # Starting at 256, the length is 2.
 799         push @list, 0x100 if $pushed_range_is_length_1;
 800
 801         # These are based on the fundamental properties of UTF-EBCDIC.  Each
 802         # continuation byte has 5 bits of information.  Comments in utf8.h
 803         # explain the rest.
 804         my $UTF_ACCUMULATION_SHIFT = 5;
 805         push @list, (32 * (1 << (    $UTF_ACCUMULATION_SHIFT)));
 806         push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
 807         push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
 808         push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
 809         push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
 810         push @list, (     (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
 811
 812         output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
 813         print $out_fh "\t";
 814
 815         print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
 816         print $out_fh "\n";
 817
 818         output_table_end($out_fh);
 819     }
 820
 821     print $out_fh get_conditional_compile_line_end();
 822 }
 823
 824 print $out_fh "\n#endif /* PERL_EBCDIC_TABLES_H_ */\n";
 825
 826 read_only_bottom_close_and_rename($out_fh);