regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_aliases
   6                     prop_values
   7                     prop_value_aliases
   8                     prop_invlist
   9                     prop_invmap search_invlist
  10                    );
  11 require './regen/regen_lib.pl';
  12 require './regen/charset_translations.pl';
  13
  14 # This program outputs charclass_invlists.h, which contains various inversion
  15 # lists in the form of C arrays that are to be used as-is for inversion lists.
  16 # Thus, the lists it contains are essentially pre-compiled, and need only a
  17 # light-weight fast wrapper to make them usable at run-time.
  18
  19 # As such, this code knows about the internal structure of these lists, and
  20 # any change made to that has to be done here as well.  A random number stored
  21 # in the headers is used to minimize the possibility of things getting
  22 # out-of-sync, or the wrong data structure being passed.  Currently that
  23 # random number is:
  24
  25 # charclass_invlists.h now also has a partial implementation of inversion
  26 # maps; enough to generate tables for the line break properties, such as GCB
  27
  28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  29
  30 # integer or float
  31 my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
  32
  33 # Matches valid C language enum names: begins with ASCII alphabetic, then any
  34 # ASCII \w
  35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
  36
  37 my $out_fh = open_new('charclass_invlists.h', '>',
  38                       {style => '*', by => $0,
  39                       from => "Unicode::UCD"});
  40
  41 my $in_file_pound_if = 0;
  42
  43 my $max_hdr_len = 3;    # In headings, how wide a name is allowed?
  44
  45 print $out_fh "/* See the generating file for comments */\n\n";
  46
  47 # The symbols generated by this program are all currently defined only in a
  48 # single dot c each.  The code knows where most of them go, but this hash
  49 # gives overrides for the exceptions to the typical place
  50 my %exceptions_to_where_to_define =
  51                         ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
  52                           AboveLatin1                => 'PERL_IN_REGCOMP_C',
  53                           Latin1                     => 'PERL_IN_REGCOMP_C',
  54                           UpperLatin1                => 'PERL_IN_REGCOMP_C',
  55                           _Perl_Any_Folds            => 'PERL_IN_REGCOMP_C',
  56                           _Perl_Folds_To_Multi_Char  => 'PERL_IN_REGCOMP_C',
  57                           _Perl_IDCont               => 'PERL_IN_UTF8_C',
  58                           _Perl_IDStart              => 'PERL_IN_UTF8_C',
  59                         );
  60
  61 # This hash contains the properties with enums that have hard-coded references
  62 # to them in C code.  It is neeed to make sure that if perl is compiled
  63 # with an older Unicode data set, that all the enum values the code is
  64 # expecting will still be in the enum typedef.  Thus the code doesn't have to
  65 # change.  The Unicode version won't have any code points that have the enum
  66 # values not in that version, so the code that handles them will not get
  67 # exercised.  This is far better than having to #ifdef things.  The names here
  68 # should be the long names of the respective property values.  The reason for
  69 # this is because regexec.c uses them as case labels, and the long name is
  70 # generally more understandable than the short.
  71 my %hard_coded_enums =
  72  ( gcb => [
  73             'Control',
  74             'CR',
  75             'E_Base',
  76             'E_Base_GAZ',
  77             'E_Modifier',
  78             'Extend',
  79             'Glue_After_Zwj',
  80             'L',
  81             'LF',
  82             'LV',
  83             'LVT',
  84             'Other',
  85             'Prepend',
  86             'Regional_Indicator',
  87             'SpacingMark',
  88             'T',
  89             'V',
  90             'ZWJ',
  91         ],
  92     lb => [
  93             'Alphabetic',
  94             'Break_After',
  95             'Break_Before',
  96             'Break_Both',
  97             'Break_Symbols',
  98             'Carriage_Return',
  99             'Close_Parenthesis',
 100             'Close_Punctuation',
 101             'Combining_Mark',
 102             'Contingent_Break',
 103             'E_Base',
 104             'E_Modifier',
 105             'Exclamation',
 106             'Glue',
 107             'H2',
 108             'H3',
 109             'Hebrew_Letter',
 110             'Hyphen',
 111             'Ideographic',
 112             'Infix_Numeric',
 113             'Inseparable',
 114             'JL',
 115             'JT',
 116             'JV',
 117             'Line_Feed',
 118             'Mandatory_Break',
 119             'Next_Line',
 120             'Nonstarter',
 121             'Numeric',
 122             'Open_Punctuation',
 123             'Postfix_Numeric',
 124             'Prefix_Numeric',
 125             'Quotation',
 126             'Regional_Indicator',
 127             'Space',
 128             'Word_Joiner',
 129             'ZWJ',
 130             'ZWSpace',
 131         ],
 132    sb  => [
 133             'ATerm',
 134             'Close',
 135             'CR',
 136             'Extend',
 137             'Format',
 138             'LF',
 139             'Lower',
 140             'Numeric',
 141             'OLetter',
 142             'Other',
 143             'SContinue',
 144             'Sep',
 145             'Sp',
 146             'STerm',
 147             'Upper',
 148         ],
 149    wb  => [
 150             'ALetter',
 151             'CR',
 152             'Double_Quote',
 153             'E_Base',
 154             'E_Base_GAZ',
 155             'E_Modifier',
 156             'Extend',
 157             'ExtendNumLet',
 158             'Format',
 159             'Glue_After_Zwj',
 160             'Hebrew_Letter',
 161             'Katakana',
 162             'LF',
 163             'MidLetter',
 164             'MidNum',
 165             'MidNumLet',
 166             'Newline',
 167             'Numeric',
 168             'Other',
 169             'Perl_Tailored_HSpace',
 170             'Regional_Indicator',
 171             'Single_Quote',
 172             'ZWJ',
 173         ],
 174 );
 175
 176 my %gcb_enums;
 177 my @gcb_short_enums;
 178 my %gcb_abbreviations;
 179 my %lb_enums;
 180 my @lb_short_enums;
 181 my %lb_abbreviations;
 182 my %wb_enums;
 183 my @wb_short_enums;
 184 my %wb_abbreviations;
 185
 186 my @a2n;
 187
 188 sub uniques {
 189     # Returns non-duplicated input values.  From "Perl Best Practices:
 190     # Encapsulated Cleverness".  p. 455 in first edition.
 191
 192     my %seen;
 193     return grep { ! $seen{$_}++ } @_;
 194 }
 195
 196 sub a2n($) {
 197     my $cp = shift;
 198
 199     # Returns the input Unicode code point translated to native.
 200
 201     return $cp if $cp !~ $numeric_re || $cp > 255;
 202     return $a2n[$cp];
 203 }
 204
 205 sub end_file_pound_if {
 206     if ($in_file_pound_if) {
 207         print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
 208         $in_file_pound_if = 0;
 209     }
 210 }
 211
 212 sub switch_pound_if ($$) {
 213     my $name = shift;
 214     my $new_pound_if = shift;
 215
 216     # Switch to new #if given by the 2nd argument.  If there is an override
 217     # for this, it instead switches to that.  The 1st argument is the
 218     # static's name, used to look up the overrides
 219
 220     if (exists $exceptions_to_where_to_define{$name}) {
 221         $new_pound_if = $exceptions_to_where_to_define{$name};
 222     }
 223
 224     # Exit current #if if the new one is different from the old
 225     if ($in_file_pound_if
 226         && $in_file_pound_if !~ /$new_pound_if/)
 227     {
 228         end_file_pound_if;
 229     }
 230
 231     # Enter new #if, if not already in it.
 232     if (! $in_file_pound_if) {
 233         $in_file_pound_if = "defined($new_pound_if)";
 234         print $out_fh "\n#if $in_file_pound_if\n";
 235     }
 236 }
 237
 238 sub output_invlist ($$;$) {
 239     my $name = shift;
 240     my $invlist = shift;     # Reference to inversion list array
 241     my $charset = shift // "";  # name of character set for comment
 242
 243     die "No inversion list for $name" unless defined $invlist
 244                                              && ref $invlist eq 'ARRAY';
 245
 246     # Output the inversion list $invlist using the name $name for it.
 247     # It is output in the exact internal form for inversion lists.
 248
 249     # Is the last element of the header 0, or 1 ?
 250     my $zero_or_one = 0;
 251     if (@$invlist && $invlist->[0] != 0) {
 252         unshift @$invlist, 0;
 253         $zero_or_one = 1;
 254     }
 255     my $count = @$invlist;
 256
 257     switch_pound_if ($name, 'PERL_IN_PERL_C');
 258
 259     print $out_fh "\nstatic const UV ${name}_invlist[] = {";
 260     print $out_fh " /* for $charset */" if $charset;
 261     print $out_fh "\n";
 262
 263     print $out_fh "\t$count,\t/* Number of elements */\n";
 264     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
 265     print $out_fh "\t", $zero_or_one,
 266                   ",\t/* 0 if the list starts at 0;",
 267                   "\n\t\t   1 if it starts at the element beyond 0 */\n";
 268
 269     # The main body are the UVs passed in to this routine.  Do the final
 270     # element separately
 271     for my $i (0 .. @$invlist - 1) {
 272         printf $out_fh "\t0x%X", $invlist->[$i];
 273         print $out_fh "," if $i < @$invlist - 1;
 274         print $out_fh "\n";
 275     }
 276
 277     print $out_fh "};\n";
 278 }
 279
 280 sub output_invmap ($$$$$$$) {
 281     my $name = shift;
 282     my $invmap = shift;     # Reference to inversion map array
 283     my $prop_name = shift;
 284     my $input_format = shift;   # The inversion map's format
 285     my $default = shift;        # The property value for code points who
 286                                 # otherwise don't have a value specified.
 287     my $extra_enums = shift;    # comma-separated list of our additions to the
 288                                 # property's standard possible values
 289     my $charset = shift // "";  # name of character set for comment
 290
 291     # Output the inversion map $invmap for property $prop_name, but use $name
 292     # as the actual data structure's name.
 293
 294     my $count = @$invmap;
 295
 296     my $output_format;
 297     my $declaration_type;
 298     my %enums;
 299     my $name_prefix;
 300
 301     if ($input_format eq 's') {
 302         my $orig_prop_name = $prop_name;
 303         $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
 304         my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
 305             my @enums;
 306             if ($orig_prop_name eq $prop_name) {
 307                 @enums = prop_values($prop_name);
 308             }
 309             else {
 310                 @enums = uniques(@$invmap);
 311             }
 312
 313             if (! @enums) {
 314                 die "Only enum properties are currently handled; '$prop_name' isn't one";
 315             }
 316             else {
 317                 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
 318                 my @canonical_input_enums;
 319                 if (@expected_enums) {
 320                     if (@expected_enums < @enums) {
 321                         die 'You need to update %hard_coded_enums to reflect new'
 322                         . " entries in this Unicode version\n"
 323                         . "Expected: " . join(", ", sort @expected_enums) . "\n"
 324                         . "     Got: " . join(", ", sort @enums);
 325                     }
 326
 327                     if (! defined prop_aliases($prop_name)) {
 328
 329                         # Convert the input enums into canonical form and
 330                         # save for use below
 331                         @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
 332                                                                      @enums;
 333                     }
 334                     @enums = sort @expected_enums;
 335                 }
 336
 337                 # The internal enums come last, and in the order specified
 338                 my @extras;
 339                 if ($extra_enums ne "") {
 340                     @extras = split /,/, $extra_enums;
 341                     push @enums, @extras;
 342                 }
 343
 344                 # Assign a value to each element of the enum.  The default
 345                 # value always gets 0; the others are arbitrarily assigned.
 346                 my $enum_val = 0;
 347                 my $canonical_default = prop_value_aliases($prop_name, $default);
 348                 $default = $canonical_default if defined $canonical_default;
 349                 $enums{$default} = $enum_val++;
 350                 for my $enum (@enums) {
 351                     $enums{$enum} = $enum_val++ unless exists $enums{$enum};
 352                 }
 353
 354                 # Calculate the enum values for certain properties like
 355                 # _Perl_GCB and _Perl_LB, because we output special tables for
 356                 # them.
 357                 if ($name =~ / ^  _Perl_ (?: GCB | LB | WB ) $ /x) {
 358
 359                     # We use string evals to allow the same code to work on
 360                     # all tables we're doing.
 361                     my $type = lc $prop_name;
 362
 363                     # We use lowercase single letter names for any property
 364                     # values not in the release of Unicode being compiled now.
 365                     my $placeholder = "a";
 366
 367                     # Skip if we've already done this code, which populated
 368                     # this hash
 369                     if (eval "! \%${type}_enums") {
 370
 371                         # For each enum ...
 372                         foreach my $enum (sort keys %enums) {
 373                             my $value = $enums{$enum};
 374                             my $short;
 375                             my $abbreviated_from;
 376
 377                             # Special case this wb property value to make the
 378                             # name more clear
 379                             if ($enum eq 'Perl_Tailored_HSpace') {
 380                                 $short = 'hs';
 381                                 $abbreviated_from = $enum;
 382                             }
 383                             elsif (grep { $_ eq $enum } @extras) {
 384
 385                                 # The 'short' name for one of the property
 386                                 # values added by this file is just the
 387                                 # lowercase of it
 388                                 $short = lc $enum;
 389                             }
 390                             elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
 391                                                         @canonical_input_enums)
 392                             {   # On Unicode versions that predate the
 393                                 # official property, we have set up this array
 394                                 # to be the canonical form of each enum in the
 395                                 # substitute property.  If the enum we're
 396                                 # looking at is canonically the same as one of
 397                                 # these, use its name instead of generating a
 398                                 # placeholder one in the next clause (which
 399                                 # will happen because prop_value_aliases()
 400                                 # will fail because it only works on official
 401                                 # properties)
 402                                 $short = $enum;
 403                             }
 404                             else {
 405                                 # Use the official short name for the other
 406                                 # property values, which should all be
 407                                 # official ones.
 408                                 ($short) = prop_value_aliases($type, $enum);
 409
 410                                 # But create a placeholder for ones not in
 411                                 # this Unicode version.
 412                                 $short = $placeholder++ unless defined $short;
 413                             }
 414
 415                             # If our short name is too long, or we already
 416                             # know that the name is an abbreviation, truncate
 417                             # to make sure it's short enough, and remember
 418                             # that we did this so we can later place in a
 419                             # comment in the generated file
 420                             if (   $abbreviated_from
 421                                 || length $short > $max_hdr_len)
 422                                 {
 423                                 $short = substr($short, 0, $max_hdr_len);
 424                                 $abbreviated_from = $enum
 425                                                     unless $abbreviated_from;
 426                                 # If the name we are to display conflicts, try
 427                                 # another.
 428                                 while (eval "exists
 429                                              \$${type}_abbreviations{$short}")
 430                                 {
 431                                     die $@ if $@;
 432                                     $short++;
 433                                 }
 434
 435                                 eval "\$${type}_abbreviations{$short} = '$enum'";
 436                                 die $@ if $@;
 437                             }
 438
 439                             # Remember the mapping from the property value
 440                             # (enum) name to its value.
 441                             eval "\$${type}_enums{$enum} = $value";
 442                             die $@ if $@;
 443
 444                             # Remember the inverse mapping to the short name
 445                             # so that we can properly label the generated
 446                             # table's rows and columns
 447                             eval "\$${type}_short_enums[$value] = '$short'";
 448                             die $@ if $@;
 449                         }
 450                     }
 451                 }
 452             }
 453
 454             # Inversion map stuff is currently used only by regexec
 455             switch_pound_if($name, 'PERL_IN_REGEXEC_C');
 456         {
 457
 458             # The short names tend to be two lower case letters, but it looks
 459             # better for those if they are upper. XXX
 460             $short_name = uc($short_name) if length($short_name) < 3
 461                                              || substr($short_name, 0, 1) =~ /[[:lower:]]/;
 462             $name_prefix = "${short_name}_";
 463             my $enum_count = keys %enums;
 464             print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
 465
 466             print $out_fh "\ntypedef enum {\n";
 467             my @enum_list;
 468             foreach my $enum (keys %enums) {
 469                 $enum_list[$enums{$enum}] = $enum;
 470             }
 471             foreach my $i (0 .. @enum_list - 1) {
 472                 my $name = $enum_list[$i];
 473                 print $out_fh  "\t${name_prefix}$name = $i";
 474                 print $out_fh "," if $i < $enum_count - 1;
 475                 print $out_fh "\n";
 476             }
 477             $declaration_type = "${name_prefix}enum";
 478             print $out_fh "} $declaration_type;\n";
 479
 480             $output_format = "${name_prefix}%s";
 481         }
 482     }
 483     else {
 484         die "'$input_format' invmap() format for '$prop_name' unimplemented";
 485     }
 486
 487     die "No inversion map for $prop_name" unless defined $invmap
 488                                              && ref $invmap eq 'ARRAY'
 489                                              && $count;
 490
 491     print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
 492     print $out_fh " /* for $charset */" if $charset;
 493     print $out_fh "\n";
 494
 495     # The main body are the scalars passed in to this routine.
 496     for my $i (0 .. $count - 1) {
 497         my $element = $invmap->[$i];
 498         my $full_element_name = prop_value_aliases($prop_name, $element);
 499         $element = $full_element_name if defined $full_element_name;
 500         $element = $name_prefix . $element;
 501         print $out_fh "\t$element";
 502         print $out_fh "," if $i < $count - 1;
 503         print $out_fh  "\n";
 504     }
 505     print $out_fh "};\n";
 506 }
 507
 508 sub mk_invlist_from_sorted_cp_list {
 509
 510     # Returns an inversion list constructed from the sorted input array of
 511     # code points
 512
 513     my $list_ref = shift;
 514
 515     return unless @$list_ref;
 516
 517     # Initialize to just the first element
 518     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
 519
 520     # For each succeeding element, if it extends the previous range, adjust
 521     # up, otherwise add it.
 522     for my $i (1 .. @$list_ref - 1) {
 523         if ($invlist[-1] == $list_ref->[$i]) {
 524             $invlist[-1]++;
 525         }
 526         else {
 527             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
 528         }
 529     }
 530     return @invlist;
 531 }
 532
 533 # Read in the Case Folding rules, and construct arrays of code points for the
 534 # properties we need.
 535 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
 536 die "Could not find inversion map for Case_Folding" unless defined $format;
 537 die "Incorrect format '$format' for Case_Folding inversion map"
 538                                                     unless $format eq 'al'
 539                                                            || $format eq 'a';
 540 my @has_multi_char_fold;
 541 my @is_non_final_fold;
 542
 543 for my $i (0 .. @$folds_ref - 1) {
 544     next unless ref $folds_ref->[$i];   # Skip single-char folds
 545     push @has_multi_char_fold, $cp_ref->[$i];
 546
 547     # Add to the non-finals list each code point that is in a non-final
 548     # position
 549     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 550         push @is_non_final_fold, $folds_ref->[$i][$j]
 551                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 552     }
 553 }
 554
 555 sub _Perl_Non_Final_Folds {
 556     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 557     return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
 558 }
 559
 560 sub prop_name_for_cmp ($) { # Sort helper
 561     my $name = shift;
 562
 563     # Returns the input lowercased, with non-alphas removed, as well as
 564     # everything starting with a comma
 565
 566     $name =~ s/,.*//;
 567     $name =~ s/[[:^alpha:]]//g;
 568     return lc $name;
 569 }
 570
 571 sub UpperLatin1 {
 572     return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
 573 }
 574
 575 sub output_table_common {
 576
 577     # Common subroutine to actually output the generated rules table.
 578
 579     my ($property,
 580         $table_value_defines_ref,
 581         $table_ref,
 582         $names_ref,
 583         $abbreviations_ref) = @_;
 584     my $size = @$table_ref;
 585
 586     # Output the #define list, sorted by numeric value
 587     if ($table_value_defines_ref) {
 588         my $max_name_length = 0;
 589         my @defines;
 590
 591         # Put in order, and at the same time find the longest name
 592         while (my ($enum, $value) = each %$table_value_defines_ref) {
 593             $defines[$value] = $enum;
 594
 595             my $length = length $enum;
 596             $max_name_length = $length if $length > $max_name_length;
 597         }
 598
 599         print $out_fh "\n";
 600
 601         # Output, so that the values are vertically aligned in a column after
 602         # the longest name
 603         foreach my $i (0 .. @defines - 1) {
 604             next unless defined $defines[$i];
 605             printf $out_fh "#define %-*s  %2d\n",
 606                                       $max_name_length,
 607                                        $defines[$i],
 608                                           $i;
 609         }
 610     }
 611
 612     my $column_width = 2;   # We currently allow 2 digits for the number
 613
 614     # If the maximum value in the table is 1, it can be a bool.  (Being above
 615     # a U8 is not currently handled
 616     my $max_element = 0;
 617     for my $i (0 .. $size - 1) {
 618         for my $j (0 .. $size - 1) {
 619             next if $max_element >= $table_ref->[$i][$j];
 620             $max_element = $table_ref->[$i][$j];
 621         }
 622     }
 623     die "Need wider table column width given '$max_element"
 624                                     if length $max_element > $column_width;
 625
 626     my $table_type = ($max_element == 1)
 627                      ? 'bool'
 628                      : 'U8';
 629
 630     # If a name is longer than the width set aside for a column, its column
 631     # needs to have increased spacing so that the name doesn't get truncated
 632     # nor run into an adjacent column
 633     my @spacers;
 634
 635     # If we are being compiled on a Unicode version earlier than that which
 636     # this file was designed for, it may be that some of the property values
 637     # aren't in the current release, and so would be undefined if we didn't
 638     # define them ourselves.  Earlier code has done this, making them
 639     # lowercase characters of length one.  We look to see if any exist, so
 640     # that we can add an annotation to the output table
 641     my $has_placeholder = 0;
 642
 643     for my $i (0 .. $size - 1) {
 644         no warnings 'numeric';
 645         $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
 646         $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
 647     }
 648
 649     print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
 650
 651     # Calculate the column heading line
 652     my $header_line = "/* "
 653                     . (" " x $max_hdr_len)  # We let the row heading meld to
 654                                             # the '*/' for those that are at
 655                                             # the max
 656                     . " " x 3;    # Space for '*/ '
 657     # Now each column
 658     for my $i (0 .. $size - 1) {
 659         $header_line .= sprintf "%s%*s",
 660                                 $spacers[$i],
 661                                     $column_width + 1, # 1 for the ','
 662                                      $names_ref->[$i];
 663     }
 664     $header_line .= " */\n";
 665
 666     # If we have annotations, output it now.
 667     if ($has_placeholder || scalar %$abbreviations_ref) {
 668         my $text = "";
 669         foreach my $abbr (sort keys %$abbreviations_ref) {
 670             $text .= "; " if $text;
 671             $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
 672         }
 673         if ($has_placeholder) {
 674             $text .= "; other " if $text;
 675             $text .= "lowercase names are placeholders for"
 676                   .  " property values not defined until a later Unicode"
 677                   .  " release, so are irrelevant in this one, as they are"
 678                   .  " not assigned to any code points";
 679         }
 680
 681         my $indent = " " x 3;
 682         $text = $indent . "/* $text */";
 683
 684         # Wrap the text so that it is no wider than the table, which the
 685         # header line gives.
 686         my $output_width = length $header_line;
 687         while (length $text > $output_width) {
 688             my $cur_line = substr($text, 0, $output_width);
 689
 690             # Find the first blank back from the right end to wrap at.
 691             for (my $i = $output_width -1; $i > 0; $i--) {
 692                 if (substr($text, $i, 1) eq " ") {
 693                     print $out_fh substr($text, 0, $i), "\n";
 694
 695                     # Set so will look at just the remaining tail (which will
 696                     # be indented and have a '*' after the indent
 697                     $text = $indent . " * " . substr($text, $i + 1);
 698                     last;
 699                 }
 700             }
 701         }
 702
 703         # And any remaining
 704         print $out_fh $text, "\n" if $text;
 705     }
 706
 707     # We calculated the header line earlier just to get its width so that we
 708     # could make sure the annotations fit into that.
 709     print $out_fh $header_line;
 710
 711     # Now output the bulk of the table.
 712     for my $i (0 .. $size - 1) {
 713
 714         # First the row heading.
 715         printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
 716         print $out_fh "{";  # Then the brace for this row
 717
 718         # Then each column
 719         for my $j (0 .. $size -1) {
 720             print $out_fh $spacers[$j];
 721             printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
 722             print $out_fh "," if $j < $size - 1;
 723         }
 724         print $out_fh " }";
 725         print $out_fh "," if $i < $size - 1;
 726         print $out_fh "\n";
 727     }
 728
 729     print $out_fh "};\n";
 730 }
 731
 732 sub output_GCB_table() {
 733
 734     # Create and output the pair table for use in determining Grapheme Cluster
 735     # Breaks, given in http://www.unicode.org/reports/tr29/.
         #
         # Takes no arguments.  The table is square, with one row and one column
         # per element of @gcb_short_enums, and is indexed by the values in
         # %gcb_enums as
         #      $gcb_table[class before the position][class after the position]
         # Each cell holds one of the action values defined just below.  The
         # bare 0's and 1's assigned throughout this function are GCB_NOBREAK
         # and GCB_BREAKABLE, respectively.
 736     my %gcb_actions = (
 737         GCB_NOBREAK                      => 0,
 738         GCB_BREAKABLE                    => 1,
 739         GCB_RI_then_RI                   => 2,   # Rules GB12 and GB13
 740         GCB_EX_then_EM                   => 3,   # Rule GB10
 741     );
 742
 743     # The table is constructed in reverse order of the rules, to make the
 744     # lower-numbered, higher priority ones override the later ones, as the
 745     # algorithm stops at the earliest matching rule.  Consequently the order
         # of the statements below matters: a later assignment to a cell
         # replaces whatever an earlier (lower priority) rule stored there.
 746
 747     my @gcb_table;
 748     my $table_size = @gcb_short_enums;  # Element count (scalar context)
 749
 750     # Otherwise, break everywhere.
 751     # GB99   Any ÷  Any
         # This also creates every row, so that '@gcb_table - 1' in the loops
         # further down is the same as '$table_size - 1'.
 752     for my $i (0 .. $table_size - 1) {
 753         for my $j (0 .. $table_size - 1) {
 754             $gcb_table[$i][$j] = 1;
 755         }
 756     }
 757
 758     # Do not break within emoji flag sequences. That is, do not break between
 759     # regional indicator (RI) symbols if there is an odd number of RI
 760     # characters before the break point.  Must be resolved in runtime code.
         # Hence the cell gets a synthetic action instead of a plain 0 or 1.
 761     #
 762     # GB12 ^ (RI RI)* RI × RI
 763     # GB13 [^RI] (RI RI)* RI × RI
 764     $gcb_table[$gcb_enums{'Regional_Indicator'}]
 765               [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
 766
 767     # Do not break within emoji modifier sequences or emoji zwj sequences.
 768     # GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
 769     $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
 770     $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
 771
 772     # GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier
         # When Extend is the first of the pair, the pair alone doesn't say
         # what precedes the Extend characters, so that cell gets a synthetic
         # action (presumably resolved by the code that consumes the table,
         # like the RI case above).  The cases with no intervening Extend are
         # simply no-break.
 773     $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}]
 774                                                 = $gcb_actions{GCB_EX_then_EM};
 775     $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0;
 776     $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0;
 777
 778     # Do not break before extending characters or ZWJ.
 779     # Do not break before SpacingMarks, or after Prepend characters.
 780     # GB9b  Prepend  ×
 781     # GB9a  × SpacingMark
 782     # GB9   ×  ( Extend | ZWJ )
         # This zeroes the whole Prepend row, and the whole SpacingMark, Extend,
         # and ZWJ columns.
 783     for my $i (0 .. @gcb_table - 1) {
 784         $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
 785         $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
 786         $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
 787         $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0;
 788     }
 789
 790     # Do not break Hangul syllable sequences.
 791     # GB8  ( LVT | T)  ×  T
 792     $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
 793     $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
 794
 795     # GB7  ( LV | V )  ×  ( V | T )
 796     $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
 797     $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
 798     $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
 799     $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
 800
 801     # GB6  L  ×  ( L | V | LV | LVT )
 802     $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
 803     $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
 804     $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
 805     $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
 806
 807     # Do not break between a CR and LF. Otherwise, break before and after
 808     # controls.
 809     # GB5   ÷  ( Control | CR | LF )
 810     # GB4  ( Control | CR | LF )  ÷
         # Coming after the GB9, GB9a, and GB9b loop, this sets back to 1 the
         # 0's which that loop stored in the Control, CR, and LF rows and
         # columns.
 811     for my $i (0 .. @gcb_table - 1) {
 812         $gcb_table[$i][$gcb_enums{'Control'}] = 1;
 813         $gcb_table[$i][$gcb_enums{'CR'}] = 1;
 814         $gcb_table[$i][$gcb_enums{'LF'}] = 1;
 815         $gcb_table[$gcb_enums{'Control'}][$i] = 1;
 816         $gcb_table[$gcb_enums{'CR'}][$i] = 1;
 817         $gcb_table[$gcb_enums{'LF'}][$i] = 1;
 818     }
 819
 820     # GB3  CR  ×  LF
         # This has to follow the loop just above, which set this cell to 1.
 821     $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
 822
 823     # Break at the start and end of text, unless the text is empty
 824     # GB1  sot  ÷
 825     # GB2   ÷  eot
         # 'EDGE' is the table entry used here for both sot and eot.  The cell
         # with EDGE on both sides is the empty-text case, and is set to
         # no-break after the loop.
 826     for my $i (0 .. @gcb_table - 1) {
 827         $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
 828         $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
 829     }
 830     $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
 831
         # Pass the finished table, the action names, and the class names and
         # abbreviations to the common output routine.
 832     output_table_common('GCB', \%gcb_actions,
 833                         \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
 834 }
 835
 836 sub output_LB_table() {
 837
 838     # Create and output the enums, #defines, and pair table for use in
 839     # determining Line Breaks.  This uses the default line break algorithm,
 840     # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
 841     # in that page, as the Unicode-furnished tests assume that tailoring.
 842
 843     # The result is really just true or false.  But we follow along with tr14,
 844     # creating a rule which is false for something like X SP* X.  That gets
 845     # encoding 2.  The rest of the actions are synthetic ones that indicate
 846     # some context handling is required.  These each are added to the
 847     # underlying 0, 1, or 2, instead of replacing them, so that the underlying
 848     # value can be retrieved.  Actually only rules from 7 through 18 (which
 849     # are the ones where spaces matter) are possible to have 2 added to them.
 850     # The others below add just 0 or 1.  It might be possible for one
 851     # synthetic rule to be added to another, yielding a larger value.  This
 852     # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
 853     # names of the middle grouping below, it is impossible for that to occur
 854     # for them because they all start with mutually exclusive classes.  That
 855     # the final rule can't be added to any of the others isn't obvious from
 856     # its name, so it is assigned a power of 2 higher than the others can get
 857     # to so any addition would preserve all data.  (And the code will reach an
 858     # assert(0) on debugging builds should this happen.)
 859     my %lb_actions = (
 860         LB_NOBREAK                      => 0,
 861         LB_BREAKABLE                    => 1,
 862         LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
 863
 864         LB_CM_ZWJ_foo                   => 3,   # Rule 9
 865         LB_SP_foo                       => 6,   # Rule 18
 866         LB_PR_or_PO_then_OP_or_HY       => 9,   # Rule 25
 867         LB_SY_or_IS_then_various        => 11,  # Rule 25
 868         LB_HY_or_BA_then_foo            => 13,  # Rule 21
 869         LB_RI_then_RI                   => 15,  # Rule 30a
 870
 871         LB_various_then_PO_or_PR        => (1<<5),  # Rule 25
 872     );
 873
 874     # Construct the LB pair table.  This is based on the rules in
 875     # http://www.unicode.org/reports/tr14/, but modified as those rules are
 876     # designed for someone taking a string of text and sequentially going
 877     # through it to find the break opportunities, whereas, Perl requires
 878     # determining if a given random spot is a break opportunity, without
 879     # knowing the entire string before it.
 880     #
 881     # The table is constructed in reverse order of the rules, to make the
 882     # lower-numbered, higher priority ones override the later ones, as the
 883     # algorithm stops at the earliest matching rule
 884
 885     my @lb_table;
 886     my $table_size = @lb_short_enums;
 887
 888     # LB31. Break everywhere else
 889     for my $i (0 .. $table_size - 1) {
 890         for my $j (0 .. $table_size - 1) {
 891             $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
 892         }
 893     }
 894
 895     # LB30b Do not break between an emoji base and an emoji modifier.
 896     # EB × EM
 897     $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
 898                                                 = $lb_actions{'LB_NOBREAK'};
 899
 900     # LB30a Break between two regional indicator symbols if and only if there
 901     # are an even number of regional indicators preceding the position of the
 902     # break.
 903     # sot (RI RI)* RI × RI
 904     # [^RI] (RI RI)* RI × RI
 905     $lb_table[$lb_enums{'Regional_Indicator'}]
 906              [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
 907
 908     # LB30 Do not break between letters, numbers, or ordinary symbols and
 909     # opening or closing parentheses.
 910     # (AL | HL | NU) × OP
 911     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
 912                                                 = $lb_actions{'LB_NOBREAK'};
 913     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
 914                                                 = $lb_actions{'LB_NOBREAK'};
 915     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
 916                                                 = $lb_actions{'LB_NOBREAK'};
 917
 918     # CP × (AL | HL | NU)
 919     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
 920                                                 = $lb_actions{'LB_NOBREAK'};
 921     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
 922                                                 = $lb_actions{'LB_NOBREAK'};
 923     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
 924                                                 = $lb_actions{'LB_NOBREAK'};
 925
 926     # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
 927     # IS × (AL | HL)
 928     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
 929                                                 = $lb_actions{'LB_NOBREAK'};
 930     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
 931                                                 = $lb_actions{'LB_NOBREAK'};
 932
 933     # LB28 Do not break between alphabetics (“at”).
 934     # (AL | HL) × (AL | HL)
 935     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
 936                                                 = $lb_actions{'LB_NOBREAK'};
 937     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
 938                                                 = $lb_actions{'LB_NOBREAK'};
 939     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
 940                                                 = $lb_actions{'LB_NOBREAK'};
 941     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
 942                                                 = $lb_actions{'LB_NOBREAK'};
 943
 944     # LB27 Treat a Korean Syllable Block the same as ID.
 945     # (JL | JV | JT | H2 | H3) × IN
 946     $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
 947                                                 = $lb_actions{'LB_NOBREAK'};
 948     $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
 949                                                 = $lb_actions{'LB_NOBREAK'};
 950     $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
 951                                                 = $lb_actions{'LB_NOBREAK'};
 952     $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
 953                                                 = $lb_actions{'LB_NOBREAK'};
 954     $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
 955                                                 = $lb_actions{'LB_NOBREAK'};
 956
 957     # (JL | JV | JT | H2 | H3) × PO
 958     $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
 959                                                 = $lb_actions{'LB_NOBREAK'};
 960     $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
 961                                                 = $lb_actions{'LB_NOBREAK'};
 962     $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
 963                                                 = $lb_actions{'LB_NOBREAK'};
 964     $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
 965                                                 = $lb_actions{'LB_NOBREAK'};
 966     $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
 967                                                 = $lb_actions{'LB_NOBREAK'};
 968
 969     # PR × (JL | JV | JT | H2 | H3)
 970     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
 971                                                 = $lb_actions{'LB_NOBREAK'};
 972     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
 973                                                 = $lb_actions{'LB_NOBREAK'};
 974     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
 975                                                 = $lb_actions{'LB_NOBREAK'};
 976     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
 977                                                 = $lb_actions{'LB_NOBREAK'};
 978     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
 979                                                 = $lb_actions{'LB_NOBREAK'};
 980
 981     # LB26 Do not break a Korean syllable.
 982     # JL × (JL | JV | H2 | H3)
 983     $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
 984     $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
 985     $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
 986     $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
 987
 988     # (JV | H2) × (JV | JT)
 989     $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
 990     $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
 991     $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
 992     $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
 993
 994     # (JT | H3) × JT
 995     $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
 996     $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
 997
 998     # LB25 Do not break between the following pairs of classes relevant to
 999     # numbers, as tailored by example 7 in
1000     # http://www.unicode.org/reports/tr14/#Examples
1001     # We follow that tailoring because Unicode's test cases expect it
1002     # (PR | PO) × ( OP | HY )? NU
1003     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1004                                                 = $lb_actions{'LB_NOBREAK'};
1005     $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1006                                                 = $lb_actions{'LB_NOBREAK'};
1007
1008         # Given that (OP | HY )? is optional, we have to test for it in code.
1009         # We add in the action (instead of overriding) for this, so that in
1010         # the code we can recover the underlying break value.
1011     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1012                                     += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1013     $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1014                                     += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1015     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
1016                                     += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1017     $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
1018                                     += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1019
1020     # ( OP | HY ) × NU
1021     $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1022                                                 = $lb_actions{'LB_NOBREAK'};
1023     $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1024                                                 = $lb_actions{'LB_NOBREAK'};
1025
1026     # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1027     # which can be rewritten as:
1028     # NU (SY | IS)* × (NU | SY | IS | CL | CP )
1029     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1030                                                 = $lb_actions{'LB_NOBREAK'};
1031     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1032                                                 = $lb_actions{'LB_NOBREAK'};
1033     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1034                                                 = $lb_actions{'LB_NOBREAK'};
1035     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1036                                                 = $lb_actions{'LB_NOBREAK'};
1037     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1038                                                 = $lb_actions{'LB_NOBREAK'};
1039
1040         # Like earlier where we have to test in code, we add in the action so
1041         # that we can recover the underlying values.  This is done in rules
1042         # below, as well.  The code assumes that we haven't added 2 actions.
1043         # Should a later Unicode release break that assumption, then tests
1044         # should start failing.
1045     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
1046                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1047     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
1048                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1049     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
1050                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1051     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
1052                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1053     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
1054                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1055     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
1056                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1057     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
1058                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1059     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
1060                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1061     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
1062                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1063     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
1064                                     += $lb_actions{'LB_SY_or_IS_then_various'};
1065
1066     # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1067     # which can be rewritten as:
1068     # NU (SY | IS)* (CL | CP)? × (PO | PR)
1069     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1070                                                 = $lb_actions{'LB_NOBREAK'};
1071     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1072                                                 = $lb_actions{'LB_NOBREAK'};
1073
1074     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
1075                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1076     $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
1077                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1078     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
1079                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1080     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
1081                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1082
1083     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
1084                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1085     $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
1086                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1087     $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
1088                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1089     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
1090                                     += $lb_actions{'LB_various_then_PO_or_PR'};
1091
1092     # LB24 Do not break between numeric prefix/postfix and letters, or between
1093     # letters and prefix/postfix.
1094     # (PR | PO) × (AL | HL)
1095     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1096                                                 = $lb_actions{'LB_NOBREAK'};
1097     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1098                                                 = $lb_actions{'LB_NOBREAK'};
1099     $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1100                                                 = $lb_actions{'LB_NOBREAK'};
1101     $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1102                                                 = $lb_actions{'LB_NOBREAK'};
1103
1104     # (AL | HL) × (PR | PO)
1105     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1106                                                 = $lb_actions{'LB_NOBREAK'};
1107     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1108                                                 = $lb_actions{'LB_NOBREAK'};
1109     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1110                                                 = $lb_actions{'LB_NOBREAK'};
1111     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1112                                                 = $lb_actions{'LB_NOBREAK'};
1113
1114     # LB23a Do not break between numeric prefixes and ideographs, or between
1115     # ideographs and numeric postfixes.
1116     # PR × (ID | EB | EM)
1117     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1118                                                 = $lb_actions{'LB_NOBREAK'};
1119     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1120                                                 = $lb_actions{'LB_NOBREAK'};
1121     $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1122                                                 = $lb_actions{'LB_NOBREAK'};
1123
1124     # (ID | EB | EM) × PO
1125     $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1126                                                 = $lb_actions{'LB_NOBREAK'};
1127     $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1128                                                 = $lb_actions{'LB_NOBREAK'};
1129     $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1130                                                 = $lb_actions{'LB_NOBREAK'};
1131
1132     # LB23 Do not break between digits and letters
1133     # (AL | HL) × NU
1134     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1135                                                 = $lb_actions{'LB_NOBREAK'};
1136     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1137                                                 = $lb_actions{'LB_NOBREAK'};
1138
1139     # NU × (AL | HL)
1140     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1141                                                 = $lb_actions{'LB_NOBREAK'};
1142     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1143                                                 = $lb_actions{'LB_NOBREAK'};
1144
1145     # LB22 Do not break between two ellipses, or between letters, numbers or
1146     # exclamations and ellipsis.
1147     # (AL | HL) × IN
1148     $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1149                                                 = $lb_actions{'LB_NOBREAK'};
1150     $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1151                                                 = $lb_actions{'LB_NOBREAK'};
1152
1153     # Exclamation × IN
1154     $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1155                                                 = $lb_actions{'LB_NOBREAK'};
1156
1157     # (ID | EB | EM) × IN
1158     $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1159                                                 = $lb_actions{'LB_NOBREAK'};
1160     $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1161                                                 = $lb_actions{'LB_NOBREAK'};
1162     $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1163                                                 = $lb_actions{'LB_NOBREAK'};
1164
1165     # IN × IN
1166     $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1167                                                 = $lb_actions{'LB_NOBREAK'};
1168
1169     # NU × IN
1170     $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1171                                                 = $lb_actions{'LB_NOBREAK'};
1172
1173     # LB21b Don’t break between Solidus and Hebrew letters.
1174     # SY × HL
1175     $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1176                                                 = $lb_actions{'LB_NOBREAK'};
1177
1178     # LB21a Don't break after Hebrew + Hyphen.
1179     # HL (HY | BA) ×
1180     for my $i (0 .. @lb_table - 1) {
1181         $lb_table[$lb_enums{'Hyphen'}][$i]
1182                                         += $lb_actions{'LB_HY_or_BA_then_foo'};
1183         $lb_table[$lb_enums{'Break_After'}][$i]
1184                                         += $lb_actions{'LB_HY_or_BA_then_foo'};
1185     }
1186
1187     # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1188     # spaces, small kana, and other non-starters, or after acute accents.
1189     # × BA
1190     # × HY
1191     # × NS
1192     # BB ×
1193     for my $i (0 .. @lb_table - 1) {
1194         $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1195         $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1196         $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1197         $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
1198     }
1199
1200     # LB20 Break before and after unresolved CB.
1201     # ÷ CB
1202     # CB ÷
1203     # Conditional breaks should be resolved external to the line breaking
1204     # rules. However, the default action is to treat unresolved CB as breaking
1205     # before and after.
1206     for my $i (0 .. @lb_table - 1) {
1207         $lb_table[$i][$lb_enums{'Contingent_Break'}]
1208                                                 = $lb_actions{'LB_BREAKABLE'};
1209         $lb_table[$lb_enums{'Contingent_Break'}][$i]
1210                                                 = $lb_actions{'LB_BREAKABLE'};
1211     }
1212
1213     # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1214     # × QU
1215     # QU ×
1216     for my $i (0 .. @lb_table - 1) {
1217         $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1218         $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
1219     }
1220
1221     # LB18 Break after spaces
1222     # SP ÷
1223     for my $i (0 .. @lb_table - 1) {
1224         $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
1225     }
1226
1227     # LB17 Do not break within ‘——’, even with intervening spaces.
1228     # B2 SP* × B2
1229     $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
1230                            = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1231
1232     # LB16 Do not break between closing punctuation and a nonstarter even with
1233     # intervening spaces.
1234     # (CL | CP) SP* × NS
1235     $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
1236                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1237     $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
1238                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1239
1240
1241     # LB15 Do not break within ‘”[’, even with intervening spaces.
1242     # QU SP* × OP
1243     $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
1244                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1245
1246     # LB14 Do not break after ‘[’, even after spaces.
1247     # OP SP* ×
1248     for my $i (0 .. @lb_table - 1) {
1249         $lb_table[$lb_enums{'Open_Punctuation'}][$i]
1250                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1251     }
1252
1253     # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1254     # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1255     # [^NU] × CL
1256     # [^NU] × CP
1257     # × EX
1258     # [^NU] × IS
1259     # [^NU] × SY
1260     for my $i (0 .. @lb_table - 1) {
1261         $lb_table[$i][$lb_enums{'Exclamation'}]
1262                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1263
1264         next if $i == $lb_enums{'Numeric'};
1265
1266         $lb_table[$i][$lb_enums{'Close_Punctuation'}]
1267                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1268         $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
1269                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1270         $lb_table[$i][$lb_enums{'Infix_Numeric'}]
1271                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1272         $lb_table[$i][$lb_enums{'Break_Symbols'}]
1273                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1274     }
1275
1276     # LB12a Do not break before NBSP and related characters, except after
1277     # spaces and hyphens.
1278     # [^SP BA HY] × GL
1279     for my $i (0 .. @lb_table - 1) {
1280         next if    $i == $lb_enums{'Space'}
1281                 || $i == $lb_enums{'Break_After'}
1282                 || $i == $lb_enums{'Hyphen'};
1283
1284         # We don't break, but if a property above has said don't break even
1285         # with space between, don't override that (also in the next few rules)
1286         next if $lb_table[$i][$lb_enums{'Glue'}]
1287                             == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1288         $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
1289     }
1290
1291     # LB12 Do not break after NBSP and related characters.
1292     # GL ×
1293     for my $i (0 .. @lb_table - 1) {
1294         next if $lb_table[$lb_enums{'Glue'}][$i]
1295                             == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1296         $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
1297     }
1298
1299     # LB11 Do not break before or after Word joiner and related characters.
1300     # × WJ
1301     # WJ ×
1302     for my $i (0 .. @lb_table - 1) {
1303         if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
1304                         != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1305         {
1306             $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
1307         }
1308         if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
1309                         != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1310         {
1311             $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
1312         }
1313     }
1314
1315     # Special case this here to avoid having to do a special case in the code,
1316     # by making this the same as other things with a SP in front of them that
1317     # don't break, we avoid an extra test
1318     $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
1319                             = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1320
1321     # LB9 and LB10 are done in the same loop
1322     #
1323     # LB9 Do not break a combining character sequence; treat it as if it has
1324     # the line breaking class of the base character in all of the
1325     # higher-numbered rules.  Treat ZWJ as if it were CM
1326     # Treat X (CM|ZWJ)* as if it were X.
1327     # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1328
1329     # LB10 Treat any remaining combining mark or ZWJ as AL.  This catches the
1330     # case where a CM or ZWJ is the first character on the line or follows SP,
1331     # BK, CR, LF, NL, or ZW.
1332     for my $i (0 .. @lb_table - 1) {
1333
1334         # When the CM or ZWJ is the first in the pair, we don't know without
1335         # looking behind whether the CM or ZWJ is going to attach to an
1336         # earlier character, or not.  So have to figure this out at runtime in
1337         # the code
1338         $lb_table[$lb_enums{'Combining_Mark'}][$i]
1339                                         = $lb_actions{'LB_CM_ZWJ_foo'};
1340         $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
1341
1342         if (   $i == $lb_enums{'Mandatory_Break'}
1343             || $i == $lb_enums{'EDGE'}
1344             || $i == $lb_enums{'Carriage_Return'}
1345             || $i == $lb_enums{'Line_Feed'}
1346             || $i == $lb_enums{'Next_Line'}
1347             || $i == $lb_enums{'Space'}
1348             || $i == $lb_enums{'ZWSpace'})
1349         {
1350             # For these classes, a following CM doesn't combine, and should do
1351             # whatever 'Alphabetic' would do.
1352             $lb_table[$i][$lb_enums{'Combining_Mark'}]
1353                                     = $lb_table[$i][$lb_enums{'Alphabetic'}];
1354             $lb_table[$i][$lb_enums{'ZWJ'}]
1355                                     = $lb_table[$i][$lb_enums{'Alphabetic'}];
1356         }
1357         else {
1358             # For these classes, the CM or ZWJ combines, so doesn't break,
1359             # inheriting the type of nobreak from the master character.
1360             if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
1361                             != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1362             {
1363                 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1364                                         = $lb_actions{'LB_NOBREAK'};
1365             }
1366             if ($lb_table[$i][$lb_enums{'ZWJ'}]
1367                             != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1368             {
1369                 $lb_table[$i][$lb_enums{'ZWJ'}]
1370                                         = $lb_actions{'LB_NOBREAK'};
1371             }
1372         }
1373     }
1374
1375     # LB8a Do not break between a zero width joiner and an ideograph, emoji
1376     # base or emoji modifier. This rule prevents breaks within emoji joiner
1377     # sequences.
1378     # ZWJ × (ID | EB | EM)
1379     $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1380                                                     = $lb_actions{'LB_NOBREAK'};
1381     $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1382                                                     = $lb_actions{'LB_NOBREAK'};
1383     $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1384                                                     = $lb_actions{'LB_NOBREAK'};
1385
1386     # LB8 Break before any character following a zero-width space, even if one
1387     # or more spaces intervene.
1388     # ZW SP* ÷
1389     for my $i (0 .. @lb_table - 1) {
1390         $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
1391     }
1392
1393     # Because of LB8-10, we need to look at context for "SP x", and this must
1394     # be done in the code.  So override the existing rules for that, by adding
1395     # a constant to get new rules that tell the code it needs to look at
1396     # context.  By adding this action instead of replacing the existing one,
1397     # we can get back to the original rule if necessary.
1398     for my $i (0 .. @lb_table - 1) {
1399         $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
1400     }
1401
1402     # LB7 Do not break before spaces or zero width space.
1403     # × SP
1404     # × ZW
1405     for my $i (0 .. @lb_table - 1) {
1406         $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1407         $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
1408     }
1409
1410     # LB6 Do not break before hard line breaks.
1411     # × ( BK | CR | LF | NL )
1412     for my $i (0 .. @lb_table - 1) {
1413         $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1414         $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1415         $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1416         $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
1417     }
1418
1419     # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1420     # CR × LF
1421     # CR !
1422     # LF !
1423     # NL !
1424     for my $i (0 .. @lb_table - 1) {
1425         $lb_table[$lb_enums{'Carriage_Return'}][$i]
1426                                 = $lb_actions{'LB_BREAKABLE'};
1427         $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1428         $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
1429     }
1430     $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1431                             = $lb_actions{'LB_NOBREAK'};
1432
1433     # LB4 Always break after hard line breaks.
1434     # BK !
1435     for my $i (0 .. @lb_table - 1) {
1436         $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1437                                 = $lb_actions{'LB_BREAKABLE'};
1438     }
1439
1440     # LB3 Always break at the end of text.
1441     # ! eot
1442     # LB2 Never break at the start of text.
1443     # sot ×
1444     for my $i (0 .. @lb_table - 1) {
1445         $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1446         $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
1447     }
1448
1449     # LB1 Assign a line breaking class to each code point of the input.
1450     # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1451     # depending on criteria outside the scope of this algorithm.
1452     #
1453     # In the absence of such criteria all characters with a specific
1454     # combination of original class and General_Category property value are
1455     # resolved as follows:
1456     # Original     Resolved  General_Category
1457     # AI, SG, XX      AL      Any
1458     # SA              CM      Only Mn or Mc
1459     # SA              AL      Any except Mn and Mc
1460     # CJ              NS      Any
1461     #
1462     # This is done in mktables, so we never see any of the remapped-from
1463     # classes.
1464
1465     output_table_common('LB', \%lb_actions,
1466                         \@lb_table, \@lb_short_enums, \%lb_abbreviations);
1467 }
1468
sub output_WB_table() {

    # Create and output the enums, #defines, and pair table for use in
    # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # Takes no arguments.  It reads %wb_enums, @wb_short_enums and
    # %wb_abbreviations, which are set up outside this subroutine, and hands
    # the finished table to output_table_common().
    #
    # $wb_table[$before][$after] holds the action to take when deciding
    # whether there is a break between a character of class $before and an
    # immediately following character of class $after.

    # This uses the same mechanism in the other bounds tables generated by
    # this file.  The actions that could override a 0 or 1 are added to those
    # numbers; the actions that clearly don't depend on the underlying rule
    # simply overwrite.  (The additive actions are spaced two apart so that
    # adding one to a 0 or a 1 still gives a distinct value.)
    my %wb_actions = (
        WB_NOBREAK                      => 0,
        WB_BREAKABLE                    => 1,
        WB_hs_then_hs                   => 2,
        WB_Ex_or_FO_or_ZWJ_then_foo     => 3,
        WB_DQ_then_HL                   => 4,
        WB_HL_then_DQ                   => 6,
        WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
        WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
        WB_MB_or_MN_or_SQ_then_NU       => 12,
        WB_NU_then_MB_or_MN_or_SQ       => 14,
        WB_RI_then_RI                   => 16,
    );

    # Construct the WB pair table.
    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule.  Hence the ORDER OF THE
    # STATEMENTS BELOW MATTERS; don't rearrange them.

    my @wb_table;
    my $table_size = @wb_short_enums - 1;   # -1 because we don't use UNKNOWN

    # Groups of classes that several rules treat alike.
    my @letters        = qw(ALetter Hebrew_Letter);
    my @mid_letters    = qw(MidNumLet MidLetter Single_Quote);
    my @mid_nums       = qw(MidNumLet MidNum Single_Quote);
    my @ignorables     = qw(Extend Format ZWJ);
    my @white_space    = qw(CR LF Newline Perl_Tailored_HSpace);

    # Private helper: for every pair formed by taking a class name from
    # @$befores and one from @$afters, REPLACE the table entry with the action
    # named $action.
    my $set_pairs = sub {
        my ($action, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $wb_table[$wb_enums{$before}][$wb_enums{$after}]
                                                    = $wb_actions{$action};
            }
        }
    };

    # Private helper: like $set_pairs, but ADDS the action to whatever is
    # already in the entry, so the underlying 0 or 1 remains recoverable.
    my $add_to_pairs = sub {
        my ($action, $befores, $afters) = @_;
        for my $before (@$befores) {
            for my $after (@$afters) {
                $wb_table[$wb_enums{$before}][$wb_enums{$after}]
                                                    += $wb_actions{$action};
            }
        }
    };

    # Otherwise, break everywhere (including around ideographs).
    # WB99  Any  ÷  Any
    for my $i (0 .. $table_size - 1) {
        $wb_table[$i] = [ ($wb_actions{'WB_BREAKABLE'}) x $table_size ];
    }

    # Do not break within emoji flag sequences. That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.
    # WB16  [^RI] (RI RI)* RI × RI
    # WB15   ^    (RI RI)* RI × RI
    $set_pairs->('WB_RI_then_RI', [ 'Regional_Indicator' ],
                                  [ 'Regional_Indicator' ]);

    # Do not break within emoji modifier sequences.
    # WB14  ( E_Base | EBG )  ×  E_Modifier
    $set_pairs->('WB_NOBREAK', [ qw(E_Base E_Base_GAZ) ], [ 'E_Modifier' ]);

    # Do not break from extenders.
    # WB13b  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric | Katakana)
    $set_pairs->('WB_NOBREAK', [ 'ExtendNumLet' ],
                               [ @letters, qw(Numeric Katakana) ]);

    # WB13a  (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
    #        × # ExtendNumLet
    $set_pairs->('WB_NOBREAK', [ @letters, qw(Numeric Katakana ExtendNumLet) ],
                               [ 'ExtendNumLet' ]);

    # Do not break between Katakana.
    # WB13  Katakana  ×  Katakana
    $set_pairs->('WB_NOBREAK', [ 'Katakana' ], [ 'Katakana' ]);

    # Do not break within sequences, such as “3.2” or “3,456.789”.
    # WB12  Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
    $add_to_pairs->('WB_NU_then_MB_or_MN_or_SQ', [ 'Numeric' ], \@mid_nums);

    # WB11  Numeric (MidNum | (MidNumLet | Single_Quote))  ×  Numeric
    $add_to_pairs->('WB_MB_or_MN_or_SQ_then_NU', \@mid_nums, [ 'Numeric' ]);

    # Do not break within sequences of digits, or digits adjacent to letters
    # (“3a”, or “A3”).
    # WB10  Numeric  ×  (ALetter | Hebrew_Letter)
    $set_pairs->('WB_NOBREAK', [ 'Numeric' ], \@letters);

    # WB9  (ALetter | Hebrew_Letter)  ×  Numeric
    $set_pairs->('WB_NOBREAK', \@letters, [ 'Numeric' ]);

    # WB8  Numeric  ×  Numeric
    $set_pairs->('WB_NOBREAK', [ 'Numeric' ], [ 'Numeric' ]);

    # Do not break letters across certain punctuation.
    # WB7c  Hebrew_Letter Double_Quote  ×  Hebrew_Letter
    $add_to_pairs->('WB_DQ_then_HL', [ 'Double_Quote' ], [ 'Hebrew_Letter' ]);

    # WB7b  Hebrew_Letter  ×  Double_Quote Hebrew_Letter
    $add_to_pairs->('WB_HL_then_DQ', [ 'Hebrew_Letter' ], [ 'Double_Quote' ]);

    # WB7a  Hebrew_Letter  ×  Single_Quote
    # (WB6 below then adds its action on top of this entry.)
    $set_pairs->('WB_NOBREAK', [ 'Hebrew_Letter' ], [ 'Single_Quote' ]);

    # WB7  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
    #       × (ALetter | Hebrew_Letter)
    $add_to_pairs->('WB_MB_or_ML_or_SQ_then_LE_or_HL',
                    \@mid_letters, \@letters);

    # WB6  (ALetter | Hebrew_Letter)  ×  (MidLetter | MidNumLet
    #       | Single_Quote) (ALetter | Hebrew_Letter)
    $add_to_pairs->('WB_LE_or_HL_then_MB_or_ML_or_SQ',
                    \@letters, \@mid_letters);

    # Do not break between most letters.
    # WB5  (ALetter | Hebrew_Letter)  ×  (ALetter | Hebrew_Letter)
    $set_pairs->('WB_NOBREAK', \@letters, \@letters);

    # Ignore Format and Extend characters, except after sot, CR, LF, and
    # Newline.  This also has the effect of: Any × (Format | Extend | ZWJ)
    # WB4  X (Extend | Format | ZWJ)* → X
    #
    # First the rows: when one of these is the first of the pair, the action
    # is WB_Ex_or_FO_or_ZWJ_then_foo.
    for my $i (0 .. $#wb_table) {
        for my $ignorable (@ignorables) {
            $wb_table[$wb_enums{$ignorable}][$i]
                                = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
        }
    }

    # Then the columns, in a separate, later loop so that they win where a
    # row and a column cross.  Implied is that these attach to the character
    # before them, except for the characters that mark the end of a region of
    # text.  The rules below override the ones set up here, for all the
    # characters that need overriding.
    for my $i (0 .. $#wb_table) {
        for my $ignorable (@ignorables) {
            $wb_table[$i][$wb_enums{$ignorable}] = $wb_actions{'WB_NOBREAK'};
        }
    }

    # Do not break within emoji zwj sequences.
    # WB3c ZWJ × ( Glue_After_Zwj | EBG )
    $set_pairs->('WB_NOBREAK', [ 'ZWJ' ], [ qw(Glue_After_Zwj E_Base_GAZ) ]);

    # Break before and after white space
    # WB3b     ÷  (Newline | CR | LF)
    # WB3a  (Newline | CR | LF)  ÷
    # et al.
    for my $space (@white_space) {
        for my $i (0 .. $#wb_table) {
            $wb_table[$i][$wb_enums{$space}] = $wb_actions{'WB_BREAKABLE'};
            $wb_table[$wb_enums{$space}][$i] = $wb_actions{'WB_BREAKABLE'};
        }
    }

    # But do not break within white space.
    # WB3  CR  ×  LF
    # et al.
    $set_pairs->('WB_NOBREAK', \@white_space, \@white_space);

    # And do not break horizontal space followed by Extend or Format or ZWJ
    $set_pairs->('WB_NOBREAK', [ 'Perl_Tailored_HSpace' ], \@ignorables);
    $set_pairs->('WB_hs_then_hs', [ 'Perl_Tailored_HSpace' ],
                                  [ 'Perl_Tailored_HSpace' ]);

    # Break at the start and end of text, unless the text is empty
    # WB2  Any  ÷  eot
    # WB1  sot  ÷  Any
    for my $i (0 .. $#wb_table) {
        $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
        $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
    }

    # The empty-text case: an edge directly followed by an edge.
    $set_pairs->('WB_NOBREAK', [ 'EDGE' ], [ 'EDGE' ]);

    output_table_common('WB', \%wb_actions,
                        \@wb_table, \@wb_short_enums, \%wb_abbreviations);
}
1717
# Two fixed lists.  In an inversion list each element starts a range that
# runs up to (but not including) the next element, and a final unpaired
# element starts a range with no upper bound.  So the first list covers code
# points 0 through 255, and the second covers everything from 256 on up.
output_invlist('Latin1',      [ 0, 256 ]);
output_invlist('AboveLatin1', [ 256 ]);

# NOTE(review): presumably this closes whatever '#if' is still open in the
# output file at this point -- confirm against the sub's definition.
end_file_pound_if();
1722
1723 # We construct lists for all the POSIX and backslash sequence character
1724 # classes in two forms:
1725 #   1) ones which match only in the ASCII range
1726 #   2) ones which match either in the Latin1 range, or the entire Unicode range
1727 #
1728 # These get compiled in, and hence affect the memory footprint of every Perl
1729 # program, even those not using Unicode.  To minimize the size, currently
1730 # the Latin1 version is generated for the beyond ASCII range except for those
1731 # lists that are quite small for the entire range, such as for \s, which is 22
1732 # UVs long plus 4 UVs (currently) for the header.
1733 #
1734 # To save even more memory, the ASCII versions could be derived from the
1735 # larger ones at runtime, saving some memory (minus the expense of the machine
1736 # instructions to do so), but these are all small anyway, so their total is
1737 # about 100 UVs.
1738 #
1739 # In the list of properties below that get generated, the L1 prefix is a fake
1740 # property that means just the Latin1 range of the full property (whose name
1741 # has an X prefix instead of L1).
1742 #
1743 # An initial & means to use the subroutine from this file instead of an
1744 # official inversion list.
1745
1746 for my $charset (get_supported_code_pages()) {
1747     print $out_fh "\n" . get_conditional_compile_line_start($charset);
1748
1749     @a2n = @{get_a2n($charset)};
1750     no warnings 'qw';
1751                          # Ignore non-alpha in sort
1752     for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1753                              Assigned
1754                              ASCII
1755                              Cased
1756                              VertSpace
1757                              XPerlSpace
1758                              XPosixAlnum
1759                              XPosixAlpha
1760                              XPosixBlank
1761                              XPosixCntrl
1762                              XPosixDigit
1763                              XPosixGraph
1764                              XPosixLower
1765                              XPosixPrint
1766                              XPosixPunct
1767                              XPosixSpace
1768                              XPosixUpper
1769                              XPosixWord
1770                              XPosixXDigit
1771                              _Perl_Any_Folds
1772                              &NonL1_Perl_Non_Final_Folds
1773                              _Perl_Folds_To_Multi_Char
1774                              &UpperLatin1
1775                              _Perl_IDStart
1776                              _Perl_IDCont
1777                              _Perl_GCB,EDGE
1778                              _Perl_LB,EDGE
1779                              _Perl_SB,EDGE
1780                              _Perl_WB,EDGE,UNKNOWN
1781                            )
1782     ) {
1783
1784         # For the Latin1 properties, we change to use the eXtended version of the
1785         # base property, then go through the result and get rid of everything not
1786         # in Latin1 (above 255).  Actually, we retain the element for the range
1787         # that crosses the 255/256 boundary if it is one that matches the
1788         # property.  For example, in the Word property, there is a range of code
1789         # points that start at U+00F8 and goes through U+02C1.  Instead of
1790         # artificially cutting that off at 256 because 256 is the first code point
1791         # above Latin1, we let the range go to its natural ending.  That gives us
1792         # extra information with no added space taken.  But if the range that
1793         # crosses the boundary is one that doesn't match the property, we don't
1794         # start a new range above 255, as that could be construed as going to
1795         # infinity.  For example, the Upper property doesn't include the character
1796         # at 255, but does include the one at 256.  We don't include the 256 one.
1797         my $prop_name = $prop;
1798         my $is_local_sub = $prop_name =~ s/^&//;
1799         my $extra_enums = "";
1800         $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
1801         my $lookup_prop = $prop_name;
1802         my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1803                        or $lookup_prop =~ s/^L1//);
1804         my $nonl1_only = 0;
1805         $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
1806         ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
1807
1808         my @invlist;
1809         my @invmap;
1810         my $map_format;
1811         my $map_default;
1812         my $maps_to_code_point;
1813         my $to_adjust;
1814         if ($is_local_sub) {
1815             @invlist = eval $lookup_prop;
1816             die $@ if $@;
1817         }
1818         else {
1819             @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
1820             if (! @invlist) {
1821
1822                 # If couldn't find a non-empty inversion list, see if it is
1823                 # instead an inversion map
1824                 my ($list_ref, $map_ref, $format, $default)
1825                           = prop_invmap($lookup_prop, '_perl_core_internal_ok');
1826                 if (! $list_ref) {
1827                     # An empty return here could mean an unknown property, or
1828                     # merely that the original inversion list is empty.  Call
1829                     # in scalar context to differentiate
1830                     my $count = prop_invlist($lookup_prop,
1831                                              '_perl_core_internal_ok');
1832                     die "Could not find inversion list for '$lookup_prop'"
1833                                                           unless defined $count;
1834                 }
1835                 else {
1836                     @invlist = @$list_ref;
1837                     @invmap = @$map_ref;
1838                     $map_format = $format;
1839                     $map_default = $default;
1840                     $maps_to_code_point = $map_format =~ /x/;
1841                     $to_adjust = $map_format =~ /a/;
1842                 }
1843             }
1844         }
1845
1846
1847         # Short-circuit an empty inversion list.
1848         if (! @invlist) {
1849             output_invlist($prop_name, \@invlist, $charset);
1850             next;
1851         }
1852
1853         # Re-order the Unicode code points to native ones for this platform.
1854         # This is only needed for code points below 256, because native code
1855         # points are only in that range.  For inversion maps of properties
1856         # where the mappings are adjusted (format =~ /a/), this reordering
1857         # could mess up the adjustment pattern that was in the input, so that
1858         # has to be dealt with.
1859         #
1860         # And inversion maps that map to code points need to eventually have
1861         # all those code points remapped to native, and it's better to do that
1862         # here, going through the whole list not just those below 256.  This
1863         # is because some inversion maps have adjustments (format =~ /a/)
1864         # which may be affected by the reordering.  This code needs to be done
1865         # both for when we are translating the inversion lists for < 256, and
1866         # for the inversion maps for everything.  By doing both in this loop,
1867         # we can share that code.
1868         #
1869         # So, we go through everything for an inversion map to code points;
1870         # otherwise, we can skip any remapping at all if we are going to
1871         # output only the above-Latin1 values, or if the range spans the whole
1872         # of 0..256, as the remap will also include all of 0..256  (256 not
1873         # 255 because a re-ordering could cause 256 to need to be in the same
1874         # range as 255.)
1875         if ((@invmap && $maps_to_code_point)
1876             || (! $nonl1_only || ($invlist[0] < 256
1877                                   && ! ($invlist[0] == 0 && $invlist[1] > 256))))
1878         {
1879
1880             if (! @invmap) {    # Straight inversion list
1881             # Look at all the ranges that start before 257.
1882             my @latin1_list;
1883             while (@invlist) {
1884                 last if $invlist[0] > 256;
1885                 my $upper = @invlist > 1
1886                             ? $invlist[1] - 1      # In range
1887
1888                               # To infinity.  You may want to stop much much
1889                               # earlier; going this high may expose perl
1890                               # deficiencies with very large numbers.
1891                             : $Unicode::UCD::MAX_CP;
1892                 for my $j ($invlist[0] .. $upper) {
1893                     push @latin1_list, a2n($j);
1894                 }
1895
1896                 shift @invlist; # Shift off the range that's in the list
1897                 shift @invlist; # Shift off the range not in the list
1898             }
1899
1900             # Here @invlist contains all the ranges in the original that start
1901             # at code points above 256, and @latin1_list contains all the
1902             # native code points for ranges that start with a Unicode code
1903             # point below 257.  We sort the latter and convert it to inversion
1904             # list format.  Then simply prepend it to the list of the higher
1905             # code points.
1906             @latin1_list = sort { $a <=> $b } @latin1_list;
1907             @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
1908             unshift @invlist, @latin1_list;
1909             }
1910             else {  # Is an inversion map
1911
1912                 # This is a similar procedure as plain inversion list, but has
1913                 # multiple buckets.  A plain inversion list just has two
1914                 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1915                 # pretty much can ignore the 2nd bucket, as it is completely
1916                 # defined by the 1st.  But here, what we do is create buckets
1917                 # which contain the code points that map to each, translated
1918                 # to native and turned into an inversion list.  Thus each
1919                 # bucket is an inversion list of native code points that map
1920                 # to it or don't map to it.  We use these to create an
1921                 # inversion map for the whole property.
1922
1923                 # As mentioned earlier, we use this procedure to not just
1924                 # remap the inversion list to native values, but also the maps
1925                 # of code points to native ones.  In the latter case we have
1926                 # to look at the whole of the inversion map (or at least to
1927                 # above Unicode; as the maps of code points above that should
1928                 # all be to the default).
1929                 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1930
1931                 my %mapped_lists;   # A hash whose keys are the buckets.
1932                 while (@invlist) {
1933                     last if $invlist[0] > $upper_limit;
1934
1935                     # This shouldn't actually happen, as prop_invmap() returns
1936                     # an extra element at the end that is beyond $upper_limit
1937                     die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1938
1939                     my $bucket;
1940
1941                     # A hash key can't be a ref (we are only expecting arrays
1942                     # of scalars here), so convert any such to a string that
1943                     # will be converted back later (using a vertical tab as
1944                     # the separator).  Even if the mapping is to code points,
1945                     # we don't translate to native here because the code
1946                     # output_map() calls to output these arrays assumes the
1947                     # input is Unicode, not native.
1948                     if (ref $invmap[0]) {
1949                         $bucket = join "\cK", @{$invmap[0]};
1950                     }
1951                     elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1952
1953                         # Do convert to native for maps to single code points.
1954                         # There are some properties that have a few outlier
1955                         # maps that aren't code points, so the above test
1956                         # skips those.
1957                         $bucket = a2n($invmap[0]);
1958                     } else {
1959                         $bucket = $invmap[0];
1960                     }
1961
1962                     # We now have the bucket that all code points in the range
1963                     # map to, though possibly they need to be adjusted.  Go
1964                     # through the range and put each translated code point in
1965                     # it into its bucket.
1966                     my $base_map = $invmap[0];
1967                     for my $j ($invlist[0] .. $invlist[1] - 1) {
1968                         if ($to_adjust
1969                                # The 1st code point doesn't need adjusting
1970                             && $j > $invlist[0]
1971
1972                                # Skip any non-numeric maps: these are outliers
1973                                # that aren't code points.
1974                             && $base_map =~ $numeric_re
1975
1976                                #  'ne' because the default can be a string
1977                             && $base_map ne $map_default)
1978                         {
1979                             # We adjust, by incrementing both the bucket and
1980                             # the map.  For code point maps, translate to
1981                             # native
1982                             $base_map++;
1983                             $bucket = ($maps_to_code_point)
1984                                       ? a2n($base_map)
1985                                       : $base_map;
1986                         }
1987
1988                         # Add the native code point to the bucket for the
1989                         # current map
1990                         push @{$mapped_lists{$bucket}}, a2n($j);
1991                     } # End of loop through all code points in the range
1992
1993                     # Get ready for the next range
1994                     shift @invlist;
1995                     shift @invmap;
1996                 } # End of loop through all ranges in the map.
1997
1998                 # Here, @invlist and @invmap retain all the ranges from the
1999                 # originals that start with code points above $upper_limit.
2000                 # Each bucket in %mapped_lists contains all the code points
2001                 # that map to that bucket.  If the bucket is for a map to a
2002                 # single code point, that code point (the bucket key) has
2003                 # been converted to native.  If something else (including
2004                 # multiple code points), no conversion is done.
2005                 #
2006                 # Now we recreate the inversion map into %xlated, but this
2007                 # time for the native character set.
2008                 my %xlated;
2009                 foreach my $bucket (keys %mapped_lists) {
2010
2011                     # Sort and convert this bucket to an inversion list.  The
2012                     # result will be that ranges that start with even-numbered
2013                     # indexes will be for code points that map to this bucket;
2014                     # odd ones map to some other bucket, and are discarded
2015                     # below.
2016                     @{$mapped_lists{$bucket}}
2017                                     = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2018                     @{$mapped_lists{$bucket}}
2019                      = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2020
2021                     # Add each even-numbered range in the bucket to %xlated;
2022                     # so that the keys of %xlated become the range start code
2023                     # points, and the values are their corresponding maps.
2024                     while (@{$mapped_lists{$bucket}}) {
2025                         my $range_start = $mapped_lists{$bucket}->[0];
2026                         if ($bucket =~ /\cK/) {
2027                             @{$xlated{$range_start}} = split /\cK/, $bucket;
2028                         }
2029                         else {
2030                             $xlated{$range_start} = $bucket;
2031                         }
2032                         shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2033                         shift @{$mapped_lists{$bucket}}; # Get ready for next
2034                                                          # iteration
2035                     }
2036                 } # End of loop through all the buckets.
2037
2038                 # Here %xlated's keys are the range starts of all the code
2039                 # points in the inversion map.  Construct an inversion list
2040                 # from them.
2041                 my @new_invlist = sort { $a <=> $b } keys %xlated;
2042
2043                 # If the list is adjusted, we want to munge this list so that
2044                 # we only have one entry for where consecutive code points map
2045                 # to consecutive values.  We just skip the subsequent entries
2046                 # where this is the case.
2047                 if ($to_adjust) {
2048                     my @temp;
2049                     for my $i (0 .. @new_invlist - 1) {
2050                         next if $i > 0
2051                                 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2052                                 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2053                                 && $xlated{$new_invlist[$i]} =~ $numeric_re
2054                                 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2055                         push @temp, $new_invlist[$i];
2056                     }
2057                     @new_invlist = @temp;
2058                 }
2059
2060                 # The inversion map comes from %xlated's values.  We can
2061                 # unshift each onto the front of the untouched portion, in
2062                 # reverse order of the portion we did process.
2063                 foreach my $start (reverse @new_invlist) {
2064                     unshift @invmap, $xlated{$start};
2065                 }
2066
2067                 # Finally prepend the inversion list we have just constructed to the
2068                 # one that contains anything we didn't process.
2069                 unshift @invlist, @new_invlist;
2070             }
2071         }
2072
2073         # prop_invmap() returns an extra final entry, which we can now
2074         # discard.
2075         if (@invmap) {
2076             pop @invlist;
2077             pop @invmap;
2078         }
2079
2080         if ($l1_only) {
2081             die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
2082             for my $i (0 .. @invlist - 1 - 1) {
2083                 if ($invlist[$i] > 255) {
2084
2085                     # In an inversion list, even-numbered elements give the code
2086                     # points that begin ranges that match the property;
2087                     # odd-numbered give ones that begin ranges that don't match.
2088                     # If $i is odd, we are at the first code point above 255 that
2089                     # doesn't match, which means the range it is ending does
2090                     # match, and crosses the 255/256 boundary.  We want to include
2091                     # this ending point, so increment $i, so the splice below
2092                     # includes it.  Conversely, if $i is even, it is the first
2093                     # code point above 255 that matches, which means there was no
2094                     # matching range that crossed the boundary, and we don't want
2095                     # to include this code point, so splice before it.
2096                     $i++ if $i % 2 != 0;
2097
2098                     # Remove everything past this.
2099                     splice @invlist, $i;
2100                     splice @invmap, $i if @invmap;
2101                     last;
2102                 }
2103             }
2104         }
2105         elsif ($nonl1_only) {
2106             my $found_nonl1 = 0;
2107             for my $i (0 .. @invlist - 1 - 1) {
2108                 next if $invlist[$i] < 256;
2109
2110                 # Here, we have the first element in the array that indicates an
2111                 # element above Latin1.  Get rid of all previous ones.
2112                 splice @invlist, 0, $i;
2113                 splice @invmap, 0, $i if @invmap;
2114
2115                 # If this one's index is not divisible by 2, it means that this
2116                 # element is inverting away from being in the list, which means
2117                 # all code points from 256 to this one are in this list (or
2118                 # map to the default for inversion maps)
2119                 if ($i % 2 != 0) {
2120                     unshift @invlist, 256;
2121                     unshift @invmap, $map_default if @invmap;
2122                 }
2123                 $found_nonl1 = 1;
2124                 last;
2125             }
2126             die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
2127         }
2128
2129         output_invlist($prop_name, \@invlist, $charset);
2130         output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
2131     }
2132     end_file_pound_if;
2133     print $out_fh "\n" . get_conditional_compile_line_end();
2134 }
2135
     # Boundary-pair tables.  The generator calls below are bracketed by
     # switch_pound_if() and end_file_pound_if (both defined outside this
     # section), so their output presumably lands in a preprocessor section
     # named 'Boundary_pair_tables' that is only compiled when
     # PERL_IN_REGEXEC_C is defined -- confirm against switch_pound_if().
2136 switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2137
     # One generator call per table, in this order: GCB, LB, then WB.
2138 output_GCB_table();
2139 output_LB_table();
2140 output_WB_table();
2141
     # Presumably closes the section opened by switch_pound_if() above.
2142 end_file_pound_if;
2143
     # Build the list of source files handed to
     # read_only_bottom_close_and_rename() at the very end.  $sources_list
     # names the file that is read below; @sources starts out as this script
     # itself ($0) plus three fixed files, and is extended by the block that
     # follows.
2144 my $sources_list = "lib/unicore/mktables.lst";
2145 my @sources = ($0, qw(lib/unicore/mktables
2146                       lib/Unicode/UCD.pm
2147                       regen/charset_translations.pl
2148                       ));
2149 {
2150     # Depend on mktables' own sources.  It's a shorter list of files than
2151     # those that Unicode::UCD uses.
         #
         # The lexical handle $mktables_list is never closed explicitly; it is
         # scoped to this if/else and is closed when it goes out of scope.
2152     if (! open my $mktables_list, '<', $sources_list) {
2153
2154           # The list file could not be opened, so depend on the list file
               # itself instead.  This should force a rebuild once
               # $sources_list exists
2155           push @sources, $sources_list;
2156     }
2157     else {
             # Each line is read into $_.  Reading stops at the first line
             # that contains '==='.  Every earlier line that, once chomped, is
             # non-empty and does not begin with '#' is pushed onto @sources
             # with a 'lib/unicore/' prefix.
2158         while(<$mktables_list>) {
2159             last if /===/;
2160             chomp;
2161             push @sources, "lib/unicore/$_" if /^[^#]/;
2162         }
2163     }
2164 }
2165
     # Finish the output file, passing along the dependency list built above.
     # read_only_bottom_close_and_rename() is not defined in this section
     # (presumably it comes from regen/regen_lib.pl, which is required at the
     # top of the file -- confirm).
2166 read_only_bottom_close_and_rename($out_fh, \@sources);