regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_aliases
   6                     prop_values
   7                     prop_value_aliases
   8                     prop_invlist
   9                     prop_invmap search_invlist
  10                    );
  11 require 'regen/regen_lib.pl';
  12 require 'regen/charset_translations.pl';
  13
  14 # This program outputs charclass_invlists.h, which contains various inversion
  15 # lists in the form of C arrays that are to be used as-is for inversion lists.
  16 # Thus, the lists it contains are essentially pre-compiled, and need only a
  17 # light-weight fast wrapper to make them usable at run-time.
  18
  19 # As such, this code knows about the internal structure of these lists, and
  20 # any change made to that has to be done here as well.  A random number stored
  21 # in the headers is used to minimize the possibility of things getting
  22 # out-of-sync, or the wrong data structure being passed.  Currently that
  23 # random number is:
  24
  25 # charclass_invlists.h now also has a partial implementation of inversion
  26 # maps; enough to generate tables for the line break properties, such as GCB
  27
  28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  29
  30 # integer or float
  31 my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
  32
  33 # Matches valid C language enum names: begins with ASCII alphabetic, then any
  34 # ASCII \w
  35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
  36
  37 my $out_fh = open_new('charclass_invlists.h', '>',
  38                       {style => '*', by => $0,
  39                       from => "Unicode::UCD"});
  40
  41 my $in_file_pound_if = 0;
  42
  43 print $out_fh "/* See the generating file for comments */\n\n";
  44
  45 # The symbols generated by this program are all currently defined only in a
  46 # single dot c each.  The code knows where most of them go, but this hash
  47 # gives overrides for the exceptions to the typical place
  48 my %exceptions_to_where_to_define =
  49                         ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
  50                           AboveLatin1                => 'PERL_IN_REGCOMP_C',
  51                           Latin1                     => 'PERL_IN_REGCOMP_C',
  52                           UpperLatin1                => 'PERL_IN_REGCOMP_C',
  53                           _Perl_Any_Folds            => 'PERL_IN_REGCOMP_C',
  54                           _Perl_Folds_To_Multi_Char  => 'PERL_IN_REGCOMP_C',
  55                           _Perl_IDCont               => 'PERL_IN_UTF8_C',
  56                           _Perl_IDStart              => 'PERL_IN_UTF8_C',
  57                         );
  58
  59 # This hash contains the properties with enums that have hard-coded references
  60 # to them in C code.  Its only use is to make sure that if perl is compiled
  61 # with an older Unicode data set, that all the enum values the code is
  62 # expecting will still be in the enum typedef.  Thus the code doesn't have to
  63 # change.  The Unicode version won't have any code points that have these enum
  64 # values, so the code that handles them will not get exercised.  This is far
  65 # better than having to #ifdef things.
  66 my %hard_coded_enums =
  67  ( gcb => [
  68             'Control',
  69             'CR',
  70             'Extend',
  71             'L',
  72             'LF',
  73             'LV',
  74             'LVT',
  75             'Other',
  76             'Prepend',
  77             'Regional_Indicator',
  78             'SpacingMark',
  79             'T',
  80             'V',
  81         ],
  82    sb  => [
  83             'ATerm',
  84             'Close',
  85             'CR',
  86             'Extend',
  87             'Format',
  88             'LF',
  89             'Lower',
  90             'Numeric',
  91             'OLetter',
  92             'Other',
  93             'SContinue',
  94             'Sep',
  95             'Sp',
  96             'STerm',
  97             'Upper',
  98         ],
  99    wb  => [
 100             'ALetter',
 101             'CR',
 102             'Double_Quote',
 103             'Extend',
 104             'ExtendNumLet',
 105             'Format',
 106             'Hebrew_Letter',
 107             'Katakana',
 108             'LF',
 109             'MidLetter',
 110             'MidNum',
 111             'MidNumLet',
 112             'Newline',
 113             'Numeric',
 114             'Other',
 115             'Regional_Indicator',
 116             'Single_Quote',
 117             'UNKNOWN',
 118         ],
 119 );
 120
 121 my @a2n;
 122
 123 sub uniques {
 124     # Returns non-duplicated input values.  From "Perl Best Practices:
 125     # Encapsulated Cleverness".  p. 455 in first edition.
 126
 127     my %seen;
 128     return grep { ! $seen{$_}++ } @_;
 129 }
 130
 131 sub a2n($) {
 132     my $cp = shift;
 133
 134     # Returns the input Unicode code point translated to native.
 135
 136     return $cp if $cp !~ $numeric_re || $cp > 255;
 137     return $a2n[$cp];
 138 }
 139
 140 sub end_file_pound_if {
 141     if ($in_file_pound_if) {
 142         print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
 143         $in_file_pound_if = 0;
 144     }
 145 }
 146
 147 sub switch_pound_if ($$) {
 148     my $name = shift;
 149     my $new_pound_if = shift;
 150
 151     # Switch to new #if given by the 2nd argument.  If there is an override
 152     # for this, it instead switches to that.  The 1st argument is the
 153     # static's name, used to look up the overrides
 154
 155     if (exists $exceptions_to_where_to_define{$name}) {
 156         $new_pound_if = $exceptions_to_where_to_define{$name};
 157     }
 158
 159     # Exit current #if if the new one is different from the old
 160     if ($in_file_pound_if
 161         && $in_file_pound_if !~ /$new_pound_if/)
 162     {
 163         end_file_pound_if;
 164     }
 165
 166     # Enter new #if, if not already in it.
 167     if (! $in_file_pound_if) {
 168         $in_file_pound_if = "defined($new_pound_if)";
 169         print $out_fh "\n#if $in_file_pound_if\n";
 170     }
 171 }
 172
 173 sub output_invlist ($$;$) {
 174     my $name = shift;
 175     my $invlist = shift;     # Reference to inversion list array
 176     my $charset = shift // "";  # name of character set for comment
 177
 178     die "No inversion list for $name" unless defined $invlist
 179                                              && ref $invlist eq 'ARRAY'
 180                                              && @$invlist;
 181
 182     # Output the inversion list $invlist using the name $name for it.
 183     # It is output in the exact internal form for inversion lists.
 184
 185     # Is the last element of the header 0, or 1 ?
 186     my $zero_or_one = 0;
 187     if ($invlist->[0] != 0) {
 188         unshift @$invlist, 0;
 189         $zero_or_one = 1;
 190     }
 191     my $count = @$invlist;
 192
 193     switch_pound_if ($name, 'PERL_IN_PERL_C');
 194
 195     print $out_fh "\nstatic const UV ${name}_invlist[] = {";
 196     print $out_fh " /* for $charset */" if $charset;
 197     print $out_fh "\n";
 198
 199     print $out_fh "\t$count,\t/* Number of elements */\n";
 200     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
 201     print $out_fh "\t", $zero_or_one,
 202                   ",\t/* 0 if the list starts at 0;",
 203                   "\n\t\t   1 if it starts at the element beyond 0 */\n";
 204
 205     # The main body are the UVs passed in to this routine.  Do the final
 206     # element separately
 207     for my $i (0 .. @$invlist - 1) {
 208         printf $out_fh "\t0x%X", $invlist->[$i];
 209         print $out_fh "," if $i < @$invlist - 1;
 210         print $out_fh "\n";
 211     }
 212
 213     print $out_fh "};\n";
 214 }
 215
 216 sub output_invmap ($$$$$$$) {
 217     my $name = shift;
 218     my $invmap = shift;     # Reference to inversion map array
 219     my $prop_name = shift;
 220     my $input_format = shift;   # The inversion map's format
 221     my $default = shift;        # The property value for code points who
 222                                 # otherwise don't have a value specified.
 223     my $extra_enums = shift;    # comma-separated list of our additions to the
 224                                 # property's standard possible values
 225     my $charset = shift // "";  # name of character set for comment
 226
 227     # Output the inversion map $invmap for property $prop_name, but use $name
 228     # as the actual data structure's name.
 229
 230     my $count = @$invmap;
 231
 232     my $output_format;
 233     my $declaration_type;
 234     my %enums;
 235     my $name_prefix;
 236
 237     if ($input_format eq 's') {
 238         $prop_name = (prop_aliases($prop_name))[1]; # Get full name
 239         my $short_name = (prop_aliases($prop_name))[0];
 240             my @enums = prop_values($prop_name);
 241             if (! @enums) {
 242                 die "Only enum properties are currently handled; '$prop_name' isn't one";
 243             }
 244             else {
 245
 246                 # Convert short names to long
 247                 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
 248
 249                 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
 250                 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
 251                     if @expected_enums < @enums;
 252
 253                 # Remove the enums found in the input from the ones we expect
 254                 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
 255                     splice(@expected_enums, $i, 1)
 256                                 if grep { $expected_enums[$i] eq $_ } @enums;
 257                 }
 258
 259                 # The ones remaining must be because we're using an older
 260                 # Unicode version.  Add them to the list.
 261                 push @enums, @expected_enums;
 262
 263                 # Add in the extra values coded into this program, and sort.
 264                 push @enums, split /,/, $extra_enums if $extra_enums ne "";
 265                 @enums = sort @enums;
 266
 267                 # Assign a value to each element of the enum.  The default
 268                 # value always gets 0; the others are arbitrarily assigned.
 269                 my $enum_val = 0;
 270                 $default = prop_value_aliases($prop_name, $default);
 271                 $enums{$default} = $enum_val++;
 272                 for my $enum (@enums) {
 273                     $enums{$enum} = $enum_val++ unless exists $enums{$enum};
 274                 }
 275             }
 276
 277             # Inversion map stuff is currently used only by regexec
 278             switch_pound_if($name, 'PERL_IN_REGEXEC_C');
 279         {
 280
 281             # The short names tend to be two lower case letters, but it looks
 282             # better for those if they are upper. XXX
 283             $short_name = uc($short_name) if length($short_name) < 3
 284                                              || substr($short_name, 0, 1) =~ /[[:lower:]]/;
 285             $name_prefix = "${short_name}_";
 286             my $enum_count = keys %enums;
 287             print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
 288
 289             print $out_fh "\ntypedef enum {\n";
 290             print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
 291             delete $enums{$default};
 292             foreach my $enum (sort { $a cmp $b } keys %enums) {
 293                 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
 294                 print $out_fh "," if $enums{$enum} < $enum_count - 1;
 295                 print $out_fh  "\n";
 296             }
 297             $declaration_type = "${name_prefix}enum";
 298             print $out_fh "} $declaration_type;\n";
 299
 300             $output_format = "${name_prefix}%s";
 301         }
 302     }
 303     else {
 304         die "'$input_format' invmap() format for '$prop_name' unimplemented";
 305     }
 306
 307     die "No inversion map for $prop_name" unless defined $invmap
 308                                              && ref $invmap eq 'ARRAY'
 309                                              && $count;
 310
 311     print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
 312     print $out_fh " /* for $charset */" if $charset;
 313     print $out_fh "\n";
 314
 315     # The main body are the scalars passed in to this routine.
 316     for my $i (0 .. $count - 1) {
 317         my $element = $invmap->[$i];
 318         $element = $name_prefix . prop_value_aliases($prop_name, $element);
 319         print $out_fh "\t$element";
 320         print $out_fh "," if $i < $count - 1;
 321         print $out_fh  "\n";
 322     }
 323     print $out_fh "};\n";
 324 }
 325
 326 sub mk_invlist_from_sorted_cp_list {
 327
 328     # Returns an inversion list constructed from the sorted input array of
 329     # code points
 330
 331     my $list_ref = shift;
 332
 333     return unless @$list_ref;
 334
 335     # Initialize to just the first element
 336     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
 337
 338     # For each succeeding element, if it extends the previous range, adjust
 339     # up, otherwise add it.
 340     for my $i (1 .. @$list_ref - 1) {
 341         if ($invlist[-1] == $list_ref->[$i]) {
 342             $invlist[-1]++;
 343         }
 344         else {
 345             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
 346         }
 347     }
 348     return @invlist;
 349 }
 350
 351 # Read in the Case Folding rules, and construct arrays of code points for the
 352 # properties we need.
 353 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
 354 die "Could not find inversion map for Case_Folding" unless defined $format;
 355 die "Incorrect format '$format' for Case_Folding inversion map"
 356                                                     unless $format eq 'al';
 357 my @has_multi_char_fold;
 358 my @is_non_final_fold;
 359
 360 for my $i (0 .. @$folds_ref - 1) {
 361     next unless ref $folds_ref->[$i];   # Skip single-char folds
 362     push @has_multi_char_fold, $cp_ref->[$i];
 363
 364     # Add to the non-finals list each code point that is in a non-final
 365     # position
 366     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 367         push @is_non_final_fold, $folds_ref->[$i][$j]
 368                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 369     }
 370 }
 371
 372 sub _Perl_Non_Final_Folds {
 373     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 374     return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
 375 }
 376
 377 sub prop_name_for_cmp ($) { # Sort helper
 378     my $name = shift;
 379
 380     # Returns the input lowercased, with non-alphas removed, as well as
 381     # everything starting with a comma
 382
 383     $name =~ s/,.*//;
 384     $name =~ s/[[:^alpha:]]//g;
 385     return lc $name;
 386 }
 387
 388 sub UpperLatin1 {
 389     return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
 390 }
 391
 392 output_invlist("Latin1", [ 0, 256 ]);
 393 output_invlist("AboveLatin1", [ 256 ]);
 394
 395 end_file_pound_if;
 396
 397 # We construct lists for all the POSIX and backslash sequence character
 398 # classes in two forms:
 399 #   1) ones which match only in the ASCII range
 400 #   2) ones which match either in the Latin1 range, or the entire Unicode range
 401 #
 402 # These get compiled in, and hence affect the memory footprint of every Perl
 403 # program, even those not using Unicode.  To minimize the size, currently
 404 # the Latin1 version is generated for the beyond ASCII range except for those
 405 # lists that are quite small for the entire range, such as for \s, which is 22
 406 # UVs long plus 4 UVs (currently) for the header.
 407 #
 408 # To save even more memory, the ASCII versions could be derived from the
 409 # larger ones at runtime, saving some memory (minus the expense of the machine
 410 # instructions to do so), but these are all small anyway, so their total is
 411 # about 100 UVs.
 412 #
 413 # In the list of properties below that get generated, the L1 prefix is a fake
 414 # property that means just the Latin1 range of the full property (whose name
 415 # has an X prefix instead of L1).
 416 #
 417 # An initial & means to use the subroutine from this file instead of an
 418 # official inversion list.
 419
 420 for my $charset (get_supported_code_pages()) {
 421     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 422
 423     @a2n = @{get_a2n($charset)};
 424     no warnings 'qw';
 425                          # Ignore non-alpha in sort
 426     for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
 427                              ASCII
 428                              Cased
 429                              VertSpace
 430                              XPerlSpace
 431                              XPosixAlnum
 432                              XPosixAlpha
 433                              XPosixBlank
 434                              XPosixCntrl
 435                              XPosixDigit
 436                              XPosixGraph
 437                              XPosixLower
 438                              XPosixPrint
 439                              XPosixPunct
 440                              XPosixSpace
 441                              XPosixUpper
 442                              XPosixWord
 443                              XPosixXDigit
 444                              _Perl_Any_Folds
 445                              &NonL1_Perl_Non_Final_Folds
 446                              _Perl_Folds_To_Multi_Char
 447                              &UpperLatin1
 448                              _Perl_IDStart
 449                              _Perl_IDCont
 450                              Grapheme_Cluster_Break,EDGE
 451                              Word_Break,EDGE,UNKNOWN
 452                              Sentence_Break,EDGE
 453                            )
 454     ) {
 455
 456         # For the Latin1 properties, we change to use the eXtended version of the
 457         # base property, then go through the result and get rid of everything not
 458         # in Latin1 (above 255).  Actually, we retain the element for the range
 459         # that crosses the 255/256 boundary if it is one that matches the
 460         # property.  For example, in the Word property, there is a range of code
 461         # points that start at U+00F8 and goes through U+02C1.  Instead of
 462         # artificially cutting that off at 256 because 256 is the first code point
 463         # above Latin1, we let the range go to its natural ending.  That gives us
 464         # extra information with no added space taken.  But if the range that
 465         # crosses the boundary is one that doesn't match the property, we don't
 466         # start a new range above 255, as that could be construed as going to
 467         # infinity.  For example, the Upper property doesn't include the character
 468         # at 255, but does include the one at 256.  We don't include the 256 one.
 469         my $prop_name = $prop;
 470         my $is_local_sub = $prop_name =~ s/^&//;
 471         my $extra_enums = "";
 472         $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
 473         my $lookup_prop = $prop_name;
 474         my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
 475                        or $lookup_prop =~ s/^L1//);
 476         my $nonl1_only = 0;
 477         $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
 478         ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
 479
 480         my @invlist;
 481         my @invmap;
 482         my $map_format;
 483         my $map_default;
 484         my $maps_to_code_point;
 485         my $to_adjust;
 486         if ($is_local_sub) {
 487             @invlist = eval $lookup_prop;
 488         }
 489         else {
 490             @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
 491             if (! @invlist) {
 492                 my ($list_ref, $map_ref, $format, $default);
 493
 494                 ($list_ref, $map_ref, $format, $default)
 495                           = prop_invmap($lookup_prop, '_perl_core_internal_ok');
 496                 die "Could not find inversion list for '$lookup_prop'" unless $list_ref;
 497                 @invlist = @$list_ref;
 498                 @invmap = @$map_ref;
 499                 $map_format = $format;
 500                 $map_default = $default;
 501                 $maps_to_code_point = $map_format =~ /x/;
 502                 $to_adjust = $map_format =~ /a/;
 503             }
 504         }
 505         die "Could not find inversion list for '$lookup_prop'" unless @invlist;
 506
 507         # Re-order the Unicode code points to native ones for this platform.
 508         # This is only needed for code points below 256, because native code
 509         # points are only in that range.  For inversion maps of properties
 510         # where the mappings are adjusted (format =~ /a/), this reordering
 511         # could mess up the adjustment pattern that was in the input, so that
 512         # has to be dealt with.
 513         #
 514         # And inversion maps that map to code points need to eventually have
 515         # all those code points remapped to native, and it's better to do that
 516         # here, going through the whole list not just those below 256.  This
 517         # is because some inversion maps have adjustments (format =~ /a/)
 518         # which may be affected by the reordering.  This code needs to be done
 519         # both for when we are translating the inversion lists for < 256, and
 520         # for the inversion maps for everything.  By doing both in this loop,
 521         # we can share that code.
 522         #
 523         # So, we go through everything for an inversion map to code points;
 524         # otherwise, we can skip any remapping at all if we are going to
 525         # output only the above-Latin1 values, or if the range spans the whole
 526         # of 0..256, as the remap will also include all of 0..256  (256 not
 527         # 255 because a re-ordering could cause 256 to need to be in the same
 528         # range as 255.)
 529         if ((@invmap && $maps_to_code_point)
 530             || (! $nonl1_only || ($invlist[0] < 256
 531                                   && ! ($invlist[0] == 0 && $invlist[1] > 256))))
 532         {
 533
 534             if (! @invmap) {    # Straight inversion list
 535             # Look at all the ranges that start before 257.
 536             my @latin1_list;
 537             while (@invlist) {
 538                 last if $invlist[0] > 256;
 539                 my $upper = @invlist > 1
 540                             ? $invlist[1] - 1      # In range
 541
 542                               # To infinity.  You may want to stop much much
 543                               # earlier; going this high may expose perl
 544                               # deficiencies with very large numbers.
 545                             : $Unicode::UCD::MAX_CP;
 546                 for my $j ($invlist[0] .. $upper) {
 547                     push @latin1_list, a2n($j);
 548                 }
 549
 550                 shift @invlist; # Shift off the range that's in the list
 551                 shift @invlist; # Shift off the range not in the list
 552             }
 553
 554             # Here @invlist contains all the ranges in the original that start
 555             # at code points above 256, and @latin1_list contains all the
 556             # native code points for ranges that start with a Unicode code
 557             # point below 257.  We sort the latter and convert it to inversion
 558             # list format.  Then simply prepend it to the list of the higher
 559             # code points.
 560             @latin1_list = sort { $a <=> $b } @latin1_list;
 561             @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
 562             unshift @invlist, @latin1_list;
 563             }
 564             else {  # Is an inversion map
 565
 566                 # This is a similar procedure as plain inversion list, but has
 567                 # multiple buckets.  A plain inversion list just has two
 568                 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
 569                 # pretty much can ignore the 2nd bucket, as it is completely
 570                 # defined by the 1st.  But here, what we do is create buckets
 571                 # which contain the code points that map to each, translated
 572                 # to native and turned into an inversion list.  Thus each
 573                 # bucket is an inversion list of native code points that map
 574                 # to it or don't map to it.  We use these to create an
 575                 # inversion map for the whole property.
 576
 577                 # As mentioned earlier, we use this procedure to not just
 578                 # remap the inversion list to native values, but also the maps
 579                 # of code points to native ones.  In the latter case we have
 580                 # to look at the whole of the inversion map (or at least to
 581                 # above Unicode; as the maps of code points above that should
 582                 # all be to the default).
 583                 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
 584
 585                 my %mapped_lists;   # A hash whose keys are the buckets.
 586                 while (@invlist) {
 587                     last if $invlist[0] > $upper_limit;
 588
 589                     # This shouldn't actually happen, as prop_invmap() returns
 590                     # an extra element at the end that is beyond $upper_limit
 591                     die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
 592
 593                     my $bucket;
 594
 595                     # A hash key can't be a ref (we are only expecting arrays
 596                     # of scalars here), so convert any such to a string that
 597                     # will be converted back later (using a vertical tab as
 598                     # the separator).  Even if the mapping is to code points,
 599                     # we don't translate to native here because the code
 600                     # output_map() calls to output these arrays assumes the
 601                     # input is Unicode, not native.
 602                     if (ref $invmap[0]) {
 603                         $bucket = join "\cK", @{$invmap[0]};
 604                     }
 605                     elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
 606
 607                         # Do convert to native for maps to single code points.
 608                         # There are some properties that have a few outlier
 609                         # maps that aren't code points, so the above test
 610                         # skips those.
 611                         $bucket = a2n($invmap[0]);
 612                     } else {
 613                         $bucket = $invmap[0];
 614                     }
 615
 616                     # We now have the bucket that all code points in the range
 617                     # map to, though possibly they need to be adjusted.  Go
 618                     # through the range and put each translated code point in
 619                     # it into its bucket.
 620                     my $base_map = $invmap[0];
 621                     for my $j ($invlist[0] .. $invlist[1] - 1) {
 622                         if ($to_adjust
 623                                # The 1st code point doesn't need adjusting
 624                             && $j > $invlist[0]
 625
 626                                # Skip any non-numeric maps: these are outliers
 627                                # that aren't code points.
 628                             && $base_map =~ $numeric_re
 629
 630                                #  'ne' because the default can be a string
 631                             && $base_map ne $map_default)
 632                         {
 633                             # We adjust, by incrementing each the bucket and
 634                             # the map.  For code point maps, translate to
 635                             # native
 636                             $base_map++;
 637                             $bucket = ($maps_to_code_point)
 638                                       ? a2n($base_map)
 639                                       : $base_map;
 640                         }
 641
 642                         # Add the native code point to the bucket for the
 643                         # current map
 644                         push @{$mapped_lists{$bucket}}, a2n($j);
 645                     } # End of loop through all code points in the range
 646
 647                     # Get ready for the next range
 648                     shift @invlist;
 649                     shift @invmap;
 650                 } # End of loop through all ranges in the map.
 651
 652                 # Here, @invlist and @invmap retain all the ranges from the
 653                 # originals that start with code points above $upper_limit.
 654                 # Each bucket in %mapped_lists contains all the code points
 655                 # that map to that bucket.  If the bucket is for a map to a
 656                 # single code point is a single code point, the bucket has
 657                 # been converted to native.  If something else (including
 658                 # multiple code points), no conversion is done.
 659                 #
 660                 # Now we recreate the inversion map into %xlated, but this
 661                 # time for the native character set.
 662                 my %xlated;
 663                 foreach my $bucket (keys %mapped_lists) {
 664
 665                     # Sort and convert this bucket to an inversion list.  The
 666                     # result will be that ranges that start with even-numbered
 667                     # indexes will be for code points that map to this bucket;
 668                     # odd ones map to some other bucket, and are discarded
 669                     # below.
 670                     @{$mapped_lists{$bucket}}
 671                                     = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
 672                     @{$mapped_lists{$bucket}}
 673                      = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
 674
 675                     # Add each even-numbered range in the bucket to %xlated;
 676                     # so that the keys of %xlated become the range start code
 677                     # points, and the values are their corresponding maps.
 678                     while (@{$mapped_lists{$bucket}}) {
 679                         my $range_start = $mapped_lists{$bucket}->[0];
 680                         if ($bucket =~ /\cK/) {
 681                             @{$xlated{$range_start}} = split /\cK/, $bucket;
 682                         }
 683                         else {
 684                             $xlated{$range_start} = $bucket;
 685                         }
 686                         shift @{$mapped_lists{$bucket}}; # Discard odd ranges
 687                         shift @{$mapped_lists{$bucket}}; # Get ready for next
 688                                                          # iteration
 689                     }
 690                 } # End of loop through all the buckets.
 691
 692                 # Here %xlated's keys are the range starts of all the code
 693                 # points in the inversion map.  Construct an inversion list
 694                 # from them.
 695                 my @new_invlist = sort { $a <=> $b } keys %xlated;
 696
 697                 # If the list is adjusted, we want to munge this list so that
 698                 # we only have one entry for where consecutive code points map
 699                 # to consecutive values.  We just skip the subsequent entries
 700                 # where this is the case.
 701                 if ($to_adjust) {
 702                     my @temp;
 703                     for my $i (0 .. @new_invlist - 1) {
 704                         next if $i > 0
 705                                 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
 706                                 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
 707                                 && $xlated{$new_invlist[$i]} =~ $numeric_re
 708                                 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
 709                         push @temp, $new_invlist[$i];
 710                     }
 711                     @new_invlist = @temp;
 712                 }
 713
 714                 # The inversion map comes from %xlated's values.  We can
 715                 # unshift each onto the front of the untouched portion, in
 716                 # reverse order of the portion we did process.
 717                 foreach my $start (reverse @new_invlist) {
 718                     unshift @invmap, $xlated{$start};
 719                 }
 720
 721                 # Finally prepend the inversion list we have just constructed to the
 722                 # one that contains anything we didn't process.
 723                 unshift @invlist, @new_invlist;
 724             }
 725         }
 726
 727         # prop_invmap() returns an extra final entry, which we can now
 728         # discard.
 729         if (@invmap) {
 730             pop @invlist;
 731             pop @invmap;
 732         }
 733
 734         if ($l1_only) {
 735             die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
 736             for my $i (0 .. @invlist - 1 - 1) {
 737                 if ($invlist[$i] > 255) {
 738
 739                     # In an inversion list, even-numbered elements give the code
 740                     # points that begin ranges that match the property;
 741                     # odd-numbered give ones that begin ranges that don't match.
 742                     # If $i is odd, we are at the first code point above 255 that
 743                     # doesn't match, which means the range it is ending does
 744                     # match, and crosses the 255/256 boundary.  We want to include
 745                     # this ending point, so increment $i, so the splice below
 746                     # includes it.  Conversely, if $i is even, it is the first
 747                     # code point above 255 that matches, which means there was no
 748                     # matching range that crossed the boundary, and we don't want
 749                     # to include this code point, so splice before it.
 750                     $i++ if $i % 2 != 0;
 751
 752                     # Remove everything past this.
 753                     splice @invlist, $i;
 754                     splice @invmap, $i if @invmap;
 755                     last;
 756                 }
 757             }
 758         }
 759         elsif ($nonl1_only) {
 760             my $found_nonl1 = 0;
 761             for my $i (0 .. @invlist - 1 - 1) {
 762                 next if $invlist[$i] < 256;
 763
 764                 # Here, we have the first element in the array that indicates an
 765                 # element above Latin1.  Get rid of all previous ones.
 766                 splice @invlist, 0, $i;
 767                 splice @invmap, 0, $i if @invmap;
 768
 769                 # If this one's index is not divisible by 2, it means that this
 770                 # element is inverting away from being in the list, which means
 771                 # all code points from 256 to this one are in this list (or
 772                 # map to the default for inversion maps)
 773                 if ($i % 2 != 0) {
 774                     unshift @invlist, 256;
 775                     unshift @invmap, $map_default if @invmap;
 776                 }
 777                 $found_nonl1 = 1;
 778                 last;
 779             }
 780             die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
 781         }
 782
 783         output_invlist($prop_name, \@invlist, $charset);
 784         output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
 785     }
 786     end_file_pound_if;
 787     print $out_fh "\n" . get_conditional_compile_line_end();
 788 }
 789
 790 my $sources_list = "lib/unicore/mktables.lst";
 791 my @sources = ($0, qw(lib/unicore/mktables
 792                       lib/Unicode/UCD.pm
 793                       regen/charset_translations.pl
 794                       ));
 795 {
 796     # Depend on mktables’ own sources.  It’s a shorter list of files than
 797     # those that Unicode::UCD uses.
 798     if (! open my $mktables_list, $sources_list) {
 799
 800           # This should force a rebuild once $sources_list exists
 801           push @sources, $sources_list;
 802     }
 803     else {
 804         while(<$mktables_list>) {
 805             last if /===/;
 806             chomp;
 807             push @sources, "lib/unicore/$_" if /^[^#]/;
 808         }
 809     }
 810 }
 811 read_only_bottom_close_and_rename($out_fh, \@sources)