regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_invlist prop_invmap);
   6 require 'regen/regen_lib.pl';
   7
   8 # This program outputs charclass_invlists.h, which contains various inversion
   9 # lists in the form of C arrays that are to be used as-is for inversion lists.
  10 # Thus, the lists it contains are essentially pre-compiled, and need only a
  11 # light-weight fast wrapper to make them usable at run-time.
  12
  13 # As such, this code knows about the internal structure of these lists, and
  14 # any change made to that has to be done here as well.  A random number stored
  15 # in the headers is used to minimize the possibility of things getting
  16 # out-of-sync, or the wrong data structure being passed.  Currently that
  17 # random number is:
  18 my $VERSION_DATA_STRUCTURE_TYPE = 1826693541;
  19
  20 my $out_fh = open_new('charclass_invlists.h', '>',
  21                       {style => '*', by => $0,
  22                       from => "Unicode::UCD"});
  23
  24 print $out_fh "/* See the generating file for comments */\n\n";
  25
  26 my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
  27
  28 sub output_invlist ($$) {
  29     my $name = shift;
  30     my $invlist = shift;     # Reference to inversion list array
  31
  32     die "No inversion list for $name" unless defined $invlist
  33                                              && ref $invlist eq 'ARRAY'
  34                                              && @$invlist;
  35
  36     # Output the inversion list $invlist using the name $name for it.
  37     # It is output in the exact internal form for inversion lists.
  38
  39     # Is the last element of the header 0, or 1 ?
  40     my $zero_or_one = 0;
  41     my $count = @$invlist;
  42     if ($invlist->[0] != 0) {
  43         unshift @$invlist, 0;
  44         $zero_or_one = 1;
  45     }
  46
  47     print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
  48     print $out_fh "\nstatic const UV ${name}_invlist[] = {\n";
  49
  50     print $out_fh "\t$count,\t/* Number of elements */\n";
  51     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
  52     print $out_fh "\t", $zero_or_one,
  53                   ",\t/* 0 if the list starts at 0;",
  54                   "\n\t\t   1 if it starts at the element beyond 0 */\n";
  55
  56     # The main body are the UVs passed in to this routine.  Do the final
  57     # element separately
  58     for my $i (0 .. @$invlist - 1 - 1) {
  59         print $out_fh "\t$invlist->[$i],\n";
  60     }
  61
  62     # The final element does not have a trailing comma, as C can't handle it.
  63     print $out_fh "\t$invlist->[-1]\n";
  64
  65     print $out_fh "};\n";
  66     print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
  67
  68 }
  69
  70 sub mk_invlist_from_cp_list {
  71
  72     # Returns an inversion list constructed from the sorted input array of
  73     # code points
  74
  75     my $list_ref = shift;
  76
  77     # Initialize to just the first element
  78     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
  79
  80     # For each succeeding element, if it extends the previous range, adjust
  81     # up, otherwise add it.
  82     for my $i (1 .. @$list_ref - 1) {
  83         if ($invlist[-1] == $list_ref->[$i]) {
  84             $invlist[-1]++;
  85         }
  86         else {
  87             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
  88         }
  89     }
  90     return @invlist;
  91 }
  92
  93 # Read in the Case Folding rules, and construct arrays of code points for the
  94 # properties we need.
  95 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
  96 die "Could not find inversion map for Case_Folding" unless defined $format;
  97 die "Incorrect format '$format' for Case_Folding inversion map"
  98                                                     unless $format eq 'al';
  99 my @has_multi_char_fold;
 100 my @is_non_final_fold;
 101
 102 for my $i (0 .. @$folds_ref - 1) {
 103     next unless ref $folds_ref->[$i];   # Skip single-char folds
 104     push @has_multi_char_fold, $cp_ref->[$i];
 105
 106     # Add to the non-finals list each code point that is in a non-final
 107     # position
 108     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 109         push @is_non_final_fold, $folds_ref->[$i][$j]
 110                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 111     }
 112 }
 113
 114 sub _Perl_Multi_Char_Folds {
 115     @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
 116     return mk_invlist_from_cp_list(\@has_multi_char_fold);
 117 }
 118
 119 sub _Perl_Non_Final_Folds {
 120     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 121     return mk_invlist_from_cp_list(\@is_non_final_fold);
 122 }
 123
 124 output_invlist("Latin1", [ 0, 256 ]);
 125 output_invlist("AboveLatin1", [ 256 ]);
 126
 127 # We construct lists for all the POSIX and backslash sequence character
 128 # classes in two forms:
 129 #   1) ones which match only in the ASCII range
 130 #   2) ones which match either in the Latin1 range, or the entire Unicode range
 131 #
 132 # These get compiled in, and hence affect the memory footprint of every Perl
 133 # program, even those not using Unicode.  To minimize the size, currently
 134 # the Latin1 version is generated for the beyond ASCII range except for those
 135 # lists that are quite small for the entire range, such as for \s, which is 22
 136 # UVs long plus 4 UVs (currently) for the header.
 137 #
 138 # To save even more memory, the ASCII versions could be derived from the
 139 # larger ones at runtime, saving some memory (minus the expense of the machine
 140 # instructions to do so), but these are all small anyway, so their total is
 141 # about 100 UVs.
 142 #
 143 # In the list of properties below that get generated, the L1 prefix is a fake
 144 # property that means just the Latin1 range of the full property (whose name
 145 # has an X prefix instead of L1).
 146 #
 147 # An initial & means to use the subroutine from this file instead of an
 148 # official inversion list.
 149
 150 for my $prop (qw(
 151                 ASCII
 152                 L1Cased
 153                 VertSpace
 154                 PerlSpace
 155                     XPerlSpace
 156                 PosixAlnum
 157                     L1PosixAlnum
 158                 PosixAlpha
 159                     L1PosixAlpha
 160                 PosixBlank
 161                     XPosixBlank
 162                 PosixCntrl
 163                     XPosixCntrl
 164                 PosixDigit
 165                 PosixGraph
 166                     L1PosixGraph
 167                 PosixLower
 168                     L1PosixLower
 169                 PosixPrint
 170                     L1PosixPrint
 171                 PosixPunct
 172                     L1PosixPunct
 173                 PosixSpace
 174                     XPosixSpace
 175                 PosixUpper
 176                     L1PosixUpper
 177                 PosixWord
 178                     L1PosixWord
 179                 PosixXDigit
 180                     XPosixXDigit
 181                 &NonL1_Perl_Non_Final_Folds
 182                 &_Perl_Multi_Char_Folds
 183     )
 184 ) {
 185
 186     # For the Latin1 properties, we change to use the eXtended version of the
 187     # base property, then go through the result and get rid of everything not
 188     # in Latin1 (above 255).  Actually, we retain the element for the range
 189     # that crosses the 255/256 boundary if it is one that matches the
 190     # property.  For example, in the Word property, there is a range of code
 191     # points that start at U+00F8 and goes through U+02C1.  Instead of
 192     # artificially cutting that off at 256 because 256 is the first code point
 193     # above Latin1, we let the range go to its natural ending.  That gives us
 194     # extra information with no added space taken.  But if the range that
 195     # crosses the boundary is one that doesn't match the property, we don't
 196     # start a new range above 255, as that could be construed as going to
 197     # infinity.  For example, the Upper property doesn't include the character
 198     # at 255, but does include the one at 256.  We don't include the 256 one.
 199     my $prop_name = $prop;
 200     my $is_local_sub = $prop_name =~ s/^&//;
 201     my $lookup_prop = $prop_name;
 202     my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
 203     my $nonl1_only = 0;
 204     $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
 205
 206     my @invlist;
 207     if ($is_local_sub) {
 208         @invlist = eval $lookup_prop;
 209     }
 210     else {
 211         @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
 212     }
 213     die "Could not find inversion list for '$lookup_prop'" unless @invlist;
 214
 215     if ($l1_only) {
 216         for my $i (0 .. @invlist - 1 - 1) {
 217             if ($invlist[$i] > 255) {
 218
 219                 # In an inversion list, even-numbered elements give the code
 220                 # points that begin ranges that match the property;
 221                 # odd-numbered give ones that begin ranges that don't match.
 222                 # If $i is odd, we are at the first code point above 255 that
 223                 # doesn't match, which means the range it is ending does
 224                 # match, and crosses the 255/256 boundary.  We want to include
 225                 # this ending point, so increment $i, so the splice below
 226                 # includes it.  Conversely, if $i is even, it is the first
 227                 # code point above 255 that matches, which means there was no
 228                 # matching range that crossed the boundary, and we don't want
 229                 # to include this code point, so splice before it.
 230                 $i++ if $i % 2 != 0;
 231
 232                 # Remove everything past this.
 233                 splice @invlist, $i;
 234                 last;
 235             }
 236         }
 237     }
 238     elsif ($nonl1_only) {
 239         my $found_nonl1 = 0;
 240         for my $i (0 .. @invlist - 1 - 1) {
 241             next if $invlist[$i] < 256;
 242
 243             # Here, we have the first element in the array that indicates an
 244             # element above Latin1.  Get rid of all previous ones.
 245             splice @invlist, 0, $i;
 246
 247             # If this one's index is not divisible by 2, it means that this
 248             # element is inverting away from being in the list, which means
 249             # all code points from 256 to this one are in this list.
 250             unshift @invlist, 256 if $i % 2 != 0;
 251             $found_nonl1 = 1;
 252             last;
 253         }
 254         die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
 255     }
 256
 257     output_invlist($prop_name, \@invlist);
 258 }
 259
 260 read_only_bottom_close_and_rename($out_fh)