regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_invlist prop_invmap);
   6 require 'regen/regen_lib.pl';
   7
   8 # This program outputs charclass_invlists.h, which contains various inversion
   9 # lists in the form of C arrays that are to be used as-is for inversion lists.
  10 # Thus, the lists it contains are essentially pre-compiled, and need only a
  11 # light-weight fast wrapper to make them usable at run-time.
  12
  13 # As such, this code knows about the internal structure of these lists, and
  14 # any change made to that has to be done here as well.  A random number stored
  15 # in the headers is used to minimize the possibility of things getting
  16 # out-of-sync, or the wrong data structure being passed.  Currently that
  17 # random number is:
  18 my $VERSION_DATA_STRUCTURE_TYPE = 290655244;
  19
  20 my $out_fh = open_new('charclass_invlists.h', '>',
  21                       {style => '*', by => $0,
  22                       from => "Unicode::UCD"});
  23
  24 print $out_fh "/* See the generating file for comments */\n\n";
  25
  26 my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
  27
  28 sub output_invlist ($$) {
  29     my $name = shift;
  30     my $invlist = shift;     # Reference to inversion list array
  31
  32     die "No inversion list for $name" unless defined $invlist
  33                                              && ref $invlist eq 'ARRAY'
  34                                              && @$invlist;
  35
  36     # Output the inversion list $invlist using the name $name for it.
  37     # It is output in the exact internal form for inversion lists.
  38
  39     my $zero_or_one;    # Is the last element of the header 0, or 1 ?
  40
  41     # If the first element is 0, it goes in the header, instead of the body
  42     if ($invlist->[0] == 0) {
  43         shift @$invlist;
  44
  45         $zero_or_one = 0;
  46
  47         # Add a dummy 0 at the end so that the length is constant.  inversion
  48         # lists are always stored with enough room so that if they change from
  49         # beginning with 0, they don't have to grow.
  50         push @$invlist, 0;
  51     }
  52     else {
  53         $zero_or_one = 1;
  54     }
  55
  56     print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
  57     print $out_fh "\nstatic UV ${name}_invlist[] = {\n";
  58
  59     print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
  60
  61     # This should be UV_MAX, but I (khw) am not confident that the suffixes
  62     # for specifying the constant are portable, e.g.  'ull' on a 32 bit
  63     # machine that is configured to use 64 bits; might need a Configure probe
  64     print $out_fh "\t0,\t/* Current iteration position */\n";
  65     print $out_fh "\t0,\t/* Cache of previous search index result */\n";
  66     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
  67     print $out_fh "\t", $zero_or_one,
  68                   ",\t/* 0 if this is the first element of the list proper;",
  69                   "\n\t\t   1 if the next element is the first */\n";
  70
  71     # The main body are the UVs passed in to this routine.  Do the final
  72     # element separately
  73     for my $i (0 .. @$invlist - 1 - 1) {
  74         print $out_fh "\t$invlist->[$i],\n";
  75     }
  76
  77     # The final element does not have a trailing comma, as C can't handle it.
  78     print $out_fh "\t$invlist->[-1]\n";
  79
  80     print $out_fh "};\n";
  81     print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
  82
  83 }
  84
  85 sub mk_invlist_from_cp_list {
  86
  87     # Returns an inversion list constructed from the sorted input array of
  88     # code points
  89
  90     my $list_ref = shift;
  91
  92     # Initialize to just the first element
  93     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
  94
  95     # For each succeeding element, if it extends the previous range, adjust
  96     # up, otherwise add it.
  97     for my $i (1 .. @$list_ref - 1) {
  98         if ($invlist[-1] == $list_ref->[$i]) {
  99             $invlist[-1]++;
 100         }
 101         else {
 102             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
 103         }
 104     }
 105     return @invlist;
 106 }
 107
 108 # Read in the Case Folding rules, and construct arrays of code points for the
 109 # properties we need.
 110 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
 111 die "Could not find inversion map for Case_Folding" unless defined $format;
 112 die "Incorrect format '$format' for Case_Folding inversion map"
 113                                                     unless $format eq 'al';
 114 my @has_multi_char_fold;
 115 my @is_non_final_fold;
 116
 117 for my $i (0 .. @$folds_ref - 1) {
 118     next unless ref $folds_ref->[$i];   # Skip single-char folds
 119     push @has_multi_char_fold, $cp_ref->[$i];
 120
 121     # Add to the the non-finals list each code point that is in a non-final
 122     # position
 123     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 124         push @is_non_final_fold, $folds_ref->[$i][$j]
 125                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 126     }
 127 }
 128
 129 sub _Perl_Multi_Char_Folds {
 130     @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
 131     return mk_invlist_from_cp_list(\@has_multi_char_fold);
 132 }
 133
 134 sub _Perl_Non_Final_Folds {
 135     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 136     return mk_invlist_from_cp_list(\@is_non_final_fold);
 137 }
 138
 139 output_invlist("Latin1", [ 0, 256 ]);
 140 output_invlist("AboveLatin1", [ 256 ]);
 141
 142 # We construct lists for all the POSIX and backslash sequence character
 143 # classes in two forms:
 144 #   1) ones which match only in the ASCII range
 145 #   2) ones which match either in the Latin1 range, or the entire Unicode range
 146 #
 147 # These get compiled in, and hence affect the memory footprint of every Perl
 148 # program, even those not using Unicode.  To minimize the size, currently
 149 # the Latin1 version is generated for the beyond ASCII range except for those
 150 # lists that are quite small for the entire range, such as for \s, which is 22
 151 # UVs long plus 4 UVs (currently) for the header.
 152 #
 153 # To save even more memory, the ASCII versions could be derived from the
 154 # larger ones at runtime, saving some memory (minus the expense of the machine
 155 # instructions to do so), but these are all small anyway, so their total is
 156 # about 100 UVs.
 157 #
 158 # In the list of properties below that get generated, the L1 prefix is a fake
 159 # property that means just the Latin1 range of the full property (whose name
 160 # has an X prefix instead of L1).
 161 #
 162 # An initial & means to use the subroutine from this file instead of an
 163 # official inversion list.
 164
 165 for my $prop (qw(
 166                 ASCII
 167                 L1Cased
 168                 VertSpace
 169                 PerlSpace
 170                     XPerlSpace
 171                 PosixAlnum
 172                     L1PosixAlnum
 173                 PosixAlpha
 174                     L1PosixAlpha
 175                 PosixBlank
 176                     XPosixBlank
 177                 PosixCntrl
 178                     XPosixCntrl
 179                 PosixDigit
 180                 PosixGraph
 181                     L1PosixGraph
 182                 PosixLower
 183                     L1PosixLower
 184                 PosixPrint
 185                     L1PosixPrint
 186                 PosixPunct
 187                     L1PosixPunct
 188                 PosixSpace
 189                     XPosixSpace
 190                 PosixUpper
 191                     L1PosixUpper
 192                 PosixWord
 193                     L1PosixWord
 194                 PosixXDigit
 195                     XPosixXDigit
 196                 &NonL1_Perl_Non_Final_Folds
 197                 &_Perl_Multi_Char_Folds
 198     )
 199 ) {
 200
 201     # For the Latin1 properties, we change to use the eXtended version of the
 202     # base property, then go through the result and get rid of everything not
 203     # in Latin1 (above 255).  Actually, we retain the element for the range
 204     # that crosses the 255/256 boundary if it is one that matches the
 205     # property.  For example, in the Word property, there is a range of code
 206     # points that start at U+00F8 and goes through U+02C1.  Instead of
 207     # artifically cutting that off at 256 because 256 is the first code point
 208     # above Latin1, we let the range go to its natural ending.  That gives us
 209     # extra information with no added space taken.  But if the range that
 210     # crosses the boundary is one that doesn't match the property, we don't
 211     # start a new range above 255, as that could be construed as going to
 212     # infinity.  For example, the Upper property doesn't include the character
 213     # at 255, but does include the one at 256.  We don't include the 256 one.
 214     my $prop_name = $prop;
 215     my $is_local_sub = $prop_name =~ s/^&//;
 216     my $lookup_prop = $prop_name;
 217     my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
 218     my $nonl1_only = 0;
 219     $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
 220
 221     my @invlist;
 222     if ($is_local_sub) {
 223         @invlist = eval $lookup_prop;
 224     }
 225     else {
 226         @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
 227     }
 228     die "Could not find inversion list for '$lookup_prop'" unless @invlist;
 229
 230     if ($l1_only) {
 231         for my $i (0 .. @invlist - 1 - 1) {
 232             if ($invlist[$i] > 255) {
 233
 234                 # In an inversion list, even-numbered elements give the code
 235                 # points that begin ranges that match the property;
 236                 # odd-numbered give ones that begin ranges that don't match.
 237                 # If $i is odd, we are at the first code point above 255 that
 238                 # doesn't match, which means the range it is ending does
 239                 # match, and crosses the 255/256 boundary.  We want to include
 240                 # this ending point, so increment $i, so the splice below
 241                 # includes it.  Conversely, if $i is even, it is the first
 242                 # code point above 255 that matches, which means there was no
 243                 # matching range that crossed the boundary, and we don't want
 244                 # to include this code point, so splice before it.
 245                 $i++ if $i % 2 != 0;
 246
 247                 # Remove everything past this.
 248                 splice @invlist, $i;
 249                 last;
 250             }
 251         }
 252     }
 253     elsif ($nonl1_only) {
 254         my $found_nonl1 = 0;
 255         for my $i (0 .. @invlist - 1 - 1) {
 256             next if $invlist[$i] < 256;
 257
 258             # Here, we have the first element in the array that indicates an
 259             # element above Latin1.  Get rid of all previous ones.
 260             splice @invlist, 0, $i;
 261
 262             # If this one's index is not divisible by 2, it means that this
 263             # element is inverting away from being in the list, which means
 264             # all code points from 256 to this one are in this list.
 265             unshift @invlist, 256 if $i % 2 != 0;
 266             $found_nonl1 = 1;
 267             last;
 268         }
 269         die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
 270     }
 271
 272     output_invlist($prop_name, \@invlist);
 273 }
 274
 275 read_only_bottom_close_and_rename($out_fh)