regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_invlist prop_invmap);
   6 require 'regen/regen_lib.pl';
   7 require 'regen/charset_translations.pl';
   8
   9 # This program outputs charclass_invlists.h, which contains various inversion
  10 # lists in the form of C arrays that are to be used as-is for inversion lists.
  11 # Thus, the lists it contains are essentially pre-compiled, and need only a
  12 # light-weight fast wrapper to make them usable at run-time.
  13
  14 # As such, this code knows about the internal structure of these lists, and
  15 # any change made to that has to be done here as well.  A random number stored
  16 # in the headers is used to minimize the possibility of things getting
  17 # out-of-sync, or the wrong data structure being passed.  Currently that
  18 # random number is:
  19 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  20
  21 my $out_fh = open_new('charclass_invlists.h', '>',
  22                       {style => '*', by => $0,
  23                       from => "Unicode::UCD"});
  24
  25 my $is_in_ifndef_ext_re = 0;
  26
  27 print $out_fh "/* See the generating file for comments */\n\n";
  28
  29 my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
  30
  31 sub end_ifndef_ext_re {
  32     if ($is_in_ifndef_ext_re) {
  33         print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
  34         $is_in_ifndef_ext_re = 0;
  35     }
  36 }
  37
  38 sub output_invlist ($$;$) {
  39     my $name = shift;
  40     my $invlist = shift;     # Reference to inversion list array
  41     my $charset = shift // "";  # name of character set for comment
  42
  43     die "No inversion list for $name" unless defined $invlist
  44                                              && ref $invlist eq 'ARRAY'
  45                                              && @$invlist;
  46
  47     # Output the inversion list $invlist using the name $name for it.
  48     # It is output in the exact internal form for inversion lists.
  49
  50     # Is the last element of the header 0, or 1 ?
  51     my $zero_or_one = 0;
  52     if ($invlist->[0] != 0) {
  53         unshift @$invlist, 0;
  54         $zero_or_one = 1;
  55     }
  56     my $count = @$invlist;
  57
  58     if ($is_in_ifndef_ext_re) {
  59         if (exists $include_in_ext_re{$name}) {
  60             end_ifndef_ext_re;
  61         }
  62     }
  63     elsif (! exists $include_in_ext_re{$name}) {
  64         print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
  65         $is_in_ifndef_ext_re = 1;
  66     }
  67
  68     print $out_fh "\nstatic const UV ${name}_invlist[] = {";
  69     print $out_fh " /* for $charset */" if $charset;
  70     print $out_fh "\n";
  71
  72     print $out_fh "\t$count,\t/* Number of elements */\n";
  73     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
  74     print $out_fh "\t", $zero_or_one,
  75                   ",\t/* 0 if the list starts at 0;",
  76                   "\n\t\t   1 if it starts at the element beyond 0 */\n";
  77
  78     # The main body are the UVs passed in to this routine.  Do the final
  79     # element separately
  80     for my $i (0 .. @$invlist - 1) {
  81         printf $out_fh "\t0x%X", $invlist->[$i];
  82         print $out_fh "," if $i < @$invlist - 1;
  83         print $out_fh "\n";
  84     }
  85
  86     print $out_fh "};\n";
  87 }
  88
  89 sub mk_invlist_from_sorted_cp_list {
  90
  91     # Returns an inversion list constructed from the sorted input array of
  92     # code points
  93
  94     my $list_ref = shift;
  95
  96     # Initialize to just the first element
  97     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
  98
  99     # For each succeeding element, if it extends the previous range, adjust
 100     # up, otherwise add it.
 101     for my $i (1 .. @$list_ref - 1) {
 102         if ($invlist[-1] == $list_ref->[$i]) {
 103             $invlist[-1]++;
 104         }
 105         else {
 106             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
 107         }
 108     }
 109     return @invlist;
 110 }
 111
 112 # Read in the Case Folding rules, and construct arrays of code points for the
 113 # properties we need.
 114 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
 115 die "Could not find inversion map for Case_Folding" unless defined $format;
 116 die "Incorrect format '$format' for Case_Folding inversion map"
 117                                                     unless $format eq 'al';
 118 my @has_multi_char_fold;
 119 my @is_non_final_fold;
 120
 121 for my $i (0 .. @$folds_ref - 1) {
 122     next unless ref $folds_ref->[$i];   # Skip single-char folds
 123     push @has_multi_char_fold, $cp_ref->[$i];
 124
 125     # Add to the non-finals list each code point that is in a non-final
 126     # position
 127     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 128         push @is_non_final_fold, $folds_ref->[$i][$j]
 129                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 130     }
 131 }
 132
 133 sub _Perl_Non_Final_Folds {
 134     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 135     return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
 136 }
 137
 138 sub UpperLatin1 {
 139     return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
 140 }
 141
 142 output_invlist("Latin1", [ 0, 256 ]);
 143 output_invlist("AboveLatin1", [ 256 ]);
 144
 145 end_ifndef_ext_re;
 146
 147 # We construct lists for all the POSIX and backslash sequence character
 148 # classes in two forms:
 149 #   1) ones which match only in the ASCII range
 150 #   2) ones which match either in the Latin1 range, or the entire Unicode range
 151 #
 152 # These get compiled in, and hence affect the memory footprint of every Perl
 153 # program, even those not using Unicode.  To minimize the size, currently
 154 # the Latin1 version is generated for the beyond ASCII range except for those
 155 # lists that are quite small for the entire range, such as for \s, which is 22
 156 # UVs long plus 4 UVs (currently) for the header.
 157 #
 158 # To save even more memory, the ASCII versions could be derived from the
 159 # larger ones at runtime, saving some memory (minus the expense of the machine
 160 # instructions to do so), but these are all small anyway, so their total is
 161 # about 100 UVs.
 162 #
 163 # In the list of properties below that get generated, the L1 prefix is a fake
 164 # property that means just the Latin1 range of the full property (whose name
 165 # has an X prefix instead of L1).
 166 #
 167 # An initial & means to use the subroutine from this file instead of an
 168 # official inversion list.
 169
 170 for my $charset (get_supported_code_pages()) {
 171     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 172
 173     my @a2n = @{get_a2n($charset)};
 174                              # Ignore non-alpha in sort
 175     for my $prop (sort {     lc ($a =~ s/[[:^alpha:]]//gr)
 176                          cmp lc ($b =~ s/[[:^alpha:]]//gr)
 177                        } qw(
 178                              ASCII
 179                              Cased
 180                              VertSpace
 181                              XPerlSpace
 182                              XPosixAlnum
 183                              XPosixAlpha
 184                              XPosixBlank
 185                              XPosixCntrl
 186                              XPosixDigit
 187                              XPosixGraph
 188                              XPosixLower
 189                              XPosixPrint
 190                              XPosixPunct
 191                              XPosixSpace
 192                              XPosixUpper
 193                              XPosixWord
 194                              XPosixXDigit
 195                              _Perl_Any_Folds
 196                              &NonL1_Perl_Non_Final_Folds
 197                              _Perl_Folds_To_Multi_Char
 198                              &UpperLatin1
 199                              _Perl_IDStart
 200                              _Perl_IDCont
 201                            )
 202     ) {
 203
 204         # For the Latin1 properties, we change to use the eXtended version of the
 205         # base property, then go through the result and get rid of everything not
 206         # in Latin1 (above 255).  Actually, we retain the element for the range
 207         # that crosses the 255/256 boundary if it is one that matches the
 208         # property.  For example, in the Word property, there is a range of code
 209         # points that start at U+00F8 and goes through U+02C1.  Instead of
 210         # artificially cutting that off at 256 because 256 is the first code point
 211         # above Latin1, we let the range go to its natural ending.  That gives us
 212         # extra information with no added space taken.  But if the range that
 213         # crosses the boundary is one that doesn't match the property, we don't
 214         # start a new range above 255, as that could be construed as going to
 215         # infinity.  For example, the Upper property doesn't include the character
 216         # at 255, but does include the one at 256.  We don't include the 256 one.
 217         my $prop_name = $prop;
 218         my $is_local_sub = $prop_name =~ s/^&//;
 219         my $lookup_prop = $prop_name;
 220         my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
 221                        or $lookup_prop =~ s/^L1//);
 222         my $nonl1_only = 0;
 223         $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
 224
 225         my @invlist;
 226         if ($is_local_sub) {
 227             @invlist = eval $lookup_prop;
 228         }
 229         else {
 230             @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
 231         }
 232         die "Could not find inversion list for '$lookup_prop'" unless @invlist;
 233
 234         # Re-order the Unicode code points to native ones for this platform;
 235         # only needed for code points below 256, and only if the first range
 236         # doesn't span the whole of 0..256 (256 not 255 because a re-ordering
 237         # could cause 256 to need to be in the same range as 255.)
 238         if (! $nonl1_only || ($invlist[0] < 256
 239                               && ! ($invlist[0] == 0 && $invlist[1] > 256)))
 240         {
 241
 242             # Look at all the ranges that start before 257.
 243             my @latin1_list;
 244             while (@invlist) {
 245                 last if $invlist[0] > 256;
 246                 my $upper = @invlist > 1
 247                             ? $invlist[1] - 1      # In range
 248
 249                               # To infinity.  You may want to stop much much
 250                               # earlier; going this high may expose perl
 251                               # deficiencies with very large numbers.
 252                             : $Unicode::UCD::MAX_CP;
 253                 for my $j ($invlist[0] .. $upper) {
 254                     if ($j < 256) {
 255                         push @latin1_list, $a2n[$j];
 256                     }
 257                     else {
 258                         push @latin1_list, $j;
 259                     }
 260                 }
 261
 262                 shift @invlist; # Shift off the range that's in the list
 263                 shift @invlist; # Shift off the range not in the list
 264             }
 265
 266             # Here @invlist contains all the ranges in the original that start
 267             # at code points above 256, and @latin1_list contains all the
 268             # native code points for ranges that start with a Unicode code
 269             # point below 257.  We sort the latter and convert it to inversion
 270             # list format.  Then simply prepend it to the list of the higher
 271             # code points.
 272             @latin1_list = sort { $a <=> $b } @latin1_list;
 273             @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
 274             unshift @invlist, @latin1_list;
 275         }
 276
 277         if ($l1_only) {
 278             for my $i (0 .. @invlist - 1 - 1) {
 279                 if ($invlist[$i] > 255) {
 280
 281                     # In an inversion list, even-numbered elements give the code
 282                     # points that begin ranges that match the property;
 283                     # odd-numbered give ones that begin ranges that don't match.
 284                     # If $i is odd, we are at the first code point above 255 that
 285                     # doesn't match, which means the range it is ending does
 286                     # match, and crosses the 255/256 boundary.  We want to include
 287                     # this ending point, so increment $i, so the splice below
 288                     # includes it.  Conversely, if $i is even, it is the first
 289                     # code point above 255 that matches, which means there was no
 290                     # matching range that crossed the boundary, and we don't want
 291                     # to include this code point, so splice before it.
 292                     $i++ if $i % 2 != 0;
 293
 294                     # Remove everything past this.
 295                     splice @invlist, $i;
 296                     last;
 297                 }
 298             }
 299         }
 300         elsif ($nonl1_only) {
 301             my $found_nonl1 = 0;
 302             for my $i (0 .. @invlist - 1 - 1) {
 303                 next if $invlist[$i] < 256;
 304
 305                 # Here, we have the first element in the array that indicates an
 306                 # element above Latin1.  Get rid of all previous ones.
 307                 splice @invlist, 0, $i;
 308
 309                 # If this one's index is not divisible by 2, it means that this
 310                 # element is inverting away from being in the list, which means
 311                 # all code points from 256 to this one are in this list.
 312                 unshift @invlist, 256 if $i % 2 != 0;
 313                 $found_nonl1 = 1;
 314                 last;
 315             }
 316             die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
 317         }
 318
 319         output_invlist($prop_name, \@invlist, $charset);
 320     }
 321     end_ifndef_ext_re;
 322     print $out_fh "\n" . get_conditional_compile_line_end();
 323 }
 324
 325 my @sources = ($0, "lib/Unicode/UCD.pm");
 326 {
 327     # Depend on mktables’ own sources.  It’s a shorter list of files than
 328     # those that Unicode::UCD uses.
 329     open my $mktables_list, "lib/unicore/mktables.lst"
 330         or die "$0 cannot open lib/unicore/mktables.lst: $!";
 331     while(<$mktables_list>) {
 332         last if /===/;
 333         chomp;
 334         push @sources, "lib/unicore/$_" if /^[^#]/;
 335     }
 336 }
 337 read_only_bottom_close_and_rename($out_fh, \@sources)