regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD qw(prop_invlist prop_invmap);
   6 require 'regen/regen_lib.pl';
   7 require 'regen/charset_translations.pl';
   8
   9 # This program outputs charclass_invlists.h, which contains various inversion
  10 # lists in the form of C arrays that are to be used as-is for inversion lists.
  11 # Thus, the lists it contains are essentially pre-compiled, and need only a
  12 # light-weight fast wrapper to make them usable at run-time.
  13
  14 # As such, this code knows about the internal structure of these lists, and
  15 # any change made to that has to be done here as well.  A random number stored
  16 # in the headers is used to minimize the possibility of things getting
  17 # out-of-sync, or the wrong data structure being passed.  Currently that
  18 # random number is:
  19 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  20
  21 my $out_fh = open_new('charclass_invlists.h', '>',
  22                       {style => '*', by => $0,
  23                       from => "Unicode::UCD"});
  24
  25 my $is_in_ifndef_ext_re = 0;
  26
  27 print $out_fh "/* See the generating file for comments */\n\n";
  28
  29 my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
  30
  31 sub end_ifndef_ext_re {
  32     if ($is_in_ifndef_ext_re) {
  33         print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
  34         $is_in_ifndef_ext_re = 0;
  35     }
  36 }
  37
  38 sub output_invlist ($$;$) {
  39     my $name = shift;
  40     my $invlist = shift;     # Reference to inversion list array
  41     my $charset = shift // "";  # name of character set for comment
  42
  43     die "No inversion list for $name" unless defined $invlist
  44                                              && ref $invlist eq 'ARRAY'
  45                                              && @$invlist;
  46
  47     # Output the inversion list $invlist using the name $name for it.
  48     # It is output in the exact internal form for inversion lists.
  49
  50     # Is the last element of the header 0, or 1 ?
  51     my $zero_or_one = 0;
  52     if ($invlist->[0] != 0) {
  53         unshift @$invlist, 0;
  54         $zero_or_one = 1;
  55     }
  56     my $count = @$invlist;
  57
  58     if ($is_in_ifndef_ext_re) {
  59         if (exists $include_in_ext_re{$name}) {
  60             end_ifndef_ext_re;
  61         }
  62     }
  63     elsif (! exists $include_in_ext_re{$name}) {
  64         print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
  65         $is_in_ifndef_ext_re = 1;
  66     }
  67
  68     print $out_fh "\nstatic const UV ${name}_invlist[] = {";
  69     print $out_fh " /* for $charset */" if $charset;
  70     print $out_fh "\n";
  71
  72     print $out_fh "\t$count,\t/* Number of elements */\n";
  73     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
  74     print $out_fh "\t", $zero_or_one,
  75                   ",\t/* 0 if the list starts at 0;",
  76                   "\n\t\t   1 if it starts at the element beyond 0 */\n";
  77
  78     # The main body are the UVs passed in to this routine.  Do the final
  79     # element separately
  80     for my $i (0 .. @$invlist - 1 - 1) {
  81         print $out_fh "\t$invlist->[$i],\n";
  82     }
  83
  84     # The final element does not have a trailing comma, as C can't handle it.
  85     print $out_fh "\t$invlist->[-1]\n";
  86
  87     print $out_fh "};\n";
  88 }
  89
  90 sub mk_invlist_from_cp_list {
  91
  92     # Returns an inversion list constructed from the sorted input array of
  93     # code points
  94
  95     my $list_ref = shift;
  96
  97     # Initialize to just the first element
  98     my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
  99
 100     # For each succeeding element, if it extends the previous range, adjust
 101     # up, otherwise add it.
 102     for my $i (1 .. @$list_ref - 1) {
 103         if ($invlist[-1] == $list_ref->[$i]) {
 104             $invlist[-1]++;
 105         }
 106         else {
 107             push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
 108         }
 109     }
 110     return @invlist;
 111 }
 112
 113 # Read in the Case Folding rules, and construct arrays of code points for the
 114 # properties we need.
 115 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
 116 die "Could not find inversion map for Case_Folding" unless defined $format;
 117 die "Incorrect format '$format' for Case_Folding inversion map"
 118                                                     unless $format eq 'al';
 119 my @has_multi_char_fold;
 120 my @is_non_final_fold;
 121
 122 for my $i (0 .. @$folds_ref - 1) {
 123     next unless ref $folds_ref->[$i];   # Skip single-char folds
 124     push @has_multi_char_fold, $cp_ref->[$i];
 125
 126     # Add to the non-finals list each code point that is in a non-final
 127     # position
 128     for my $j (0 .. @{$folds_ref->[$i]} - 2) {
 129         push @is_non_final_fold, $folds_ref->[$i][$j]
 130                 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
 131     }
 132 }
 133
 134 sub _Perl_Non_Final_Folds {
 135     @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
 136     return mk_invlist_from_cp_list(\@is_non_final_fold);
 137 }
 138
 139 sub UpperLatin1 {
 140     return mk_invlist_from_cp_list([ 128 .. 255 ]);
 141 }
 142
 143 output_invlist("Latin1", [ 0, 256 ]);
 144 output_invlist("AboveLatin1", [ 256 ]);
 145
 146 end_ifndef_ext_re;
 147
 148 # We construct lists for all the POSIX and backslash sequence character
 149 # classes in two forms:
 150 #   1) ones which match only in the ASCII range
 151 #   2) ones which match either in the Latin1 range, or the entire Unicode range
 152 #
 153 # These get compiled in, and hence affect the memory footprint of every Perl
 154 # program, even those not using Unicode.  To minimize the size, currently
 155 # the Latin1 version is generated for the beyond ASCII range except for those
 156 # lists that are quite small for the entire range, such as for \s, which is 22
 157 # UVs long plus 4 UVs (currently) for the header.
 158 #
 159 # To save even more memory, the ASCII versions could be derived from the
 160 # larger ones at runtime, saving some memory (minus the expense of the machine
 161 # instructions to do so), but these are all small anyway, so their total is
 162 # about 100 UVs.
 163 #
 164 # In the list of properties below that get generated, the L1 prefix is a fake
 165 # property that means just the Latin1 range of the full property (whose name
 166 # has an X prefix instead of L1).
 167 #
 168 # An initial & means to use the subroutine from this file instead of an
 169 # official inversion list.
 170
 171 for my $charset (get_supported_code_pages()) {
 172     print $out_fh "\n" . get_conditional_compile_line_start($charset);
 173
 174     my @a2n = get_a2n($charset);
 175     for my $prop (qw(
 176                     ASCII
 177                     Cased
 178                     VertSpace
 179                     XPerlSpace
 180                     XPosixAlnum
 181                     XPosixAlpha
 182                     XPosixBlank
 183                     XPosixCntrl
 184                     XPosixDigit
 185                     XPosixGraph
 186                     XPosixLower
 187                     XPosixPrint
 188                     XPosixPunct
 189                     XPosixSpace
 190                     XPosixUpper
 191                     XPosixWord
 192                     XPosixXDigit
 193                     _Perl_Any_Folds
 194                     &NonL1_Perl_Non_Final_Folds
 195                     _Perl_Folds_To_Multi_Char
 196                     &UpperLatin1
 197                     _Perl_IDStart
 198                     _Perl_IDCont
 199         )
 200     ) {
 201
 202         # For the Latin1 properties, we change to use the eXtended version of the
 203         # base property, then go through the result and get rid of everything not
 204         # in Latin1 (above 255).  Actually, we retain the element for the range
 205         # that crosses the 255/256 boundary if it is one that matches the
 206         # property.  For example, in the Word property, there is a range of code
 207         # points that start at U+00F8 and goes through U+02C1.  Instead of
 208         # artificially cutting that off at 256 because 256 is the first code point
 209         # above Latin1, we let the range go to its natural ending.  That gives us
 210         # extra information with no added space taken.  But if the range that
 211         # crosses the boundary is one that doesn't match the property, we don't
 212         # start a new range above 255, as that could be construed as going to
 213         # infinity.  For example, the Upper property doesn't include the character
 214         # at 255, but does include the one at 256.  We don't include the 256 one.
 215         my $prop_name = $prop;
 216         my $is_local_sub = $prop_name =~ s/^&//;
 217         my $lookup_prop = $prop_name;
 218         my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
 219                        or $lookup_prop =~ s/^L1//);
 220         my $nonl1_only = 0;
 221         $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
 222
 223         my @invlist;
 224         if ($is_local_sub) {
 225             @invlist = eval $lookup_prop;
 226         }
 227         else {
 228             @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
 229         }
 230         die "Could not find inversion list for '$lookup_prop'" unless @invlist;
 231         my @full_list;
 232         for (my $i = 0; $i < @invlist; $i += 2) {
 233             my $upper = ($i + 1) < @invlist
 234                         ? $invlist[$i+1] - 1      # In range
 235                         : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
 236                                                 # to stop much much earlier;
 237                                                 # going this high may expose
 238                                                 # perl deficiencies with very
 239                                                 # large numbers.
 240             for my $j ($invlist[$i] .. $upper) {
 241                 if ($j < 256) {
 242                     push @full_list, $a2n[$j];
 243                 }
 244                 else {
 245                     push @full_list, $j;
 246                 }
 247             }
 248         }
 249         @full_list = sort { $a <=> $b } @full_list;
 250         @invlist = mk_invlist_from_cp_list(\@full_list);
 251
 252         if ($l1_only) {
 253             for my $i (0 .. @invlist - 1 - 1) {
 254                 if ($invlist[$i] > 255) {
 255
 256                     # In an inversion list, even-numbered elements give the code
 257                     # points that begin ranges that match the property;
 258                     # odd-numbered give ones that begin ranges that don't match.
 259                     # If $i is odd, we are at the first code point above 255 that
 260                     # doesn't match, which means the range it is ending does
 261                     # match, and crosses the 255/256 boundary.  We want to include
 262                     # this ending point, so increment $i, so the splice below
 263                     # includes it.  Conversely, if $i is even, it is the first
 264                     # code point above 255 that matches, which means there was no
 265                     # matching range that crossed the boundary, and we don't want
 266                     # to include this code point, so splice before it.
 267                     $i++ if $i % 2 != 0;
 268
 269                     # Remove everything past this.
 270                     splice @invlist, $i;
 271                     last;
 272                 }
 273             }
 274         }
 275         elsif ($nonl1_only) {
 276             my $found_nonl1 = 0;
 277             for my $i (0 .. @invlist - 1 - 1) {
 278                 next if $invlist[$i] < 256;
 279
 280                 # Here, we have the first element in the array that indicates an
 281                 # element above Latin1.  Get rid of all previous ones.
 282                 splice @invlist, 0, $i;
 283
 284                 # If this one's index is not divisible by 2, it means that this
 285                 # element is inverting away from being in the list, which means
 286                 # all code points from 256 to this one are in this list.
 287                 unshift @invlist, 256 if $i % 2 != 0;
 288                 $found_nonl1 = 1;
 289                 last;
 290             }
 291             die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
 292         }
 293
 294         output_invlist($prop_name, \@invlist, $charset);
 295     }
 296     end_ifndef_ext_re;
 297     print $out_fh "\n" . get_conditional_compile_line_end();
 298 }
 299
 300 read_only_bottom_close_and_rename($out_fh)