regen/mk_invlists.pl

   1 #!perl -w
   2 use 5.015;
   3 use strict;
   4 use warnings;
   5 use Unicode::UCD "prop_invlist";
   6 require 'regen/regen_lib.pl';
   7
   8 # This program outputs charclass_invlists.h, which contains various inversion
   9 # lists in the form of C arrays that are to be used as-is for inversion lists.
  10 # Thus, the lists it contains are essentially pre-compiled, and need only a
  11 # light-weight fast wrapper to make them usable at run-time.
  12
  13 # As such, this code knows about the internal structure of these lists, and
  14 # any change made to that has to be done here as well.  A random number stored
  15 # in the headers is used to minimize the possibility of things getting
  16 # out-of-sync, or the wrong data structure being passed.  Currently that
  17 # random number is:
  18 my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;
  19
  20 my $out_fh = open_new('charclass_invlists.h', '>',
  21                       {style => '*', by => $0,
  22                       from => "Unicode::UCD"});
  23
  24 print $out_fh "/* See the generating file for comments */\n\n";
  25
  26 sub output_invlist ($$) {
  27     my $name = shift;
  28     my $invlist = shift;     # Reference to inversion list array
  29
  30     # Output the inversion list $invlist using the name $name for it.
  31     # It is output in the exact internal form for inversion lists.
  32
  33     my $zero_or_one;    # Is the last element of the header 0, or 1 ?
  34
  35     # If the first element is 0, it goes in the header, instead of the body
  36     if ($invlist->[0] == 0) {
  37         shift @$invlist;
  38
  39         $zero_or_one = 0;
  40
  41         # Add a dummy 0 at the end so that the length is constant.  inversion
  42         # lists are always stored with enough room so that if they change from
  43         # beginning with 0, they don't have to grow.
  44         push @$invlist, 0;
  45     }
  46     else {
  47         $zero_or_one = 1;
  48     }
  49
  50     print $out_fh "\nUV ${name}_invlist[] = {\n";
  51
  52     print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
  53     print $out_fh "\t0,\t/* Current iteration position */\n";
  54     print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
  55     print $out_fh "\t", $zero_or_one,
  56                   ",\t/* 0 if this is the first element of the list proper;",
  57                   "\n\t\t   1 if the next element is the first */\n";
  58
  59     # The main body are the UVs passed in to this routine.  Do the final
  60     # element separately
  61     for my $i (0 .. @$invlist - 1 - 1) {
  62         print $out_fh "\t$invlist->[$i],\n";
  63     }
  64
  65     # The final element does not have a trailing comma, as C can't handle it.
  66     print $out_fh "\t$invlist->[-1]\n";
  67
  68     print $out_fh "};\n";
  69 }
  70
  71 output_invlist("Latin1", [ 0, 256 ]);
  72 output_invlist("AboveLatin1", [ 256 ]);
  73
  74 # We construct lists for all the POSIX and backslash sequence character
  75 # classes in two forms:
  76 #   1) ones which match only in the ASCII range
  77 #   2) ones which match either in the Latin1 range, or the entire Unicode range
  78 #
  79 # These get compiled in, and hence affect the memory footprint of every Perl
  80 # program, even those not using Unicode.  To minimize the size, currently
  81 # the Latin1 version is generated for the beyond ASCII range except for those
  82 # lists that are quite small for the entire range, such as for \s, which is 22
  83 # UVs long plus 4 UVs (currently) for the header.
  84 #
  85 # To save even more memory, the ASCII versions could be derived from the
  86 # larger ones at runtime, saving some memory (minus the expense of the machine
  87 # instructions to do so), but these are all small anyway, so their total is
  88 # about 100 UVs.
  89 #
  90 # In the list of properties below that get generated, the L1 prefix is a fake
  91 # property that means just the Latin1 range of the full property (whose name
  92 # has an X prefix instead of L1).
  93
  94 for my $prop (qw(
  95                 ASCII
  96                 L1Cased
  97                 VertSpace
  98                 PerlSpace
  99                     XPerlSpace
 100                 PosixAlnum
 101                     L1PosixAlnum
 102                 PosixAlpha
 103                     L1PosixAlpha
 104                 PosixBlank
 105                     XPosixBlank
 106                 PosixCntrl
 107                     XPosixCntrl
 108                 PosixDigit
 109                 PosixGraph
 110                     L1PosixGraph
 111                 PosixLower
 112                     L1PosixLower
 113                 PosixPrint
 114                     L1PosixPrint
 115                 PosixPunct
 116                     L1PosixPunct
 117                 PosixSpace
 118                     XPosixSpace
 119                 PosixUpper
 120                     L1PosixUpper
 121                 PosixWord
 122                     L1PosixWord
 123                 PosixXDigit
 124                     XPosixXDigit
 125     )
 126 ) {
 127
 128     # For the Latin1 properties, we change to use the eXtended version of the
 129     # base property, then go through the result and get rid of everything not
 130     # in Latin1 (above 255).  Actually, we retain the element for the range
 131     # that crosses the 255/256 boundary if it is one that matches the
 132     # property.  For example, in the Word property, there is a range of code
 133     # points that start at U+00F8 and goes through U+02C1.  Instead of
 134     # artifically cutting that off at 256 because 256 is the first code point
 135     # above Latin1, we let the range go to its natural ending.  That gives us
 136     # extra information with no added space taken.  But if the range that
 137     # crosses the boundary is one that doesn't match the property, we don't
 138     # start a new range above 255, as that could be construed as going to
 139     # infinity.  For example, the Upper property doesn't include the character
 140     # at 255, but does include the one at 256.  We don't include the 256 one.
 141     my $lookup_prop = $prop;
 142     $lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//;
 143     my @invlist = prop_invlist($lookup_prop);
 144
 145     if ($lookup_prop ne $prop) {
 146         for my $i (0 .. @invlist - 1 - 1) {
 147             if ($invlist[$i] > 255) {
 148
 149                 # In an inversion list, even-numbered elements give the code
 150                 # points that begin ranges that match the property;
 151                 # odd-numbered give ones that begin ranges that don't match.
 152                 # If $i is odd, we are at the first code point above 255 that
 153                 # doesn't match, which means the range it is ending does
 154                 # match, and crosses the 255/256 boundary.  We want to include
 155                 # this ending point, so increment $i, so the splice below
 156                 # includes it.  Conversely, if $i is even, it is the first
 157                 # code point above 255 that matches, which means there was no
 158                 # matching range that crossed the boundary, and we don't want
 159                 # to include this code point, so splice before it.
 160                 $i++ if $i % 2 != 0;
 161
 162                 # Remove everything past this.
 163                 splice @invlist, $i;
 164                 last;
 165             }
 166         }
 167     }
 168
 169     output_invlist($prop, \@invlist);
 170 }
 171
 172 read_only_bottom_close_and_rename($out_fh)