5 use Unicode::UCD "prop_invlist";
6 require 'regen/regen_lib.pl';
8 # This program outputs charclass_invlists.h, which contains various inversion
9 # lists in the form of C arrays that are to be used as-is for inversion lists.
10 # Thus, the lists it contains are essentially pre-compiled, and need only a
11 # light-weight fast wrapper to make them usable at run-time.
13 # As such, this code knows about the internal structure of these lists, and
14 # any change made to that has to be done here as well. A random number stored
15 # in the headers is used to minimize the possibility of things getting
16 # out-of-sync, or the wrong data structure being passed. Currently that
18 my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;
20 my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
24 print $out_fh "/* See the generating file for comments */\n\n";
26 sub output_invlist ($$) {
28 my $invlist = shift; # Reference to inversion list array
30 die "No inversion list for $name" unless defined $invlist
31 && ref $invlist eq 'ARRAY'
34 # Output the inversion list $invlist using the name $name for it.
35 # It is output in the exact internal form for inversion lists.
37 my $zero_or_one; # Is the last element of the header 0, or 1 ?
39 # If the first element is 0, it goes in the header, instead of the body
40 if ($invlist->[0] == 0) {
45 # Add a dummy 0 at the end so that the length is constant. Inversion
46 # lists are always stored with enough room so that if they change from
47 # beginning with 0, they don't have to grow.
54 print $out_fh "\nUV ${name}_invlist[] = {\n";
56 print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
57 print $out_fh "\t0,\t/* Current iteration position */\n";
58 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
59 print $out_fh "\t", $zero_or_one,
60 ",\t/* 0 if this is the first element of the list proper;",
61 "\n\t\t 1 if the next element is the first */\n";
63 # The main body is the UVs passed in to this routine. Do the final
65 for my $i (0 .. @$invlist - 1 - 1) {
66 print $out_fh "\t$invlist->[$i],\n";
69 # The final element does not have a trailing comma, as C can't handle it.
70 print $out_fh "\t$invlist->[-1]\n";
75 output_invlist("Latin1", [ 0, 256 ]);
76 output_invlist("AboveLatin1", [ 256 ]);
78 # We construct lists for all the POSIX and backslash sequence character
79 # classes in two forms:
80 # 1) ones which match only in the ASCII range
81 # 2) ones which match either in the Latin1 range, or the entire Unicode range
83 # These get compiled in, and hence affect the memory footprint of every Perl
84 # program, even those not using Unicode. To minimize the size, currently
85 # the Latin1 version is generated for the beyond ASCII range except for those
86 # lists that are quite small for the entire range, such as for \s, which is 22
87 # UVs long plus 4 UVs (currently) for the header.
89 # To save even more memory, the ASCII versions could be derived from the
90 # larger ones at runtime, saving some memory (minus the expense of the machine
91 # instructions to do so), but these are all small anyway, so their total is
94 # In the list of properties below that get generated, the L1 prefix is a fake
95 # property that means just the Latin1 range of the full property (whose name
96 # has an X prefix instead of L1).
129 _Perl_Non_Final_Folds
133 # For the Latin1 properties, we change to use the eXtended version of the
134 # base property, then go through the result and get rid of everything not
135 # in Latin1 (above 255). Actually, we retain the element for the range
136 # that crosses the 255/256 boundary if it is one that matches the
137 # property. For example, in the Word property, there is a range of code
138 # points that starts at U+00F8 and goes through U+02C1. Instead of
139 # artificially cutting that off at 256 because 256 is the first code point
140 # above Latin1, we let the range go to its natural ending. That gives us
141 # extra information with no added space taken. But if the range that
142 # crosses the boundary is one that doesn't match the property, we don't
143 # start a new range above 255, as that could be construed as going to
144 # infinity. For example, the Upper property doesn't include the character
145 # at 255, but does include the one at 256. We don't include the 256 one.
146 my $lookup_prop = $prop;
147 $lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//;
148 my @invlist = prop_invlist($lookup_prop);
149 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
151 if ($lookup_prop ne $prop) {
152 for my $i (0 .. @invlist - 1 - 1) {
153 if ($invlist[$i] > 255) {
155 # In an inversion list, even-numbered elements give the code
156 # points that begin ranges that match the property;
157 # odd-numbered give ones that begin ranges that don't match.
158 # If $i is odd, we are at the first code point above 255 that
159 # doesn't match, which means the range it is ending does
160 # match, and crosses the 255/256 boundary. We want to include
161 # this ending point, so increment $i, so the splice below
162 # includes it. Conversely, if $i is even, it is the first
163 # code point above 255 that matches, which means there was no
164 # matching range that crossed the boundary, and we don't want
165 # to include this code point, so splice before it.
168 # Remove everything past this.
175 output_invlist($prop, \@invlist);
178 read_only_bottom_close_and_rename($out_fh)