This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Make list for multi-fold chars
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
5use Unicode::UCD "prop_invlist";
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
15896d2f 18my $VERSION_DATA_STRUCTURE_TYPE = 290655244;
9d9177be
KW
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
26sub output_invlist ($$) {
27 my $name = shift;
28 my $invlist = shift; # Reference to inversion list array
29
76d3994c
KW
30 die "No inversion list for $name" unless defined $invlist
31 && ref $invlist eq 'ARRAY'
32 && @$invlist;
33
9d9177be
KW
34 # Output the inversion list $invlist using the name $name for it.
35 # It is output in the exact internal form for inversion lists.
36
37 my $zero_or_one; # Is the last element of the header 0, or 1 ?
38
39 # If the first element is 0, it goes in the header, instead of the body
40 if ($invlist->[0] == 0) {
41 shift @$invlist;
42
43 $zero_or_one = 0;
44
45 # Add a dummy 0 at the end so that the length is constant. inversion
46 # lists are always stored with enough room so that if they change from
47 # beginning with 0, they don't have to grow.
48 push @$invlist, 0;
49 }
50 else {
51 $zero_or_one = 1;
52 }
53
54 print $out_fh "\nUV ${name}_invlist[] = {\n";
55
56 print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
57 print $out_fh "\t0,\t/* Current iteration position */\n";
15896d2f 58 print $out_fh "\t0,\t/* Cache of previous search index result */\n";
9d9177be
KW
59 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
60 print $out_fh "\t", $zero_or_one,
61 ",\t/* 0 if this is the first element of the list proper;",
62 "\n\t\t 1 if the next element is the first */\n";
63
64 # The main body are the UVs passed in to this routine. Do the final
65 # element separately
66 for my $i (0 .. @$invlist - 1 - 1) {
67 print $out_fh "\t$invlist->[$i],\n";
68 }
69
70 # The final element does not have a trailing comma, as C can't handle it.
71 print $out_fh "\t$invlist->[-1]\n";
72
73 print $out_fh "};\n";
74}
75
76output_invlist("Latin1", [ 0, 256 ]);
77output_invlist("AboveLatin1", [ 256 ]);
78
3f427fd9
KW
79# We construct lists for all the POSIX and backslash sequence character
80# classes in two forms:
81# 1) ones which match only in the ASCII range
82# 2) ones which match either in the Latin1 range, or the entire Unicode range
83#
84# These get compiled in, and hence affect the memory footprint of every Perl
85# program, even those not using Unicode. To minimize the size, currently
86# the Latin1 version is generated for the beyond ASCII range except for those
87# lists that are quite small for the entire range, such as for \s, which is 22
88# UVs long plus 4 UVs (currently) for the header.
89#
90# To save even more memory, the ASCII versions could be derived from the
91# larger ones at runtime, saving some memory (minus the expense of the machine
92# instructions to do so), but these are all small anyway, so their total is
93# about 100 UVs.
94#
95# In the list of properties below that get generated, the L1 prefix is a fake
96# property that means just the Latin1 range of the full property (whose name
97# has an X prefix instead of L1).
98
9d9177be
KW
99for my $prop (qw(
100 ASCII
dab0c3e7 101 L1Cased
3f427fd9
KW
102 VertSpace
103 PerlSpace
104 XPerlSpace
105 PosixAlnum
106 L1PosixAlnum
107 PosixAlpha
108 L1PosixAlpha
109 PosixBlank
110 XPosixBlank
111 PosixCntrl
112 XPosixCntrl
113 PosixDigit
114 PosixGraph
115 L1PosixGraph
116 PosixLower
117 L1PosixLower
118 PosixPrint
119 L1PosixPrint
120 PosixPunct
121 L1PosixPunct
122 PosixSpace
123 XPosixSpace
124 PosixUpper
125 L1PosixUpper
126 PosixWord
127 L1PosixWord
128 PosixXDigit
129 XPosixXDigit
b72a36d4 130 NonL1_Perl_Non_Final_Folds
9ffebac4 131 _Perl_Multi_Char_Folds
9d9177be
KW
132 )
133) {
134
3f427fd9
KW
135 # For the Latin1 properties, we change to use the eXtended version of the
136 # base property, then go through the result and get rid of everything not
b4069bca
KW
137 # in Latin1 (above 255). Actually, we retain the element for the range
138 # that crosses the 255/256 boundary if it is one that matches the
139 # property. For example, in the Word property, there is a range of code
140 # points that start at U+00F8 and goes through U+02C1. Instead of
3f427fd9
KW
141 # artifically cutting that off at 256 because 256 is the first code point
142 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
143 # extra information with no added space taken. But if the range that
144 # crosses the boundary is one that doesn't match the property, we don't
145 # start a new range above 255, as that could be construed as going to
146 # infinity. For example, the Upper property doesn't include the character
147 # at 255, but does include the one at 256. We don't include the 256 one.
dab0c3e7 148 my $lookup_prop = $prop;
c4854dea
KW
149 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
150 my $nonl1_only = 0;
151 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
152 my @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
ad89228c 153 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9 154
c4854dea 155 if ($l1_only) {
3f427fd9
KW
156 for my $i (0 .. @invlist - 1 - 1) {
157 if ($invlist[$i] > 255) {
b4069bca
KW
158
159 # In an inversion list, even-numbered elements give the code
160 # points that begin ranges that match the property;
161 # odd-numbered give ones that begin ranges that don't match.
162 # If $i is odd, we are at the first code point above 255 that
163 # doesn't match, which means the range it is ending does
164 # match, and crosses the 255/256 boundary. We want to include
165 # this ending point, so increment $i, so the splice below
166 # includes it. Conversely, if $i is even, it is the first
167 # code point above 255 that matches, which means there was no
168 # matching range that crossed the boundary, and we don't want
169 # to include this code point, so splice before it.
170 $i++ if $i % 2 != 0;
171
172 # Remove everything past this.
173 splice @invlist, $i;
3f427fd9
KW
174 last;
175 }
176 }
177 }
c4854dea
KW
178 elsif ($nonl1_only) {
179 my $found_nonl1 = 0;
180 for my $i (0 .. @invlist - 1 - 1) {
181 next if $invlist[$i] < 256;
182
183 # Here, we have the first element in the array that indicates an
184 # element above Latin1. Get rid of all previous ones.
185 splice @invlist, 0, $i;
186
187 # If this one's index is not divisible by 2, it means that this
188 # element is inverting away from being in the list, which means
189 # all code points from 256 to this one are in this list.
190 unshift @invlist, 256 if $i % 2 != 0;
191 $found_nonl1 = 1;
192 last;
193 }
194 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
195 }
3f427fd9 196
9d9177be
KW
197 output_invlist($prop, \@invlist);
198}
199
200read_only_bottom_close_and_rename($out_fh)