This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
embed.fnc: Clarify comments on E,X flag usage; nit
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be
KW
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
de353015 18my $VERSION_DATA_STRUCTURE_TYPE = 290655244;
9d9177be
KW
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
015bb97c
CB
26my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
27
9d9177be
KW
28sub output_invlist ($$) {
29 my $name = shift;
30 my $invlist = shift; # Reference to inversion list array
31
76d3994c
KW
32 die "No inversion list for $name" unless defined $invlist
33 && ref $invlist eq 'ARRAY'
34 && @$invlist;
35
9d9177be
KW
36 # Output the inversion list $invlist using the name $name for it.
37 # It is output in the exact internal form for inversion lists.
38
de353015
KW
39 my $zero_or_one; # Is the last element of the header 0, or 1 ?
40
41 # If the first element is 0, it goes in the header, instead of the body
42 if ($invlist->[0] == 0) {
43 shift @$invlist;
44
45 $zero_or_one = 0;
46
47 # Add a dummy 0 at the end so that the length is constant. inversion
48 # lists are always stored with enough room so that if they change from
49 # beginning with 0, they don't have to grow.
50 push @$invlist, 0;
51 }
52 else {
9d9177be
KW
53 $zero_or_one = 1;
54 }
55
015bb97c 56 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
18505f09 57 print $out_fh "\nstatic UV ${name}_invlist[] = {\n";
9d9177be 58
de353015 59 print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
67434baf
KW
60
61 # This should be UV_MAX, but I (khw) am not confident that the suffixes
62 # for specifying the constant are portable, e.g. 'ull' on a 32 bit
63 # machine that is configured to use 64 bits; might need a Configure probe
64 print $out_fh "\t0,\t/* Current iteration position */\n";
65 print $out_fh "\t0,\t/* Cache of previous search index result */\n";
9d9177be
KW
66 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
67 print $out_fh "\t", $zero_or_one,
de353015
KW
68 ",\t/* 0 if this is the first element of the list proper;",
69 "\n\t\t 1 if the next element is the first */\n";
9d9177be
KW
70
71 # The main body are the UVs passed in to this routine. Do the final
72 # element separately
73 for my $i (0 .. @$invlist - 1 - 1) {
74 print $out_fh "\t$invlist->[$i],\n";
75 }
76
77 # The final element does not have a trailing comma, as C can't handle it.
78 print $out_fh "\t$invlist->[-1]\n";
79
80 print $out_fh "};\n";
015bb97c
CB
81 print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
82
9d9177be
KW
83}
84
a02047bf
KW
85sub mk_invlist_from_cp_list {
86
87 # Returns an inversion list constructed from the sorted input array of
88 # code points
89
90 my $list_ref = shift;
91
92 # Initialize to just the first element
93 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
94
95 # For each succeeding element, if it extends the previous range, adjust
96 # up, otherwise add it.
97 for my $i (1 .. @$list_ref - 1) {
98 if ($invlist[-1] == $list_ref->[$i]) {
99 $invlist[-1]++;
100 }
101 else {
102 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
103 }
104 }
105 return @invlist;
106}
107
108# Read in the Case Folding rules, and construct arrays of code points for the
109# properties we need.
110my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
111die "Could not find inversion map for Case_Folding" unless defined $format;
112die "Incorrect format '$format' for Case_Folding inversion map"
113 unless $format eq 'al';
114my @has_multi_char_fold;
115my @is_non_final_fold;
116
117for my $i (0 .. @$folds_ref - 1) {
118 next unless ref $folds_ref->[$i]; # Skip single-char folds
119 push @has_multi_char_fold, $cp_ref->[$i];
120
b6a6e956 121 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
122 # position
123 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
124 push @is_non_final_fold, $folds_ref->[$i][$j]
125 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
126 }
127}
128
129sub _Perl_Multi_Char_Folds {
130 @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
131 return mk_invlist_from_cp_list(\@has_multi_char_fold);
132}
133
134sub _Perl_Non_Final_Folds {
135 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
136 return mk_invlist_from_cp_list(\@is_non_final_fold);
137}
138
9d9177be
KW
139output_invlist("Latin1", [ 0, 256 ]);
140output_invlist("AboveLatin1", [ 256 ]);
141
3f427fd9
KW
142# We construct lists for all the POSIX and backslash sequence character
143# classes in two forms:
144# 1) ones which match only in the ASCII range
145# 2) ones which match either in the Latin1 range, or the entire Unicode range
146#
147# These get compiled in, and hence affect the memory footprint of every Perl
148# program, even those not using Unicode. To minimize the size, currently
149# the Latin1 version is generated for the beyond ASCII range except for those
150# lists that are quite small for the entire range, such as for \s, which is 22
151# UVs long plus 4 UVs (currently) for the header.
152#
153# To save even more memory, the ASCII versions could be derived from the
154# larger ones at runtime, saving some memory (minus the expense of the machine
155# instructions to do so), but these are all small anyway, so their total is
156# about 100 UVs.
157#
158# In the list of properties below that get generated, the L1 prefix is a fake
159# property that means just the Latin1 range of the full property (whose name
160# has an X prefix instead of L1).
a02047bf
KW
161#
162# An initial & means to use the subroutine from this file instead of an
163# official inversion list.
3f427fd9 164
9d9177be
KW
165for my $prop (qw(
166 ASCII
dab0c3e7 167 L1Cased
3f427fd9
KW
168 VertSpace
169 PerlSpace
170 XPerlSpace
171 PosixAlnum
172 L1PosixAlnum
173 PosixAlpha
174 L1PosixAlpha
175 PosixBlank
176 XPosixBlank
177 PosixCntrl
178 XPosixCntrl
179 PosixDigit
180 PosixGraph
181 L1PosixGraph
182 PosixLower
183 L1PosixLower
184 PosixPrint
185 L1PosixPrint
186 PosixPunct
187 L1PosixPunct
188 PosixSpace
189 XPosixSpace
190 PosixUpper
191 L1PosixUpper
192 PosixWord
193 L1PosixWord
194 PosixXDigit
195 XPosixXDigit
a02047bf
KW
196 &NonL1_Perl_Non_Final_Folds
197 &_Perl_Multi_Char_Folds
9d9177be
KW
198 )
199) {
200
3f427fd9
KW
201 # For the Latin1 properties, we change to use the eXtended version of the
202 # base property, then go through the result and get rid of everything not
b4069bca
KW
203 # in Latin1 (above 255). Actually, we retain the element for the range
204 # that crosses the 255/256 boundary if it is one that matches the
205 # property. For example, in the Word property, there is a range of code
206 # points that start at U+00F8 and goes through U+02C1. Instead of
b6a6e956 207 # artificially cutting that off at 256 because 256 is the first code point
3f427fd9 208 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
209 # extra information with no added space taken. But if the range that
210 # crosses the boundary is one that doesn't match the property, we don't
211 # start a new range above 255, as that could be construed as going to
212 # infinity. For example, the Upper property doesn't include the character
213 # at 255, but does include the one at 256. We don't include the 256 one.
a02047bf
KW
214 my $prop_name = $prop;
215 my $is_local_sub = $prop_name =~ s/^&//;
216 my $lookup_prop = $prop_name;
c4854dea
KW
217 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
218 my $nonl1_only = 0;
219 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
a02047bf
KW
220
221 my @invlist;
222 if ($is_local_sub) {
223 @invlist = eval $lookup_prop;
224 }
225 else {
226 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
227 }
ad89228c 228 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9 229
c4854dea 230 if ($l1_only) {
3f427fd9
KW
231 for my $i (0 .. @invlist - 1 - 1) {
232 if ($invlist[$i] > 255) {
b4069bca
KW
233
234 # In an inversion list, even-numbered elements give the code
235 # points that begin ranges that match the property;
236 # odd-numbered give ones that begin ranges that don't match.
237 # If $i is odd, we are at the first code point above 255 that
238 # doesn't match, which means the range it is ending does
239 # match, and crosses the 255/256 boundary. We want to include
240 # this ending point, so increment $i, so the splice below
241 # includes it. Conversely, if $i is even, it is the first
242 # code point above 255 that matches, which means there was no
243 # matching range that crossed the boundary, and we don't want
244 # to include this code point, so splice before it.
245 $i++ if $i % 2 != 0;
246
247 # Remove everything past this.
248 splice @invlist, $i;
3f427fd9
KW
249 last;
250 }
251 }
252 }
c4854dea
KW
253 elsif ($nonl1_only) {
254 my $found_nonl1 = 0;
255 for my $i (0 .. @invlist - 1 - 1) {
256 next if $invlist[$i] < 256;
257
258 # Here, we have the first element in the array that indicates an
259 # element above Latin1. Get rid of all previous ones.
260 splice @invlist, 0, $i;
261
262 # If this one's index is not divisible by 2, it means that this
263 # element is inverting away from being in the list, which means
264 # all code points from 256 to this one are in this list.
265 unshift @invlist, 256 if $i % 2 != 0;
266 $found_nonl1 = 1;
267 last;
268 }
269 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
270 }
3f427fd9 271
a02047bf 272 output_invlist($prop_name, \@invlist);
9d9177be
KW
273}
274
275read_only_bottom_close_and_rename($out_fh)