This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Refactor \X test so can be used for others
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be 6require 'regen/regen_lib.pl';
0c4ecf42 7require 'regen/charset_translations.pl';
9d9177be
KW
8
9# This program outputs charclass_invlists.h, which contains various inversion
10# lists in the form of C arrays that are to be used as-is for inversion lists.
11# Thus, the lists it contains are essentially pre-compiled, and need only a
12# light-weight fast wrapper to make them usable at run-time.
13
14# As such, this code knows about the internal structure of these lists, and
15# any change made to that has to be done here as well. A random number stored
16# in the headers is used to minimize the possibility of things getting
17# out-of-sync, or the wrong data structure being passed. Currently that
18# random number is:
0a07b44b 19my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be
KW
20
21my $out_fh = open_new('charclass_invlists.h', '>',
22 {style => '*', by => $0,
23 from => "Unicode::UCD"});
24
43b443dd
KW
25my $is_in_ifndef_ext_re = 0;
26
9d9177be
KW
27print $out_fh "/* See the generating file for comments */\n\n";
28
015bb97c
CB
29my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
30
43b443dd
KW
31sub end_ifndef_ext_re {
32 if ($is_in_ifndef_ext_re) {
33 print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
34 $is_in_ifndef_ext_re = 0;
35 }
36}
37
0c4ecf42 38sub output_invlist ($$;$) {
9d9177be
KW
39 my $name = shift;
40 my $invlist = shift; # Reference to inversion list array
0c4ecf42 41 my $charset = shift // ""; # name of character set for comment
9d9177be 42
76d3994c
KW
43 die "No inversion list for $name" unless defined $invlist
44 && ref $invlist eq 'ARRAY'
45 && @$invlist;
46
9d9177be
KW
47 # Output the inversion list $invlist using the name $name for it.
48 # It is output in the exact internal form for inversion lists.
49
a0316a6c
KW
50 # Is the last element of the header 0, or 1 ?
51 my $zero_or_one = 0;
a0316a6c
KW
52 if ($invlist->[0] != 0) {
53 unshift @$invlist, 0;
9d9177be
KW
54 $zero_or_one = 1;
55 }
0a07b44b 56 my $count = @$invlist;
9d9177be 57
43b443dd
KW
58 if ($is_in_ifndef_ext_re) {
59 if (exists $include_in_ext_re{$name}) {
60 end_ifndef_ext_re;
61 }
62 }
63 elsif (! exists $include_in_ext_re{$name}) {
64 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
65 $is_in_ifndef_ext_re = 1;
66 }
67
0c4ecf42
KW
68 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
69 print $out_fh " /* for $charset */" if $charset;
70 print $out_fh "\n";
9d9177be 71
a0316a6c 72 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
73 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
74 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
75 ",\t/* 0 if the list starts at 0;",
76 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
77
78 # The main body are the UVs passed in to this routine. Do the final
79 # element separately
47d53124
KW
80 for my $i (0 .. @$invlist - 1) {
81 printf $out_fh "\t0x%X", $invlist->[$i];
82 print $out_fh "," if $i < @$invlist - 1;
83 print $out_fh "\n";
9d9177be
KW
84 }
85
9d9177be
KW
86 print $out_fh "};\n";
87}
88
5a7e5385 89sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
90
91 # Returns an inversion list constructed from the sorted input array of
92 # code points
93
94 my $list_ref = shift;
95
96 # Initialize to just the first element
97 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
98
99 # For each succeeding element, if it extends the previous range, adjust
100 # up, otherwise add it.
101 for my $i (1 .. @$list_ref - 1) {
102 if ($invlist[-1] == $list_ref->[$i]) {
103 $invlist[-1]++;
104 }
105 else {
106 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
107 }
108 }
109 return @invlist;
110}
111
112# Read in the Case Folding rules, and construct arrays of code points for the
113# properties we need.
114my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
115die "Could not find inversion map for Case_Folding" unless defined $format;
116die "Incorrect format '$format' for Case_Folding inversion map"
117 unless $format eq 'al';
118my @has_multi_char_fold;
119my @is_non_final_fold;
120
121for my $i (0 .. @$folds_ref - 1) {
122 next unless ref $folds_ref->[$i]; # Skip single-char folds
123 push @has_multi_char_fold, $cp_ref->[$i];
124
b6a6e956 125 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
126 # position
127 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
128 push @is_non_final_fold, $folds_ref->[$i][$j]
129 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
130 }
131}
132
a02047bf
KW
133sub _Perl_Non_Final_Folds {
134 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 135 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
136}
137
892d8259 138sub UpperLatin1 {
5a7e5385 139 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
140}
141
9d9177be
KW
142output_invlist("Latin1", [ 0, 256 ]);
143output_invlist("AboveLatin1", [ 256 ]);
144
43b443dd
KW
145end_ifndef_ext_re;
146
3f427fd9
KW
147# We construct lists for all the POSIX and backslash sequence character
148# classes in two forms:
149# 1) ones which match only in the ASCII range
150# 2) ones which match either in the Latin1 range, or the entire Unicode range
151#
152# These get compiled in, and hence affect the memory footprint of every Perl
153# program, even those not using Unicode. To minimize the size, currently
154# the Latin1 version is generated for the beyond ASCII range except for those
155# lists that are quite small for the entire range, such as for \s, which is 22
156# UVs long plus 4 UVs (currently) for the header.
157#
158# To save even more memory, the ASCII versions could be derived from the
159# larger ones at runtime, saving some memory (minus the expense of the machine
160# instructions to do so), but these are all small anyway, so their total is
161# about 100 UVs.
162#
163# In the list of properties below that get generated, the L1 prefix is a fake
164# property that means just the Latin1 range of the full property (whose name
165# has an X prefix instead of L1).
a02047bf
KW
166#
167# An initial & means to use the subroutine from this file instead of an
168# official inversion list.
3f427fd9 169
0c4ecf42
KW
170for my $charset (get_supported_code_pages()) {
171 print $out_fh "\n" . get_conditional_compile_line_start($charset);
172
c30a0cf2 173 my @a2n = @{get_a2n($charset)};
1c8c3428
KW
174 # Ignore non-alpha in sort
175 for my $prop (sort { lc ($a =~ s/[[:^alpha:]]//gr)
176 cmp lc ($b =~ s/[[:^alpha:]]//gr)
177 } qw(
178 ASCII
179 Cased
180 VertSpace
181 XPerlSpace
182 XPosixAlnum
183 XPosixAlpha
184 XPosixBlank
185 XPosixCntrl
186 XPosixDigit
187 XPosixGraph
188 XPosixLower
189 XPosixPrint
190 XPosixPunct
191 XPosixSpace
192 XPosixUpper
193 XPosixWord
194 XPosixXDigit
195 _Perl_Any_Folds
196 &NonL1_Perl_Non_Final_Folds
197 _Perl_Folds_To_Multi_Char
198 &UpperLatin1
199 _Perl_IDStart
200 _Perl_IDCont
201 )
0f5e3c71
KW
202 ) {
203
204 # For the Latin1 properties, we change to use the eXtended version of the
205 # base property, then go through the result and get rid of everything not
206 # in Latin1 (above 255). Actually, we retain the element for the range
207 # that crosses the 255/256 boundary if it is one that matches the
208 # property. For example, in the Word property, there is a range of code
209 # points that start at U+00F8 and goes through U+02C1. Instead of
210 # artificially cutting that off at 256 because 256 is the first code point
211 # above Latin1, we let the range go to its natural ending. That gives us
212 # extra information with no added space taken. But if the range that
213 # crosses the boundary is one that doesn't match the property, we don't
214 # start a new range above 255, as that could be construed as going to
215 # infinity. For example, the Upper property doesn't include the character
216 # at 255, but does include the one at 256. We don't include the 256 one.
217 my $prop_name = $prop;
218 my $is_local_sub = $prop_name =~ s/^&//;
219 my $lookup_prop = $prop_name;
220 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
221 or $lookup_prop =~ s/^L1//);
222 my $nonl1_only = 0;
223 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
224
225 my @invlist;
226 if ($is_local_sub) {
227 @invlist = eval $lookup_prop;
228 }
229 else {
230 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
231 }
232 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
ceb1de32
KW
233
234 # Re-order the Unicode code points to native ones for this platform;
235 # only needed for code points below 256, and only if the first range
236 # doesn't span the whole of 0..256 (256 not 255 because a re-ordering
237 # could cause 256 to need to be in the same range as 255.)
238 if (! $nonl1_only || ($invlist[0] < 256
239 && ! ($invlist[0] == 0 && $invlist[1] > 256)))
240 {
fb4554ea
KW
241
242 # Look at all the ranges that start before 257.
243 my @latin1_list;
244 while (@invlist) {
245 last if $invlist[0] > 256;
246 my $upper = @invlist > 1
247 ? $invlist[1] - 1 # In range
8a6c81cf
KW
248
249 # To infinity. You may want to stop much much
250 # earlier; going this high may expose perl
251 # deficiencies with very large numbers.
252 : $Unicode::UCD::MAX_CP;
fb4554ea 253 for my $j ($invlist[0] .. $upper) {
8a6c81cf 254 if ($j < 256) {
fb4554ea 255 push @latin1_list, $a2n[$j];
8a6c81cf
KW
256 }
257 else {
fb4554ea 258 push @latin1_list, $j;
8a6c81cf 259 }
0f5e3c71 260 }
fb4554ea
KW
261
262 shift @invlist; # Shift off the range that's in the list
263 shift @invlist; # Shift off the range not in the list
0c4ecf42 264 }
fb4554ea
KW
265
266 # Here @invlist contains all the ranges in the original that start
267 # at code points above 256, and @latin1_list contains all the
268 # native code points for ranges that start with a Unicode code
269 # point below 257. We sort the latter and convert it to inversion
270 # list format. Then simply prepend it to the list of the higher
271 # code points.
272 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 273 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 274 unshift @invlist, @latin1_list;
ceb1de32 275 }
0f5e3c71
KW
276
277 if ($l1_only) {
278 for my $i (0 .. @invlist - 1 - 1) {
279 if ($invlist[$i] > 255) {
280
281 # In an inversion list, even-numbered elements give the code
282 # points that begin ranges that match the property;
283 # odd-numbered give ones that begin ranges that don't match.
284 # If $i is odd, we are at the first code point above 255 that
285 # doesn't match, which means the range it is ending does
286 # match, and crosses the 255/256 boundary. We want to include
287 # this ending point, so increment $i, so the splice below
288 # includes it. Conversely, if $i is even, it is the first
289 # code point above 255 that matches, which means there was no
290 # matching range that crossed the boundary, and we don't want
291 # to include this code point, so splice before it.
292 $i++ if $i % 2 != 0;
293
294 # Remove everything past this.
295 splice @invlist, $i;
296 last;
297 }
0c4ecf42
KW
298 }
299 }
0f5e3c71
KW
300 elsif ($nonl1_only) {
301 my $found_nonl1 = 0;
302 for my $i (0 .. @invlist - 1 - 1) {
303 next if $invlist[$i] < 256;
304
305 # Here, we have the first element in the array that indicates an
306 # element above Latin1. Get rid of all previous ones.
307 splice @invlist, 0, $i;
308
309 # If this one's index is not divisible by 2, it means that this
310 # element is inverting away from being in the list, which means
311 # all code points from 256 to this one are in this list.
312 unshift @invlist, 256 if $i % 2 != 0;
313 $found_nonl1 = 1;
3f427fd9
KW
314 last;
315 }
0f5e3c71 316 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 317 }
3f427fd9 318
0f5e3c71
KW
319 output_invlist($prop_name, \@invlist, $charset);
320 }
43b443dd 321 end_ifndef_ext_re;
0c4ecf42 322 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
323}
324
9a3da3ad
FC
325my @sources = ($0, "lib/Unicode/UCD.pm");
326{
327 # Depend on mktables’ own sources. It’s a shorter list of files than
328 # those that Unicode::UCD uses.
329 open my $mktables_list, "lib/unicore/mktables.lst"
330 or die "$0 cannot open lib/unicore/mktables.lst: $!";
331 while(<$mktables_list>) {
332 last if /===/;
333 chomp;
334 push @sources, "lib/unicore/$_" if /^[^#]/;
335 }
336}
337read_only_bottom_close_and_rename($out_fh, \@sources)