This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
comments only, explain the different ONE_AT_A_TIME algos
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be
KW
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
0a07b44b 18my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be
KW
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
015bb97c
CB
26my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
27
9d9177be
KW
28sub output_invlist ($$) {
29 my $name = shift;
30 my $invlist = shift; # Reference to inversion list array
31
76d3994c
KW
32 die "No inversion list for $name" unless defined $invlist
33 && ref $invlist eq 'ARRAY'
34 && @$invlist;
35
9d9177be
KW
36 # Output the inversion list $invlist using the name $name for it.
37 # It is output in the exact internal form for inversion lists.
38
a0316a6c
KW
39 # Is the last element of the header 0, or 1 ?
40 my $zero_or_one = 0;
a0316a6c
KW
41 if ($invlist->[0] != 0) {
42 unshift @$invlist, 0;
9d9177be
KW
43 $zero_or_one = 1;
44 }
0a07b44b 45 my $count = @$invlist;
9d9177be 46
015bb97c 47 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
af495420 48 print $out_fh "\nstatic const UV ${name}_invlist[] = {\n";
9d9177be 49
a0316a6c 50 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
51 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
52 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
53 ",\t/* 0 if the list starts at 0;",
54 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
55
56 # The main body are the UVs passed in to this routine. Do the final
57 # element separately
58 for my $i (0 .. @$invlist - 1 - 1) {
59 print $out_fh "\t$invlist->[$i],\n";
60 }
61
62 # The final element does not have a trailing comma, as C can't handle it.
63 print $out_fh "\t$invlist->[-1]\n";
64
65 print $out_fh "};\n";
015bb97c
CB
66 print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
67
9d9177be
KW
68}
69
a02047bf
KW
70sub mk_invlist_from_cp_list {
71
72 # Returns an inversion list constructed from the sorted input array of
73 # code points
74
75 my $list_ref = shift;
76
77 # Initialize to just the first element
78 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
79
80 # For each succeeding element, if it extends the previous range, adjust
81 # up, otherwise add it.
82 for my $i (1 .. @$list_ref - 1) {
83 if ($invlist[-1] == $list_ref->[$i]) {
84 $invlist[-1]++;
85 }
86 else {
87 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
88 }
89 }
90 return @invlist;
91}
92
93# Read in the Case Folding rules, and construct arrays of code points for the
94# properties we need.
95my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
96die "Could not find inversion map for Case_Folding" unless defined $format;
97die "Incorrect format '$format' for Case_Folding inversion map"
98 unless $format eq 'al';
99my @has_multi_char_fold;
100my @is_non_final_fold;
101
102for my $i (0 .. @$folds_ref - 1) {
103 next unless ref $folds_ref->[$i]; # Skip single-char folds
104 push @has_multi_char_fold, $cp_ref->[$i];
105
b6a6e956 106 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
107 # position
108 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
109 push @is_non_final_fold, $folds_ref->[$i][$j]
110 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
111 }
112}
113
114sub _Perl_Multi_Char_Folds {
115 @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
116 return mk_invlist_from_cp_list(\@has_multi_char_fold);
117}
118
119sub _Perl_Non_Final_Folds {
120 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
121 return mk_invlist_from_cp_list(\@is_non_final_fold);
122}
123
9d9177be
KW
124output_invlist("Latin1", [ 0, 256 ]);
125output_invlist("AboveLatin1", [ 256 ]);
126
3f427fd9
KW
127# We construct lists for all the POSIX and backslash sequence character
128# classes in two forms:
129# 1) ones which match only in the ASCII range
130# 2) ones which match either in the Latin1 range, or the entire Unicode range
131#
132# These get compiled in, and hence affect the memory footprint of every Perl
133# program, even those not using Unicode. To minimize the size, currently
134# the Latin1 version is generated for the beyond ASCII range except for those
135# lists that are quite small for the entire range, such as for \s, which is 22
136# UVs long plus 4 UVs (currently) for the header.
137#
138# To save even more memory, the ASCII versions could be derived from the
139# larger ones at runtime, saving some memory (minus the expense of the machine
140# instructions to do so), but these are all small anyway, so their total is
141# about 100 UVs.
142#
143# In the list of properties below that get generated, the L1 prefix is a fake
144# property that means just the Latin1 range of the full property (whose name
145# has an X prefix instead of L1).
a02047bf
KW
146#
147# An initial & means to use the subroutine from this file instead of an
148# official inversion list.
3f427fd9 149
9d9177be
KW
150for my $prop (qw(
151 ASCII
dab0c3e7 152 L1Cased
3f427fd9
KW
153 VertSpace
154 PerlSpace
155 XPerlSpace
156 PosixAlnum
157 L1PosixAlnum
158 PosixAlpha
159 L1PosixAlpha
160 PosixBlank
161 XPosixBlank
162 PosixCntrl
163 XPosixCntrl
164 PosixDigit
165 PosixGraph
166 L1PosixGraph
167 PosixLower
168 L1PosixLower
169 PosixPrint
170 L1PosixPrint
171 PosixPunct
172 L1PosixPunct
173 PosixSpace
174 XPosixSpace
175 PosixUpper
176 L1PosixUpper
177 PosixWord
178 L1PosixWord
179 PosixXDigit
180 XPosixXDigit
a02047bf
KW
181 &NonL1_Perl_Non_Final_Folds
182 &_Perl_Multi_Char_Folds
9d9177be
KW
183 )
184) {
185
3f427fd9
KW
186 # For the Latin1 properties, we change to use the eXtended version of the
187 # base property, then go through the result and get rid of everything not
b4069bca
KW
188 # in Latin1 (above 255). Actually, we retain the element for the range
189 # that crosses the 255/256 boundary if it is one that matches the
190 # property. For example, in the Word property, there is a range of code
191 # points that start at U+00F8 and goes through U+02C1. Instead of
b6a6e956 192 # artificially cutting that off at 256 because 256 is the first code point
3f427fd9 193 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
194 # extra information with no added space taken. But if the range that
195 # crosses the boundary is one that doesn't match the property, we don't
196 # start a new range above 255, as that could be construed as going to
197 # infinity. For example, the Upper property doesn't include the character
198 # at 255, but does include the one at 256. We don't include the 256 one.
a02047bf
KW
199 my $prop_name = $prop;
200 my $is_local_sub = $prop_name =~ s/^&//;
201 my $lookup_prop = $prop_name;
c4854dea
KW
202 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
203 my $nonl1_only = 0;
204 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
a02047bf
KW
205
206 my @invlist;
207 if ($is_local_sub) {
208 @invlist = eval $lookup_prop;
209 }
210 else {
211 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
212 }
ad89228c 213 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9 214
c4854dea 215 if ($l1_only) {
3f427fd9
KW
216 for my $i (0 .. @invlist - 1 - 1) {
217 if ($invlist[$i] > 255) {
b4069bca
KW
218
219 # In an inversion list, even-numbered elements give the code
220 # points that begin ranges that match the property;
221 # odd-numbered give ones that begin ranges that don't match.
222 # If $i is odd, we are at the first code point above 255 that
223 # doesn't match, which means the range it is ending does
224 # match, and crosses the 255/256 boundary. We want to include
225 # this ending point, so increment $i, so the splice below
226 # includes it. Conversely, if $i is even, it is the first
227 # code point above 255 that matches, which means there was no
228 # matching range that crossed the boundary, and we don't want
229 # to include this code point, so splice before it.
230 $i++ if $i % 2 != 0;
231
232 # Remove everything past this.
233 splice @invlist, $i;
3f427fd9
KW
234 last;
235 }
236 }
237 }
c4854dea
KW
238 elsif ($nonl1_only) {
239 my $found_nonl1 = 0;
240 for my $i (0 .. @invlist - 1 - 1) {
241 next if $invlist[$i] < 256;
242
243 # Here, we have the first element in the array that indicates an
244 # element above Latin1. Get rid of all previous ones.
245 splice @invlist, 0, $i;
246
247 # If this one's index is not divisible by 2, it means that this
248 # element is inverting away from being in the list, which means
249 # all code points from 256 to this one are in this list.
250 unshift @invlist, 256 if $i % 2 != 0;
251 $found_nonl1 = 1;
252 last;
253 }
254 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
255 }
3f427fd9 256
a02047bf 257 output_invlist($prop_name, \@invlist);
9d9177be
KW
258}
259
260read_only_bottom_close_and_rename($out_fh)