This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: White-space only
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be 6require 'regen/regen_lib.pl';
0c4ecf42 7require 'regen/charset_translations.pl';
9d9177be
KW
8
9# This program outputs charclass_invlists.h, which contains various inversion
10# lists in the form of C arrays that are to be used as-is for inversion lists.
11# Thus, the lists it contains are essentially pre-compiled, and need only a
12# light-weight fast wrapper to make them usable at run-time.
13
14# As such, this code knows about the internal structure of these lists, and
15# any change made to that has to be done here as well. A random number stored
16# in the headers is used to minimize the possibility of things getting
17# out-of-sync, or the wrong data structure being passed. Currently that
18# random number is:
0a07b44b 19my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be
KW
20
21my $out_fh = open_new('charclass_invlists.h', '>',
22 {style => '*', by => $0,
23 from => "Unicode::UCD"});
24
25print $out_fh "/* See the generating file for comments */\n\n";
26
015bb97c
CB
27my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
28
0c4ecf42 29sub output_invlist ($$;$) {
9d9177be
KW
30 my $name = shift;
31 my $invlist = shift; # Reference to inversion list array
0c4ecf42 32 my $charset = shift // ""; # name of character set for comment
9d9177be 33
76d3994c
KW
34 die "No inversion list for $name" unless defined $invlist
35 && ref $invlist eq 'ARRAY'
36 && @$invlist;
37
9d9177be
KW
38 # Output the inversion list $invlist using the name $name for it.
39 # It is output in the exact internal form for inversion lists.
40
a0316a6c
KW
41 # Is the last element of the header 0, or 1 ?
42 my $zero_or_one = 0;
a0316a6c
KW
43 if ($invlist->[0] != 0) {
44 unshift @$invlist, 0;
9d9177be
KW
45 $zero_or_one = 1;
46 }
0a07b44b 47 my $count = @$invlist;
9d9177be 48
015bb97c 49 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
0c4ecf42
KW
50 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
51 print $out_fh " /* for $charset */" if $charset;
52 print $out_fh "\n";
9d9177be 53
a0316a6c 54 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
55 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
56 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
57 ",\t/* 0 if the list starts at 0;",
58 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
59
60 # The main body are the UVs passed in to this routine. Do the final
61 # element separately
62 for my $i (0 .. @$invlist - 1 - 1) {
63 print $out_fh "\t$invlist->[$i],\n";
64 }
65
66 # The final element does not have a trailing comma, as C can't handle it.
67 print $out_fh "\t$invlist->[-1]\n";
68
69 print $out_fh "};\n";
015bb97c 70 print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
9d9177be
KW
71}
72
a02047bf
KW
73sub mk_invlist_from_cp_list {
74
75 # Returns an inversion list constructed from the sorted input array of
76 # code points
77
78 my $list_ref = shift;
79
80 # Initialize to just the first element
81 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
82
83 # For each succeeding element, if it extends the previous range, adjust
84 # up, otherwise add it.
85 for my $i (1 .. @$list_ref - 1) {
86 if ($invlist[-1] == $list_ref->[$i]) {
87 $invlist[-1]++;
88 }
89 else {
90 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
91 }
92 }
93 return @invlist;
94}
95
96# Read in the Case Folding rules, and construct arrays of code points for the
97# properties we need.
98my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
99die "Could not find inversion map for Case_Folding" unless defined $format;
100die "Incorrect format '$format' for Case_Folding inversion map"
101 unless $format eq 'al';
102my @has_multi_char_fold;
103my @is_non_final_fold;
104
105for my $i (0 .. @$folds_ref - 1) {
106 next unless ref $folds_ref->[$i]; # Skip single-char folds
107 push @has_multi_char_fold, $cp_ref->[$i];
108
b6a6e956 109 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
110 # position
111 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
112 push @is_non_final_fold, $folds_ref->[$i][$j]
113 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
114 }
115}
116
a02047bf
KW
117sub _Perl_Non_Final_Folds {
118 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
119 return mk_invlist_from_cp_list(\@is_non_final_fold);
120}
121
892d8259 122sub UpperLatin1 {
0c4ecf42 123 return mk_invlist_from_cp_list([ 128 .. 255 ]);
892d8259
KW
124}
125
9d9177be
KW
126output_invlist("Latin1", [ 0, 256 ]);
127output_invlist("AboveLatin1", [ 256 ]);
128
3f427fd9
KW
129# We construct lists for all the POSIX and backslash sequence character
130# classes in two forms:
131# 1) ones which match only in the ASCII range
132# 2) ones which match either in the Latin1 range, or the entire Unicode range
133#
134# These get compiled in, and hence affect the memory footprint of every Perl
135# program, even those not using Unicode. To minimize the size, currently
136# the Latin1 version is generated for the beyond ASCII range except for those
137# lists that are quite small for the entire range, such as for \s, which is 22
138# UVs long plus 4 UVs (currently) for the header.
139#
140# To save even more memory, the ASCII versions could be derived from the
141# larger ones at runtime, saving some memory (minus the expense of the machine
142# instructions to do so), but these are all small anyway, so their total is
143# about 100 UVs.
144#
145# In the list of properties below that get generated, the L1 prefix is a fake
146# property that means just the Latin1 range of the full property (whose name
147# has an X prefix instead of L1).
a02047bf
KW
148#
149# An initial & means to use the subroutine from this file instead of an
150# official inversion list.
3f427fd9 151
0c4ecf42
KW
152for my $charset (get_supported_code_pages()) {
153 print $out_fh "\n" . get_conditional_compile_line_start($charset);
154
155 my @a2n = get_a2n($charset);
0f5e3c71
KW
156 for my $prop (qw(
157 ASCII
158 Cased
159 VertSpace
160 XPerlSpace
161 XPosixAlnum
162 XPosixAlpha
163 XPosixBlank
164 XPosixCntrl
165 XPosixDigit
166 XPosixGraph
167 XPosixLower
168 XPosixPrint
169 XPosixPunct
170 XPosixSpace
171 XPosixUpper
172 XPosixWord
173 XPosixXDigit
174 _Perl_Any_Folds
175 &NonL1_Perl_Non_Final_Folds
176 _Perl_Folds_To_Multi_Char
177 &UpperLatin1
178 _Perl_IDStart
179 _Perl_IDCont
180 )
181 ) {
182
183 # For the Latin1 properties, we change to use the eXtended version of the
184 # base property, then go through the result and get rid of everything not
185 # in Latin1 (above 255). Actually, we retain the element for the range
186 # that crosses the 255/256 boundary if it is one that matches the
187 # property. For example, in the Word property, there is a range of code
188 # points that start at U+00F8 and goes through U+02C1. Instead of
189 # artificially cutting that off at 256 because 256 is the first code point
190 # above Latin1, we let the range go to its natural ending. That gives us
191 # extra information with no added space taken. But if the range that
192 # crosses the boundary is one that doesn't match the property, we don't
193 # start a new range above 255, as that could be construed as going to
194 # infinity. For example, the Upper property doesn't include the character
195 # at 255, but does include the one at 256. We don't include the 256 one.
196 my $prop_name = $prop;
197 my $is_local_sub = $prop_name =~ s/^&//;
198 my $lookup_prop = $prop_name;
199 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
200 or $lookup_prop =~ s/^L1//);
201 my $nonl1_only = 0;
202 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
203
204 my @invlist;
205 if ($is_local_sub) {
206 @invlist = eval $lookup_prop;
207 }
208 else {
209 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
210 }
211 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
212 my @full_list;
213 for (my $i = 0; $i < @invlist; $i += 2) {
214 my $upper = ($i + 1) < @invlist
215 ? $invlist[$i+1] - 1 # In range
216 : $Unicode::UCD::MAX_CP; # To infinity. You may want
217 # to stop much much earlier;
218 # going this high may expose
219 # perl deficiencies with very
220 # large numbers.
221 for my $j ($invlist[$i] .. $upper) {
222 if ($j < 256) {
223 push @full_list, $a2n[$j];
224 }
225 else {
226 push @full_list, $j;
227 }
0c4ecf42 228 }
0f5e3c71
KW
229 }
230 @full_list = sort { $a <=> $b } @full_list;
231 @invlist = mk_invlist_from_cp_list(\@full_list);
232
233 if ($l1_only) {
234 for my $i (0 .. @invlist - 1 - 1) {
235 if ($invlist[$i] > 255) {
236
237 # In an inversion list, even-numbered elements give the code
238 # points that begin ranges that match the property;
239 # odd-numbered give ones that begin ranges that don't match.
240 # If $i is odd, we are at the first code point above 255 that
241 # doesn't match, which means the range it is ending does
242 # match, and crosses the 255/256 boundary. We want to include
243 # this ending point, so increment $i, so the splice below
244 # includes it. Conversely, if $i is even, it is the first
245 # code point above 255 that matches, which means there was no
246 # matching range that crossed the boundary, and we don't want
247 # to include this code point, so splice before it.
248 $i++ if $i % 2 != 0;
249
250 # Remove everything past this.
251 splice @invlist, $i;
252 last;
253 }
0c4ecf42
KW
254 }
255 }
0f5e3c71
KW
256 elsif ($nonl1_only) {
257 my $found_nonl1 = 0;
258 for my $i (0 .. @invlist - 1 - 1) {
259 next if $invlist[$i] < 256;
260
261 # Here, we have the first element in the array that indicates an
262 # element above Latin1. Get rid of all previous ones.
263 splice @invlist, 0, $i;
264
265 # If this one's index is not divisible by 2, it means that this
266 # element is inverting away from being in the list, which means
267 # all code points from 256 to this one are in this list.
268 unshift @invlist, 256 if $i % 2 != 0;
269 $found_nonl1 = 1;
3f427fd9
KW
270 last;
271 }
0f5e3c71 272 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 273 }
3f427fd9 274
0f5e3c71
KW
275 output_invlist($prop_name, \@invlist, $charset);
276 }
0c4ecf42 277 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
278}
279
280read_only_bottom_close_and_rename($out_fh)