This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update File-Temp to CPAN version 0.2302
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be
KW
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
0a07b44b 18my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be
KW
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
015bb97c
CB
26my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
27
9d9177be
KW
28sub output_invlist ($$) {
29 my $name = shift;
30 my $invlist = shift; # Reference to inversion list array
31
76d3994c
KW
32 die "No inversion list for $name" unless defined $invlist
33 && ref $invlist eq 'ARRAY'
34 && @$invlist;
35
9d9177be
KW
36 # Output the inversion list $invlist using the name $name for it.
37 # It is output in the exact internal form for inversion lists.
38
a0316a6c
KW
39 # Is the last element of the header 0, or 1 ?
40 my $zero_or_one = 0;
a0316a6c
KW
41 if ($invlist->[0] != 0) {
42 unshift @$invlist, 0;
9d9177be
KW
43 $zero_or_one = 1;
44 }
0a07b44b 45 my $count = @$invlist;
9d9177be 46
015bb97c 47 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
af495420 48 print $out_fh "\nstatic const UV ${name}_invlist[] = {\n";
9d9177be 49
a0316a6c 50 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
51 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
52 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
53 ",\t/* 0 if the list starts at 0;",
54 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
55
56 # The main body are the UVs passed in to this routine. Do the final
57 # element separately
58 for my $i (0 .. @$invlist - 1 - 1) {
59 print $out_fh "\t$invlist->[$i],\n";
60 }
61
62 # The final element does not have a trailing comma, as C can't handle it.
63 print $out_fh "\t$invlist->[-1]\n";
64
65 print $out_fh "};\n";
015bb97c
CB
66 print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
67
9d9177be
KW
68}
69
a02047bf
KW
70sub mk_invlist_from_cp_list {
71
72 # Returns an inversion list constructed from the sorted input array of
73 # code points
74
75 my $list_ref = shift;
76
77 # Initialize to just the first element
78 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
79
80 # For each succeeding element, if it extends the previous range, adjust
81 # up, otherwise add it.
82 for my $i (1 .. @$list_ref - 1) {
83 if ($invlist[-1] == $list_ref->[$i]) {
84 $invlist[-1]++;
85 }
86 else {
87 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
88 }
89 }
90 return @invlist;
91}
92
93# Read in the Case Folding rules, and construct arrays of code points for the
94# properties we need.
95my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
96die "Could not find inversion map for Case_Folding" unless defined $format;
97die "Incorrect format '$format' for Case_Folding inversion map"
98 unless $format eq 'al';
99my @has_multi_char_fold;
100my @is_non_final_fold;
101
102for my $i (0 .. @$folds_ref - 1) {
103 next unless ref $folds_ref->[$i]; # Skip single-char folds
104 push @has_multi_char_fold, $cp_ref->[$i];
105
b6a6e956 106 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
107 # position
108 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
109 push @is_non_final_fold, $folds_ref->[$i][$j]
110 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
111 }
112}
113
114sub _Perl_Multi_Char_Folds {
115 @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
116 return mk_invlist_from_cp_list(\@has_multi_char_fold);
117}
118
119sub _Perl_Non_Final_Folds {
120 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
121 return mk_invlist_from_cp_list(\@is_non_final_fold);
122}
123
892d8259
KW
124sub UpperLatin1 {
125 my @upper_latin1;
126 for my $i (0 .. 255) { # Complicated because of EBCDIC
127 push @upper_latin1, $i if chr($i) =~ /[[:^ascii:]]/;
128 }
129 return mk_invlist_from_cp_list(\@upper_latin1);
130}
131
9d9177be
KW
132output_invlist("Latin1", [ 0, 256 ]);
133output_invlist("AboveLatin1", [ 256 ]);
134
3f427fd9
KW
135# We construct lists for all the POSIX and backslash sequence character
136# classes in two forms:
137# 1) ones which match only in the ASCII range
138# 2) ones which match either in the Latin1 range, or the entire Unicode range
139#
140# These get compiled in, and hence affect the memory footprint of every Perl
141# program, even those not using Unicode. To minimize the size, currently
142# the Latin1 version is generated for the beyond ASCII range except for those
143# lists that are quite small for the entire range, such as for \s, which is 22
144# UVs long plus 4 UVs (currently) for the header.
145#
146# To save even more memory, the ASCII versions could be derived from the
147# larger ones at runtime, saving some memory (minus the expense of the machine
148# instructions to do so), but these are all small anyway, so their total is
149# about 100 UVs.
150#
151# In the list of properties below that get generated, the L1 prefix is a fake
152# property that means just the Latin1 range of the full property (whose name
153# has an X prefix instead of L1).
a02047bf
KW
154#
155# An initial & means to use the subroutine from this file instead of an
156# official inversion list.
3f427fd9 157
9d9177be
KW
158for my $prop (qw(
159 ASCII
dab0c3e7 160 L1Cased
3f427fd9
KW
161 VertSpace
162 PerlSpace
163 XPerlSpace
164 PosixAlnum
165 L1PosixAlnum
166 PosixAlpha
167 L1PosixAlpha
168 PosixBlank
169 XPosixBlank
170 PosixCntrl
171 XPosixCntrl
172 PosixDigit
173 PosixGraph
174 L1PosixGraph
175 PosixLower
176 L1PosixLower
177 PosixPrint
178 L1PosixPrint
179 PosixPunct
180 L1PosixPunct
181 PosixSpace
182 XPosixSpace
183 PosixUpper
184 L1PosixUpper
185 PosixWord
186 L1PosixWord
187 PosixXDigit
188 XPosixXDigit
a02047bf
KW
189 &NonL1_Perl_Non_Final_Folds
190 &_Perl_Multi_Char_Folds
892d8259 191 &UpperLatin1
9d9177be
KW
192 )
193) {
194
3f427fd9
KW
195 # For the Latin1 properties, we change to use the eXtended version of the
196 # base property, then go through the result and get rid of everything not
b4069bca
KW
197 # in Latin1 (above 255). Actually, we retain the element for the range
198 # that crosses the 255/256 boundary if it is one that matches the
199 # property. For example, in the Word property, there is a range of code
200 # points that start at U+00F8 and goes through U+02C1. Instead of
b6a6e956 201 # artificially cutting that off at 256 because 256 is the first code point
3f427fd9 202 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
203 # extra information with no added space taken. But if the range that
204 # crosses the boundary is one that doesn't match the property, we don't
205 # start a new range above 255, as that could be construed as going to
206 # infinity. For example, the Upper property doesn't include the character
207 # at 255, but does include the one at 256. We don't include the 256 one.
a02047bf
KW
208 my $prop_name = $prop;
209 my $is_local_sub = $prop_name =~ s/^&//;
210 my $lookup_prop = $prop_name;
c4854dea
KW
211 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
212 my $nonl1_only = 0;
213 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
a02047bf
KW
214
215 my @invlist;
216 if ($is_local_sub) {
217 @invlist = eval $lookup_prop;
218 }
219 else {
220 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
221 }
ad89228c 222 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9 223
c4854dea 224 if ($l1_only) {
3f427fd9
KW
225 for my $i (0 .. @invlist - 1 - 1) {
226 if ($invlist[$i] > 255) {
b4069bca
KW
227
228 # In an inversion list, even-numbered elements give the code
229 # points that begin ranges that match the property;
230 # odd-numbered give ones that begin ranges that don't match.
231 # If $i is odd, we are at the first code point above 255 that
232 # doesn't match, which means the range it is ending does
233 # match, and crosses the 255/256 boundary. We want to include
234 # this ending point, so increment $i, so the splice below
235 # includes it. Conversely, if $i is even, it is the first
236 # code point above 255 that matches, which means there was no
237 # matching range that crossed the boundary, and we don't want
238 # to include this code point, so splice before it.
239 $i++ if $i % 2 != 0;
240
241 # Remove everything past this.
242 splice @invlist, $i;
3f427fd9
KW
243 last;
244 }
245 }
246 }
c4854dea
KW
247 elsif ($nonl1_only) {
248 my $found_nonl1 = 0;
249 for my $i (0 .. @invlist - 1 - 1) {
250 next if $invlist[$i] < 256;
251
252 # Here, we have the first element in the array that indicates an
253 # element above Latin1. Get rid of all previous ones.
254 splice @invlist, 0, $i;
255
256 # If this one's index is not divisible by 2, it means that this
257 # element is inverting away from being in the list, which means
258 # all code points from 256 to this one are in this list.
259 unshift @invlist, 256 if $i % 2 != 0;
260 $found_nonl1 = 1;
261 last;
262 }
263 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
264 }
3f427fd9 265
a02047bf 266 output_invlist($prop_name, \@invlist);
9d9177be
KW
267}
268
269read_only_bottom_close_and_rename($out_fh)