This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Preserve the case of t/lib/vmsfspec.t.
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
a02047bf 5use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be
KW
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
15896d2f 18my $VERSION_DATA_STRUCTURE_TYPE = 290655244;
9d9177be
KW
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
015bb97c
CB
26my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
27
9d9177be
KW
28sub output_invlist ($$) {
29 my $name = shift;
30 my $invlist = shift; # Reference to inversion list array
31
76d3994c
KW
32 die "No inversion list for $name" unless defined $invlist
33 && ref $invlist eq 'ARRAY'
34 && @$invlist;
35
9d9177be
KW
36 # Output the inversion list $invlist using the name $name for it.
37 # It is output in the exact internal form for inversion lists.
38
39 my $zero_or_one; # Is the last element of the header 0, or 1 ?
40
41 # If the first element is 0, it goes in the header, instead of the body
42 if ($invlist->[0] == 0) {
43 shift @$invlist;
44
45 $zero_or_one = 0;
46
47 # Add a dummy 0 at the end so that the length is constant. inversion
48 # lists are always stored with enough room so that if they change from
49 # beginning with 0, they don't have to grow.
50 push @$invlist, 0;
51 }
52 else {
53 $zero_or_one = 1;
54 }
55
015bb97c
CB
56 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
57 print $out_fh "\nstatic UV ${name}_invlist[] = {\n";
9d9177be
KW
58
59 print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
60 print $out_fh "\t0,\t/* Current iteration position */\n";
15896d2f 61 print $out_fh "\t0,\t/* Cache of previous search index result */\n";
9d9177be
KW
62 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
63 print $out_fh "\t", $zero_or_one,
64 ",\t/* 0 if this is the first element of the list proper;",
65 "\n\t\t 1 if the next element is the first */\n";
66
67 # The main body are the UVs passed in to this routine. Do the final
68 # element separately
69 for my $i (0 .. @$invlist - 1 - 1) {
70 print $out_fh "\t$invlist->[$i],\n";
71 }
72
73 # The final element does not have a trailing comma, as C can't handle it.
74 print $out_fh "\t$invlist->[-1]\n";
75
76 print $out_fh "};\n";
015bb97c
CB
77 print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
78
9d9177be
KW
79}
80
a02047bf
KW
81sub mk_invlist_from_cp_list {
82
83 # Returns an inversion list constructed from the sorted input array of
84 # code points
85
86 my $list_ref = shift;
87
88 # Initialize to just the first element
89 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
90
91 # For each succeeding element, if it extends the previous range, adjust
92 # up, otherwise add it.
93 for my $i (1 .. @$list_ref - 1) {
94 if ($invlist[-1] == $list_ref->[$i]) {
95 $invlist[-1]++;
96 }
97 else {
98 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
99 }
100 }
101 return @invlist;
102}
103
104# Read in the Case Folding rules, and construct arrays of code points for the
105# properties we need.
106my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
107die "Could not find inversion map for Case_Folding" unless defined $format;
108die "Incorrect format '$format' for Case_Folding inversion map"
109 unless $format eq 'al';
110my @has_multi_char_fold;
111my @is_non_final_fold;
112
113for my $i (0 .. @$folds_ref - 1) {
114 next unless ref $folds_ref->[$i]; # Skip single-char folds
115 push @has_multi_char_fold, $cp_ref->[$i];
116
117 # Add to the the non-finals list each code point that is in a non-final
118 # position
119 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
120 push @is_non_final_fold, $folds_ref->[$i][$j]
121 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
122 }
123}
124
125sub _Perl_Multi_Char_Folds {
126 @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
127 return mk_invlist_from_cp_list(\@has_multi_char_fold);
128}
129
130sub _Perl_Non_Final_Folds {
131 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
132 return mk_invlist_from_cp_list(\@is_non_final_fold);
133}
134
9d9177be
KW
135output_invlist("Latin1", [ 0, 256 ]);
136output_invlist("AboveLatin1", [ 256 ]);
137
3f427fd9
KW
138# We construct lists for all the POSIX and backslash sequence character
139# classes in two forms:
140# 1) ones which match only in the ASCII range
141# 2) ones which match either in the Latin1 range, or the entire Unicode range
142#
143# These get compiled in, and hence affect the memory footprint of every Perl
144# program, even those not using Unicode. To minimize the size, currently
145# the Latin1 version is generated for the beyond ASCII range except for those
146# lists that are quite small for the entire range, such as for \s, which is 22
147# UVs long plus 4 UVs (currently) for the header.
148#
149# To save even more memory, the ASCII versions could be derived from the
150# larger ones at runtime, saving some memory (minus the expense of the machine
151# instructions to do so), but these are all small anyway, so their total is
152# about 100 UVs.
153#
154# In the list of properties below that get generated, the L1 prefix is a fake
155# property that means just the Latin1 range of the full property (whose name
156# has an X prefix instead of L1).
a02047bf
KW
157#
158# An initial & means to use the subroutine from this file instead of an
159# official inversion list.
3f427fd9 160
9d9177be
KW
161for my $prop (qw(
162 ASCII
dab0c3e7 163 L1Cased
3f427fd9
KW
164 VertSpace
165 PerlSpace
166 XPerlSpace
167 PosixAlnum
168 L1PosixAlnum
169 PosixAlpha
170 L1PosixAlpha
171 PosixBlank
172 XPosixBlank
173 PosixCntrl
174 XPosixCntrl
175 PosixDigit
176 PosixGraph
177 L1PosixGraph
178 PosixLower
179 L1PosixLower
180 PosixPrint
181 L1PosixPrint
182 PosixPunct
183 L1PosixPunct
184 PosixSpace
185 XPosixSpace
186 PosixUpper
187 L1PosixUpper
188 PosixWord
189 L1PosixWord
190 PosixXDigit
191 XPosixXDigit
a02047bf
KW
192 &NonL1_Perl_Non_Final_Folds
193 &_Perl_Multi_Char_Folds
9d9177be
KW
194 )
195) {
196
3f427fd9
KW
197 # For the Latin1 properties, we change to use the eXtended version of the
198 # base property, then go through the result and get rid of everything not
b4069bca
KW
199 # in Latin1 (above 255). Actually, we retain the element for the range
200 # that crosses the 255/256 boundary if it is one that matches the
201 # property. For example, in the Word property, there is a range of code
202 # points that start at U+00F8 and goes through U+02C1. Instead of
3f427fd9
KW
203 # artifically cutting that off at 256 because 256 is the first code point
204 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
205 # extra information with no added space taken. But if the range that
206 # crosses the boundary is one that doesn't match the property, we don't
207 # start a new range above 255, as that could be construed as going to
208 # infinity. For example, the Upper property doesn't include the character
209 # at 255, but does include the one at 256. We don't include the 256 one.
a02047bf
KW
210 my $prop_name = $prop;
211 my $is_local_sub = $prop_name =~ s/^&//;
212 my $lookup_prop = $prop_name;
c4854dea
KW
213 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
214 my $nonl1_only = 0;
215 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
a02047bf
KW
216
217 my @invlist;
218 if ($is_local_sub) {
219 @invlist = eval $lookup_prop;
220 }
221 else {
222 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
223 }
ad89228c 224 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9 225
c4854dea 226 if ($l1_only) {
3f427fd9
KW
227 for my $i (0 .. @invlist - 1 - 1) {
228 if ($invlist[$i] > 255) {
b4069bca
KW
229
230 # In an inversion list, even-numbered elements give the code
231 # points that begin ranges that match the property;
232 # odd-numbered give ones that begin ranges that don't match.
233 # If $i is odd, we are at the first code point above 255 that
234 # doesn't match, which means the range it is ending does
235 # match, and crosses the 255/256 boundary. We want to include
236 # this ending point, so increment $i, so the splice below
237 # includes it. Conversely, if $i is even, it is the first
238 # code point above 255 that matches, which means there was no
239 # matching range that crossed the boundary, and we don't want
240 # to include this code point, so splice before it.
241 $i++ if $i % 2 != 0;
242
243 # Remove everything past this.
244 splice @invlist, $i;
3f427fd9
KW
245 last;
246 }
247 }
248 }
c4854dea
KW
249 elsif ($nonl1_only) {
250 my $found_nonl1 = 0;
251 for my $i (0 .. @invlist - 1 - 1) {
252 next if $invlist[$i] < 256;
253
254 # Here, we have the first element in the array that indicates an
255 # element above Latin1. Get rid of all previous ones.
256 splice @invlist, 0, $i;
257
258 # If this one's index is not divisible by 2, it means that this
259 # element is inverting away from being in the list, which means
260 # all code points from 256 to this one are in this list.
261 unshift @invlist, 256 if $i % 2 != 0;
262 $found_nonl1 = 1;
263 last;
264 }
265 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
266 }
3f427fd9 267
a02047bf 268 output_invlist($prop_name, \@invlist);
9d9177be
KW
269}
270
271read_only_bottom_close_and_rename($out_fh)