5 use Unicode::UCD qw(prop_invlist prop_invmap);
6 require 'regen/regen_lib.pl';
7 require 'regen/charset_translations.pl';
9 # This program outputs charclass_invlists.h, which contains various inversion
10 # lists in the form of C arrays that are to be used as-is for inversion lists.
11 # Thus, the lists it contains are essentially pre-compiled, and need only a
12 # light-weight fast wrapper to make them usable at run-time.
14 # As such, this code knows about the internal structure of these lists, and
15 # any change made to that has to be done here as well. A random number stored
16 # in the headers is used to minimize the possibility of things getting
17 # out-of-sync, or the wrong data structure being passed. Currently that
19 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
21 my $out_fh = open_new('charclass_invlists.h', '>',
22 {style => '*', by => $0,
23 from => "Unicode::UCD"});
25 my $is_in_ifndef_ext_re = 0;
27 print $out_fh "/* See the generating file for comments */\n\n";
29 my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
31 sub end_ifndef_ext_re {
32 if ($is_in_ifndef_ext_re) {
33 print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
34 $is_in_ifndef_ext_re = 0;
38 sub output_invlist ($$;$) {
40 my $invlist = shift; # Reference to inversion list array
41 my $charset = shift // ""; # name of character set for comment
43 die "No inversion list for $name" unless defined $invlist
44 && ref $invlist eq 'ARRAY'
47 # Output the inversion list $invlist using the name $name for it.
48 # It is output in the exact internal form for inversion lists.
50 # Is the last element of the header 0, or 1 ?
52 if ($invlist->[0] != 0) {
56 my $count = @$invlist;
58 if ($is_in_ifndef_ext_re) {
59 if (exists $include_in_ext_re{$name}) {
63 elsif (! exists $include_in_ext_re{$name}) {
64 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
65 $is_in_ifndef_ext_re = 1;
68 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
69 print $out_fh " /* for $charset */" if $charset;
72 print $out_fh "\t$count,\t/* Number of elements */\n";
73 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
74 print $out_fh "\t", $zero_or_one,
75 ",\t/* 0 if the list starts at 0;",
76 "\n\t\t 1 if it starts at the element beyond 0 */\n";
78 # The main body are the UVs passed in to this routine. Do the final
80 for my $i (0 .. @$invlist - 1) {
81 printf $out_fh "\t0x%X", $invlist->[$i];
82 print $out_fh "," if $i < @$invlist - 1;
89 sub mk_invlist_from_sorted_cp_list {
91 # Returns an inversion list constructed from the sorted input array of
96 # Initialize to just the first element
97 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
99 # For each succeeding element, if it extends the previous range, adjust
100 # up, otherwise add it.
101 for my $i (1 .. @$list_ref - 1) {
102 if ($invlist[-1] == $list_ref->[$i]) {
106 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
112 # Read in the Case Folding rules, and construct arrays of code points for the
113 # properties we need.
114 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
115 die "Could not find inversion map for Case_Folding" unless defined $format;
116 die "Incorrect format '$format' for Case_Folding inversion map"
117 unless $format eq 'al';
118 my @has_multi_char_fold;
119 my @is_non_final_fold;
121 for my $i (0 .. @$folds_ref - 1) {
122 next unless ref $folds_ref->[$i]; # Skip single-char folds
123 push @has_multi_char_fold, $cp_ref->[$i];
125 # Add to the non-finals list each code point that is in a non-final
127 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
128 push @is_non_final_fold, $folds_ref->[$i][$j]
129 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
133 sub _Perl_Non_Final_Folds {
134 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
135 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
139 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
142 output_invlist("Latin1", [ 0, 256 ]);
143 output_invlist("AboveLatin1", [ 256 ]);
147 # We construct lists for all the POSIX and backslash sequence character
148 # classes in two forms:
149 # 1) ones which match only in the ASCII range
150 # 2) ones which match either in the Latin1 range, or the entire Unicode range
152 # These get compiled in, and hence affect the memory footprint of every Perl
153 # program, even those not using Unicode. To minimize the size, currently
154 # the Latin1 version is generated for the beyond ASCII range except for those
155 # lists that are quite small for the entire range, such as for \s, which is 22
156 # UVs long plus 4 UVs (currently) for the header.
158 # To save even more memory, the ASCII versions could be derived from the
159 # larger ones at runtime, saving some memory (minus the expense of the machine
160 # instructions to do so), but these are all small anyway, so their total is
163 # In the list of properties below that get generated, the L1 prefix is a fake
164 # property that means just the Latin1 range of the full property (whose name
165 # has an X prefix instead of L1).
167 # An initial & means to use the subroutine from this file instead of an
168 # official inversion list.
170 for my $charset (get_supported_code_pages()) {
171 print $out_fh "\n" . get_conditional_compile_line_start($charset);
173 my @a2n = @{get_a2n($charset)};
174 # Ignore non-alpha in sort
175 for my $prop (sort { lc ($a =~ s/[[:^alpha:]]//gr)
176 cmp lc ($b =~ s/[[:^alpha:]]//gr)
196 &NonL1_Perl_Non_Final_Folds
197 _Perl_Folds_To_Multi_Char
204 # For the Latin1 properties, we change to use the eXtended version of the
205 # base property, then go through the result and get rid of everything not
206 # in Latin1 (above 255). Actually, we retain the element for the range
207 # that crosses the 255/256 boundary if it is one that matches the
208 # property. For example, in the Word property, there is a range of code
209 # points that start at U+00F8 and goes through U+02C1. Instead of
210 # artificially cutting that off at 256 because 256 is the first code point
211 # above Latin1, we let the range go to its natural ending. That gives us
212 # extra information with no added space taken. But if the range that
213 # crosses the boundary is one that doesn't match the property, we don't
214 # start a new range above 255, as that could be construed as going to
215 # infinity. For example, the Upper property doesn't include the character
216 # at 255, but does include the one at 256. We don't include the 256 one.
217 my $prop_name = $prop;
218 my $is_local_sub = $prop_name =~ s/^&//;
219 my $lookup_prop = $prop_name;
220 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
221 or $lookup_prop =~ s/^L1//);
223 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
227 @invlist = eval $lookup_prop;
230 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
232 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
234 # Re-order the Unicode code points to native ones for this platform;
235 # only needed for code points below 256, and only if the first range
236 # doesn't span the whole of 0..256 (256 not 255 because a re-ordering
237 # could cause 256 to need to be in the same range as 255.)
238 if (! $nonl1_only || ($invlist[0] < 256
239 && ! ($invlist[0] == 0 && $invlist[1] > 256)))
242 # Look at all the ranges that start before 257.
245 last if $invlist[0] > 256;
246 my $upper = @invlist > 1
247 ? $invlist[1] - 1 # In range
249 # To infinity. You may want to stop much much
250 # earlier; going this high may expose perl
251 # deficiencies with very large numbers.
252 : $Unicode::UCD::MAX_CP;
253 for my $j ($invlist[0] .. $upper) {
255 push @latin1_list, $a2n[$j];
258 push @latin1_list, $j;
262 shift @invlist; # Shift off the range that's in the list
263 shift @invlist; # Shift off the range not in the list
266 # Here @invlist contains all the ranges in the original that start
267 # at code points above 256, and @latin1_list contains all the
268 # native code points for ranges that start with a Unicode code
269 # point below 257. We sort the latter and convert it to inversion
270 # list format. Then simply prepend it to the list of the higher
272 @latin1_list = sort { $a <=> $b } @latin1_list;
273 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
274 unshift @invlist, @latin1_list;
278 for my $i (0 .. @invlist - 1 - 1) {
279 if ($invlist[$i] > 255) {
281 # In an inversion list, even-numbered elements give the code
282 # points that begin ranges that match the property;
283 # odd-numbered give ones that begin ranges that don't match.
284 # If $i is odd, we are at the first code point above 255 that
285 # doesn't match, which means the range it is ending does
286 # match, and crosses the 255/256 boundary. We want to include
287 # this ending point, so increment $i, so the splice below
288 # includes it. Conversely, if $i is even, it is the first
289 # code point above 255 that matches, which means there was no
290 # matching range that crossed the boundary, and we don't want
291 # to include this code point, so splice before it.
294 # Remove everything past this.
300 elsif ($nonl1_only) {
302 for my $i (0 .. @invlist - 1 - 1) {
303 next if $invlist[$i] < 256;
305 # Here, we have the first element in the array that indicates an
306 # element above Latin1. Get rid of all previous ones.
307 splice @invlist, 0, $i;
309 # If this one's index is not divisible by 2, it means that this
310 # element is inverting away from being in the list, which means
311 # all code points from 256 to this one are in this list.
312 unshift @invlist, 256 if $i % 2 != 0;
316 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
319 output_invlist($prop_name, \@invlist, $charset);
322 print $out_fh "\n" . get_conditional_compile_line_end();
325 my @sources = ($0, "lib/Unicode/UCD.pm");
327 # Depend on mktables’ own sources. It’s a shorter list of files than
328 # those that Unicode::UCD uses.
329 open my $mktables_list, "lib/unicore/mktables.lst"
330 or die "$0 cannot open lib/unicore/mktables.lst: $!";
331 while(<$mktables_list>) {
334 push @sources, "lib/unicore/$_" if /^[^#]/;
337 read_only_bottom_close_and_rename($out_fh, \@sources)