This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
remove vestigial uses of PRIVSHIFT
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
82 sb => [
83 'ATerm',
84 'Close',
85 'CR',
86 'Extend',
87 'Format',
88 'LF',
89 'Lower',
90 'Numeric',
91 'OLetter',
92 'Other',
93 'SContinue',
94 'Sep',
95 'Sp',
96 'STerm',
97 'Upper',
98 ],
99 wb => [
100 'ALetter',
101 'CR',
102 'Double_Quote',
103 'Extend',
104 'ExtendNumLet',
105 'Format',
106 'Hebrew_Letter',
107 'Katakana',
108 'LF',
109 'MidLetter',
110 'MidNum',
111 'MidNumLet',
112 'Newline',
113 'Numeric',
114 'Other',
f1f6961f 115 'Perl_Tailored_HSpace',
f79a09fc
KW
116 'Regional_Indicator',
117 'Single_Quote',
118 'UNKNOWN',
119 ],
120);
121
99f21fb9
KW
122my @a2n;
123
124sub uniques {
125 # Returns non-duplicated input values. From "Perl Best Practices:
126 # Encapsulated Cleverness". p. 455 in first edition.
127
128 my %seen;
129 return grep { ! $seen{$_}++ } @_;
130}
131
132sub a2n($) {
133 my $cp = shift;
134
135 # Returns the input Unicode code point translated to native.
136
137 return $cp if $cp !~ $numeric_re || $cp > 255;
138 return $a2n[$cp];
139}
140
bffc0129
KW
141sub end_file_pound_if {
142 if ($in_file_pound_if) {
143 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
144 $in_file_pound_if = 0;
145 }
146}
147
148sub switch_pound_if ($$) {
149 my $name = shift;
150 my $new_pound_if = shift;
151
152 # Switch to new #if given by the 2nd argument. If there is an override
153 # for this, it instead switches to that. The 1st argument is the
154 # static's name, used to look up the overrides
155
156 if (exists $exceptions_to_where_to_define{$name}) {
157 $new_pound_if = $exceptions_to_where_to_define{$name};
158 }
159
160 # Exit current #if if the new one is different from the old
161 if ($in_file_pound_if
162 && $in_file_pound_if !~ /$new_pound_if/)
163 {
164 end_file_pound_if;
165 }
166
167 # Enter new #if, if not already in it.
168 if (! $in_file_pound_if) {
169 $in_file_pound_if = "defined($new_pound_if)";
170 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
171 }
172}
173
0c4ecf42 174sub output_invlist ($$;$) {
9d9177be
KW
175 my $name = shift;
176 my $invlist = shift; # Reference to inversion list array
0c4ecf42 177 my $charset = shift // ""; # name of character set for comment
9d9177be 178
76d3994c 179 die "No inversion list for $name" unless defined $invlist
ad85f59a 180 && ref $invlist eq 'ARRAY';
76d3994c 181
9d9177be
KW
182 # Output the inversion list $invlist using the name $name for it.
183 # It is output in the exact internal form for inversion lists.
184
a0316a6c
KW
185 # Is the last element of the header 0, or 1 ?
186 my $zero_or_one = 0;
ad85f59a 187 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 188 unshift @$invlist, 0;
9d9177be
KW
189 $zero_or_one = 1;
190 }
0a07b44b 191 my $count = @$invlist;
9d9177be 192
bffc0129 193 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 194
0c4ecf42
KW
195 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
196 print $out_fh " /* for $charset */" if $charset;
197 print $out_fh "\n";
9d9177be 198
a0316a6c 199 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
200 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
201 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
202 ",\t/* 0 if the list starts at 0;",
203 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
204
205 # The main body are the UVs passed in to this routine. Do the final
206 # element separately
47d53124
KW
207 for my $i (0 .. @$invlist - 1) {
208 printf $out_fh "\t0x%X", $invlist->[$i];
209 print $out_fh "," if $i < @$invlist - 1;
210 print $out_fh "\n";
9d9177be
KW
211 }
212
9d9177be
KW
213 print $out_fh "};\n";
214}
215
99f21fb9
KW
216sub output_invmap ($$$$$$$) {
217 my $name = shift;
218 my $invmap = shift; # Reference to inversion map array
219 my $prop_name = shift;
220 my $input_format = shift; # The inversion map's format
221 my $default = shift; # The property value for code points who
222 # otherwise don't have a value specified.
223 my $extra_enums = shift; # comma-separated list of our additions to the
224 # property's standard possible values
225 my $charset = shift // ""; # name of character set for comment
226
227 # Output the inversion map $invmap for property $prop_name, but use $name
228 # as the actual data structure's name.
229
230 my $count = @$invmap;
231
232 my $output_format;
233 my $declaration_type;
234 my %enums;
235 my $name_prefix;
236
237 if ($input_format eq 's') {
02f811dd
KW
238 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
239 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
99f21fb9
KW
240 my @enums = prop_values($prop_name);
241 if (! @enums) {
242 die "Only enum properties are currently handled; '$prop_name' isn't one";
243 }
244 else {
245
f79a09fc 246 # Convert short names to long
99f21fb9 247 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
248
249 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
250 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
251 if @expected_enums < @enums;
252
253 # Remove the enums found in the input from the ones we expect
254 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
255 splice(@expected_enums, $i, 1)
256 if grep { $expected_enums[$i] eq $_ } @enums;
257 }
258
259 # The ones remaining must be because we're using an older
260 # Unicode version. Add them to the list.
261 push @enums, @expected_enums;
262
263 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
264 push @enums, split /,/, $extra_enums if $extra_enums ne "";
265 @enums = sort @enums;
266
267 # Assign a value to each element of the enum. The default
268 # value always gets 0; the others are arbitrarily assigned.
269 my $enum_val = 0;
02f811dd
KW
270 my $canonical_default = prop_value_aliases($prop_name, $default);
271 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
272 $enums{$default} = $enum_val++;
273 for my $enum (@enums) {
274 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
275 }
276 }
277
bffc0129
KW
278 # Inversion map stuff is currently used only by regexec
279 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
280 {
281
99f21fb9
KW
282 # The short names tend to be two lower case letters, but it looks
283 # better for those if they are upper. XXX
284 $short_name = uc($short_name) if length($short_name) < 3
285 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 286 $name_prefix = "${short_name}_";
99f21fb9
KW
287 my $enum_count = keys %enums;
288 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
289
290 print $out_fh "\ntypedef enum {\n";
291 print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
292 delete $enums{$default};
293 foreach my $enum (sort { $a cmp $b } keys %enums) {
294 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
295 print $out_fh "," if $enums{$enum} < $enum_count - 1;
296 print $out_fh "\n";
297 }
298 $declaration_type = "${name_prefix}enum";
299 print $out_fh "} $declaration_type;\n";
300
301 $output_format = "${name_prefix}%s";
302 }
303 }
304 else {
305 die "'$input_format' invmap() format for '$prop_name' unimplemented";
306 }
307
308 die "No inversion map for $prop_name" unless defined $invmap
309 && ref $invmap eq 'ARRAY'
310 && $count;
311
312 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
313 print $out_fh " /* for $charset */" if $charset;
314 print $out_fh "\n";
315
316 # The main body are the scalars passed in to this routine.
317 for my $i (0 .. $count - 1) {
318 my $element = $invmap->[$i];
02f811dd
KW
319 my $full_element_name = prop_value_aliases($prop_name, $element);
320 $element = $full_element_name if defined $full_element_name;
321 $element = $name_prefix . $element;
99f21fb9
KW
322 print $out_fh "\t$element";
323 print $out_fh "," if $i < $count - 1;
324 print $out_fh "\n";
325 }
326 print $out_fh "};\n";
99f21fb9
KW
327}
328
5a7e5385 329sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
330
331 # Returns an inversion list constructed from the sorted input array of
332 # code points
333
334 my $list_ref = shift;
335
99f21fb9
KW
336 return unless @$list_ref;
337
a02047bf
KW
338 # Initialize to just the first element
339 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
340
341 # For each succeeding element, if it extends the previous range, adjust
342 # up, otherwise add it.
343 for my $i (1 .. @$list_ref - 1) {
344 if ($invlist[-1] == $list_ref->[$i]) {
345 $invlist[-1]++;
346 }
347 else {
348 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
349 }
350 }
351 return @invlist;
352}
353
354# Read in the Case Folding rules, and construct arrays of code points for the
355# properties we need.
356my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
357die "Could not find inversion map for Case_Folding" unless defined $format;
358die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
359 unless $format eq 'al'
360 || $format eq 'a';
a02047bf
KW
361my @has_multi_char_fold;
362my @is_non_final_fold;
363
364for my $i (0 .. @$folds_ref - 1) {
365 next unless ref $folds_ref->[$i]; # Skip single-char folds
366 push @has_multi_char_fold, $cp_ref->[$i];
367
b6a6e956 368 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
369 # position
370 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
371 push @is_non_final_fold, $folds_ref->[$i][$j]
372 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
373 }
374}
375
a02047bf
KW
376sub _Perl_Non_Final_Folds {
377 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 378 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
379}
380
99f21fb9
KW
381sub prop_name_for_cmp ($) { # Sort helper
382 my $name = shift;
383
384 # Returns the input lowercased, with non-alphas removed, as well as
385 # everything starting with a comma
386
387 $name =~ s/,.*//;
388 $name =~ s/[[:^alpha:]]//g;
389 return lc $name;
390}
391
892d8259 392sub UpperLatin1 {
5a7e5385 393 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
394}
395
9d9177be
KW
396output_invlist("Latin1", [ 0, 256 ]);
397output_invlist("AboveLatin1", [ 256 ]);
398
bffc0129 399end_file_pound_if;
43b443dd 400
3f427fd9
KW
401# We construct lists for all the POSIX and backslash sequence character
402# classes in two forms:
403# 1) ones which match only in the ASCII range
404# 2) ones which match either in the Latin1 range, or the entire Unicode range
405#
406# These get compiled in, and hence affect the memory footprint of every Perl
407# program, even those not using Unicode. To minimize the size, currently
408# the Latin1 version is generated for the beyond ASCII range except for those
409# lists that are quite small for the entire range, such as for \s, which is 22
410# UVs long plus 4 UVs (currently) for the header.
411#
412# To save even more memory, the ASCII versions could be derived from the
413# larger ones at runtime, saving some memory (minus the expense of the machine
414# instructions to do so), but these are all small anyway, so their total is
415# about 100 UVs.
416#
417# In the list of properties below that get generated, the L1 prefix is a fake
418# property that means just the Latin1 range of the full property (whose name
419# has an X prefix instead of L1).
a02047bf
KW
420#
421# An initial & means to use the subroutine from this file instead of an
422# official inversion list.
3f427fd9 423
0c4ecf42
KW
424for my $charset (get_supported_code_pages()) {
425 print $out_fh "\n" . get_conditional_compile_line_start($charset);
426
99f21fb9
KW
427 @a2n = @{get_a2n($charset)};
428 no warnings 'qw';
429 # Ignore non-alpha in sort
430 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
431 ASCII
432 Cased
433 VertSpace
434 XPerlSpace
435 XPosixAlnum
436 XPosixAlpha
437 XPosixBlank
438 XPosixCntrl
439 XPosixDigit
440 XPosixGraph
441 XPosixLower
442 XPosixPrint
443 XPosixPunct
444 XPosixSpace
445 XPosixUpper
446 XPosixWord
447 XPosixXDigit
448 _Perl_Any_Folds
449 &NonL1_Perl_Non_Final_Folds
450 _Perl_Folds_To_Multi_Char
451 &UpperLatin1
452 _Perl_IDStart
453 _Perl_IDCont
02f811dd 454 _Perl_GCB,EDGE
bf4268fa
KW
455 _Perl_WB,EDGE,UNKNOWN
456 _Perl_SB,EDGE
1c8c3428 457 )
0f5e3c71
KW
458 ) {
459
460 # For the Latin1 properties, we change to use the eXtended version of the
461 # base property, then go through the result and get rid of everything not
462 # in Latin1 (above 255). Actually, we retain the element for the range
463 # that crosses the 255/256 boundary if it is one that matches the
464 # property. For example, in the Word property, there is a range of code
465 # points that start at U+00F8 and goes through U+02C1. Instead of
466 # artificially cutting that off at 256 because 256 is the first code point
467 # above Latin1, we let the range go to its natural ending. That gives us
468 # extra information with no added space taken. But if the range that
469 # crosses the boundary is one that doesn't match the property, we don't
470 # start a new range above 255, as that could be construed as going to
471 # infinity. For example, the Upper property doesn't include the character
472 # at 255, but does include the one at 256. We don't include the 256 one.
473 my $prop_name = $prop;
474 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
475 my $extra_enums = "";
476 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
477 my $lookup_prop = $prop_name;
478 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
479 or $lookup_prop =~ s/^L1//);
480 my $nonl1_only = 0;
481 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 482 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
483
484 my @invlist;
99f21fb9
KW
485 my @invmap;
486 my $map_format;
487 my $map_default;
488 my $maps_to_code_point;
489 my $to_adjust;
0f5e3c71
KW
490 if ($is_local_sub) {
491 @invlist = eval $lookup_prop;
492 }
493 else {
494 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 495 if (! @invlist) {
99f21fb9 496
ad85f59a
KW
497 # If couldn't find a non-empty inversion list, see if it is
498 # instead an inversion map
499 my ($list_ref, $map_ref, $format, $default)
99f21fb9 500 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
501 if (! $list_ref) {
502 # An empty return here could mean an unknown property, or
503 # merely that the original inversion list is empty. Call
504 # in scalar context to differentiate
505 my $count = prop_invlist($lookup_prop,
506 '_perl_core_internal_ok');
507 die "Could not find inversion list for '$lookup_prop'"
508 unless defined $count;
509 }
510 else {
18b852b3
KW
511 @invlist = @$list_ref;
512 @invmap = @$map_ref;
513 $map_format = $format;
514 $map_default = $default;
515 $maps_to_code_point = $map_format =~ /x/;
516 $to_adjust = $map_format =~ /a/;
ad85f59a 517 }
99f21fb9 518 }
0f5e3c71 519 }
ad85f59a
KW
520
521
522 # Short-circuit an empty inversion list.
523 if (! @invlist) {
524 output_invlist($prop_name, \@invlist, $charset);
525 next;
526 }
ceb1de32 527
99f21fb9
KW
528 # Re-order the Unicode code points to native ones for this platform.
529 # This is only needed for code points below 256, because native code
530 # points are only in that range. For inversion maps of properties
531 # where the mappings are adjusted (format =~ /a/), this reordering
532 # could mess up the adjustment pattern that was in the input, so that
533 # has to be dealt with.
534 #
535 # And inversion maps that map to code points need to eventually have
536 # all those code points remapped to native, and it's better to do that
537 # here, going through the whole list not just those below 256. This
538 # is because some inversion maps have adjustments (format =~ /a/)
539 # which may be affected by the reordering. This code needs to be done
540 # both for when we are translating the inversion lists for < 256, and
541 # for the inversion maps for everything. By doing both in this loop,
542 # we can share that code.
543 #
544 # So, we go through everything for an inversion map to code points;
545 # otherwise, we can skip any remapping at all if we are going to
546 # output only the above-Latin1 values, or if the range spans the whole
547 # of 0..256, as the remap will also include all of 0..256 (256 not
548 # 255 because a re-ordering could cause 256 to need to be in the same
549 # range as 255.)
550 if ((@invmap && $maps_to_code_point)
551 || (! $nonl1_only || ($invlist[0] < 256
552 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 553 {
fb4554ea 554
99f21fb9 555 if (! @invmap) { # Straight inversion list
fb4554ea
KW
556 # Look at all the ranges that start before 257.
557 my @latin1_list;
558 while (@invlist) {
559 last if $invlist[0] > 256;
560 my $upper = @invlist > 1
561 ? $invlist[1] - 1 # In range
8a6c81cf
KW
562
563 # To infinity. You may want to stop much much
564 # earlier; going this high may expose perl
565 # deficiencies with very large numbers.
566 : $Unicode::UCD::MAX_CP;
fb4554ea 567 for my $j ($invlist[0] .. $upper) {
99f21fb9 568 push @latin1_list, a2n($j);
0f5e3c71 569 }
fb4554ea
KW
570
571 shift @invlist; # Shift off the range that's in the list
572 shift @invlist; # Shift off the range not in the list
0c4ecf42 573 }
fb4554ea
KW
574
575 # Here @invlist contains all the ranges in the original that start
576 # at code points above 256, and @latin1_list contains all the
577 # native code points for ranges that start with a Unicode code
578 # point below 257. We sort the latter and convert it to inversion
579 # list format. Then simply prepend it to the list of the higher
580 # code points.
581 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 582 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 583 unshift @invlist, @latin1_list;
99f21fb9
KW
584 }
585 else { # Is an inversion map
586
587 # This is a similar procedure as plain inversion list, but has
588 # multiple buckets. A plain inversion list just has two
589 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
590 # pretty much can ignore the 2nd bucket, as it is completely
591 # defined by the 1st. But here, what we do is create buckets
592 # which contain the code points that map to each, translated
593 # to native and turned into an inversion list. Thus each
594 # bucket is an inversion list of native code points that map
595 # to it or don't map to it. We use these to create an
596 # inversion map for the whole property.
597
598 # As mentioned earlier, we use this procedure to not just
599 # remap the inversion list to native values, but also the maps
600 # of code points to native ones. In the latter case we have
601 # to look at the whole of the inversion map (or at least to
602 # above Unicode; as the maps of code points above that should
603 # all be to the default).
604 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
605
606 my %mapped_lists; # A hash whose keys are the buckets.
607 while (@invlist) {
608 last if $invlist[0] > $upper_limit;
609
610 # This shouldn't actually happen, as prop_invmap() returns
611 # an extra element at the end that is beyond $upper_limit
612 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
613
614 my $bucket;
615
616 # A hash key can't be a ref (we are only expecting arrays
617 # of scalars here), so convert any such to a string that
618 # will be converted back later (using a vertical tab as
619 # the separator). Even if the mapping is to code points,
620 # we don't translate to native here because the code
621 # output_map() calls to output these arrays assumes the
622 # input is Unicode, not native.
623 if (ref $invmap[0]) {
624 $bucket = join "\cK", @{$invmap[0]};
625 }
626 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
627
628 # Do convert to native for maps to single code points.
629 # There are some properties that have a few outlier
630 # maps that aren't code points, so the above test
631 # skips those.
632 $bucket = a2n($invmap[0]);
633 } else {
634 $bucket = $invmap[0];
635 }
636
637 # We now have the bucket that all code points in the range
638 # map to, though possibly they need to be adjusted. Go
639 # through the range and put each translated code point in
640 # it into its bucket.
641 my $base_map = $invmap[0];
642 for my $j ($invlist[0] .. $invlist[1] - 1) {
643 if ($to_adjust
644 # The 1st code point doesn't need adjusting
645 && $j > $invlist[0]
646
647 # Skip any non-numeric maps: these are outliers
648 # that aren't code points.
649 && $base_map =~ $numeric_re
650
651 # 'ne' because the default can be a string
652 && $base_map ne $map_default)
653 {
654 # We adjust, by incrementing each the bucket and
655 # the map. For code point maps, translate to
656 # native
657 $base_map++;
658 $bucket = ($maps_to_code_point)
659 ? a2n($base_map)
660 : $base_map;
661 }
662
663 # Add the native code point to the bucket for the
664 # current map
665 push @{$mapped_lists{$bucket}}, a2n($j);
666 } # End of loop through all code points in the range
667
668 # Get ready for the next range
669 shift @invlist;
670 shift @invmap;
671 } # End of loop through all ranges in the map.
672
673 # Here, @invlist and @invmap retain all the ranges from the
674 # originals that start with code points above $upper_limit.
675 # Each bucket in %mapped_lists contains all the code points
676 # that map to that bucket. If the bucket is for a map to a
677 # single code point is a single code point, the bucket has
678 # been converted to native. If something else (including
679 # multiple code points), no conversion is done.
680 #
681 # Now we recreate the inversion map into %xlated, but this
682 # time for the native character set.
683 my %xlated;
684 foreach my $bucket (keys %mapped_lists) {
685
686 # Sort and convert this bucket to an inversion list. The
687 # result will be that ranges that start with even-numbered
688 # indexes will be for code points that map to this bucket;
689 # odd ones map to some other bucket, and are discarded
690 # below.
691 @{$mapped_lists{$bucket}}
692 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
693 @{$mapped_lists{$bucket}}
694 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
695
696 # Add each even-numbered range in the bucket to %xlated;
697 # so that the keys of %xlated become the range start code
698 # points, and the values are their corresponding maps.
699 while (@{$mapped_lists{$bucket}}) {
700 my $range_start = $mapped_lists{$bucket}->[0];
701 if ($bucket =~ /\cK/) {
702 @{$xlated{$range_start}} = split /\cK/, $bucket;
703 }
704 else {
705 $xlated{$range_start} = $bucket;
706 }
707 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
708 shift @{$mapped_lists{$bucket}}; # Get ready for next
709 # iteration
710 }
711 } # End of loop through all the buckets.
712
713 # Here %xlated's keys are the range starts of all the code
714 # points in the inversion map. Construct an inversion list
715 # from them.
716 my @new_invlist = sort { $a <=> $b } keys %xlated;
717
718 # If the list is adjusted, we want to munge this list so that
719 # we only have one entry for where consecutive code points map
720 # to consecutive values. We just skip the subsequent entries
721 # where this is the case.
722 if ($to_adjust) {
723 my @temp;
724 for my $i (0 .. @new_invlist - 1) {
725 next if $i > 0
726 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
727 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
728 && $xlated{$new_invlist[$i]} =~ $numeric_re
729 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
730 push @temp, $new_invlist[$i];
731 }
732 @new_invlist = @temp;
733 }
734
735 # The inversion map comes from %xlated's values. We can
736 # unshift each onto the front of the untouched portion, in
737 # reverse order of the portion we did process.
738 foreach my $start (reverse @new_invlist) {
739 unshift @invmap, $xlated{$start};
740 }
741
742 # Finally prepend the inversion list we have just constructed to the
743 # one that contains anything we didn't process.
744 unshift @invlist, @new_invlist;
745 }
746 }
747
748 # prop_invmap() returns an extra final entry, which we can now
749 # discard.
750 if (@invmap) {
751 pop @invlist;
752 pop @invmap;
ceb1de32 753 }
0f5e3c71
KW
754
755 if ($l1_only) {
99f21fb9 756 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
757 for my $i (0 .. @invlist - 1 - 1) {
758 if ($invlist[$i] > 255) {
759
760 # In an inversion list, even-numbered elements give the code
761 # points that begin ranges that match the property;
762 # odd-numbered give ones that begin ranges that don't match.
763 # If $i is odd, we are at the first code point above 255 that
764 # doesn't match, which means the range it is ending does
765 # match, and crosses the 255/256 boundary. We want to include
766 # this ending point, so increment $i, so the splice below
767 # includes it. Conversely, if $i is even, it is the first
768 # code point above 255 that matches, which means there was no
769 # matching range that crossed the boundary, and we don't want
770 # to include this code point, so splice before it.
771 $i++ if $i % 2 != 0;
772
773 # Remove everything past this.
774 splice @invlist, $i;
99f21fb9 775 splice @invmap, $i if @invmap;
0f5e3c71
KW
776 last;
777 }
0c4ecf42
KW
778 }
779 }
0f5e3c71
KW
780 elsif ($nonl1_only) {
781 my $found_nonl1 = 0;
782 for my $i (0 .. @invlist - 1 - 1) {
783 next if $invlist[$i] < 256;
784
785 # Here, we have the first element in the array that indicates an
786 # element above Latin1. Get rid of all previous ones.
787 splice @invlist, 0, $i;
99f21fb9 788 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
789
790 # If this one's index is not divisible by 2, it means that this
791 # element is inverting away from being in the list, which means
99f21fb9
KW
792 # all code points from 256 to this one are in this list (or
793 # map to the default for inversion maps)
794 if ($i % 2 != 0) {
795 unshift @invlist, 256;
796 unshift @invmap, $map_default if @invmap;
797 }
0f5e3c71 798 $found_nonl1 = 1;
3f427fd9
KW
799 last;
800 }
0f5e3c71 801 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 802 }
3f427fd9 803
0f5e3c71 804 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 805 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 806 }
bffc0129 807 end_file_pound_if;
0c4ecf42 808 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
809}
810
2308ab83 811my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
812my @sources = ($0, qw(lib/unicore/mktables
813 lib/Unicode/UCD.pm
814 regen/charset_translations.pl
815 ));
9a3da3ad
FC
816{
817 # Depend on mktables’ own sources. It’s a shorter list of files than
818 # those that Unicode::UCD uses.
2308ab83
KW
819 if (! open my $mktables_list, $sources_list) {
820
821 # This should force a rebuild once $sources_list exists
822 push @sources, $sources_list;
823 }
824 else {
825 while(<$mktables_list>) {
826 last if /===/;
827 chomp;
828 push @sources, "lib/unicore/$_" if /^[^#]/;
829 }
9a3da3ad
FC
830 }
831}
832read_only_bottom_close_and_rename($out_fh, \@sources)