This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Use property's real values
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
82 sb => [
83 'ATerm',
84 'Close',
85 'CR',
86 'Extend',
87 'Format',
88 'LF',
89 'Lower',
90 'Numeric',
91 'OLetter',
92 'Other',
93 'SContinue',
94 'Sep',
95 'Sp',
96 'STerm',
97 'Upper',
98 ],
99 wb => [
100 'ALetter',
101 'CR',
102 'Double_Quote',
103 'Extend',
104 'ExtendNumLet',
105 'Format',
106 'Hebrew_Letter',
107 'Katakana',
108 'LF',
109 'MidLetter',
110 'MidNum',
111 'MidNumLet',
112 'Newline',
113 'Numeric',
114 'Other',
f1f6961f 115 'Perl_Tailored_HSpace',
f79a09fc
KW
116 'Regional_Indicator',
117 'Single_Quote',
f79a09fc
KW
118 ],
119);
120
99f21fb9
KW
121my @a2n;
122
123sub uniques {
124 # Returns non-duplicated input values. From "Perl Best Practices:
125 # Encapsulated Cleverness". p. 455 in first edition.
126
127 my %seen;
128 return grep { ! $seen{$_}++ } @_;
129}
130
131sub a2n($) {
132 my $cp = shift;
133
134 # Returns the input Unicode code point translated to native.
135
136 return $cp if $cp !~ $numeric_re || $cp > 255;
137 return $a2n[$cp];
138}
139
bffc0129
KW
140sub end_file_pound_if {
141 if ($in_file_pound_if) {
142 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
143 $in_file_pound_if = 0;
144 }
145}
146
147sub switch_pound_if ($$) {
148 my $name = shift;
149 my $new_pound_if = shift;
150
151 # Switch to new #if given by the 2nd argument. If there is an override
152 # for this, it instead switches to that. The 1st argument is the
153 # static's name, used to look up the overrides
154
155 if (exists $exceptions_to_where_to_define{$name}) {
156 $new_pound_if = $exceptions_to_where_to_define{$name};
157 }
158
159 # Exit current #if if the new one is different from the old
160 if ($in_file_pound_if
161 && $in_file_pound_if !~ /$new_pound_if/)
162 {
163 end_file_pound_if;
164 }
165
166 # Enter new #if, if not already in it.
167 if (! $in_file_pound_if) {
168 $in_file_pound_if = "defined($new_pound_if)";
169 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
170 }
171}
172
0c4ecf42 173sub output_invlist ($$;$) {
9d9177be
KW
174 my $name = shift;
175 my $invlist = shift; # Reference to inversion list array
0c4ecf42 176 my $charset = shift // ""; # name of character set for comment
9d9177be 177
76d3994c 178 die "No inversion list for $name" unless defined $invlist
ad85f59a 179 && ref $invlist eq 'ARRAY';
76d3994c 180
9d9177be
KW
181 # Output the inversion list $invlist using the name $name for it.
182 # It is output in the exact internal form for inversion lists.
183
a0316a6c
KW
184 # Is the last element of the header 0, or 1 ?
185 my $zero_or_one = 0;
ad85f59a 186 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 187 unshift @$invlist, 0;
9d9177be
KW
188 $zero_or_one = 1;
189 }
0a07b44b 190 my $count = @$invlist;
9d9177be 191
bffc0129 192 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 193
0c4ecf42
KW
194 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
195 print $out_fh " /* for $charset */" if $charset;
196 print $out_fh "\n";
9d9177be 197
a0316a6c 198 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
199 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
200 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
201 ",\t/* 0 if the list starts at 0;",
202 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
203
204 # The main body are the UVs passed in to this routine. Do the final
205 # element separately
47d53124
KW
206 for my $i (0 .. @$invlist - 1) {
207 printf $out_fh "\t0x%X", $invlist->[$i];
208 print $out_fh "," if $i < @$invlist - 1;
209 print $out_fh "\n";
9d9177be
KW
210 }
211
9d9177be
KW
212 print $out_fh "};\n";
213}
214
99f21fb9
KW
215sub output_invmap ($$$$$$$) {
216 my $name = shift;
217 my $invmap = shift; # Reference to inversion map array
218 my $prop_name = shift;
219 my $input_format = shift; # The inversion map's format
220 my $default = shift; # The property value for code points who
221 # otherwise don't have a value specified.
222 my $extra_enums = shift; # comma-separated list of our additions to the
223 # property's standard possible values
224 my $charset = shift // ""; # name of character set for comment
225
226 # Output the inversion map $invmap for property $prop_name, but use $name
227 # as the actual data structure's name.
228
229 my $count = @$invmap;
230
231 my $output_format;
232 my $declaration_type;
233 my %enums;
234 my $name_prefix;
235
236 if ($input_format eq 's') {
b83e6484 237 my $orig_prop_name = $prop_name;
02f811dd
KW
238 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
239 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
b83e6484
KW
240 my @enums;
241 if ($orig_prop_name eq $prop_name) {
242 @enums = prop_values($prop_name);
243 }
244 else {
245 @enums = uniques(@$invmap);
246 }
99f21fb9
KW
247 if (! @enums) {
248 die "Only enum properties are currently handled; '$prop_name' isn't one";
249 }
250 else {
251
f79a09fc 252 # Convert short names to long
99f21fb9 253 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
254
255 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
256 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
257 if @expected_enums < @enums;
258
259 # Remove the enums found in the input from the ones we expect
260 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
261 splice(@expected_enums, $i, 1)
262 if grep { $expected_enums[$i] eq $_ } @enums;
263 }
264
265 # The ones remaining must be because we're using an older
266 # Unicode version. Add them to the list.
267 push @enums, @expected_enums;
268
269 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
270 @enums = sort @enums;
271
6dc80864
KW
272 # The internal enums comes last.
273 push @enums, split /,/, $extra_enums if $extra_enums ne "";
274
99f21fb9
KW
275 # Assign a value to each element of the enum. The default
276 # value always gets 0; the others are arbitrarily assigned.
277 my $enum_val = 0;
02f811dd
KW
278 my $canonical_default = prop_value_aliases($prop_name, $default);
279 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
280 $enums{$default} = $enum_val++;
281 for my $enum (@enums) {
282 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
283 }
284 }
285
bffc0129
KW
286 # Inversion map stuff is currently used only by regexec
287 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
288 {
289
99f21fb9
KW
290 # The short names tend to be two lower case letters, but it looks
291 # better for those if they are upper. XXX
292 $short_name = uc($short_name) if length($short_name) < 3
293 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 294 $name_prefix = "${short_name}_";
99f21fb9
KW
295 my $enum_count = keys %enums;
296 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
297
298 print $out_fh "\ntypedef enum {\n";
6dc80864
KW
299 my @enum_list;
300 foreach my $enum (keys %enums) {
301 $enum_list[$enums{$enum}] = $enum;
302 }
303 foreach my $i (0 .. @enum_list - 1) {
304 my $name = $enum_list[$i];
305 print $out_fh "\t${name_prefix}$name = $i";
306 print $out_fh "," if $i < $enum_count - 1;
307 print $out_fh "\n";
99f21fb9
KW
308 }
309 $declaration_type = "${name_prefix}enum";
310 print $out_fh "} $declaration_type;\n";
311
312 $output_format = "${name_prefix}%s";
313 }
314 }
315 else {
316 die "'$input_format' invmap() format for '$prop_name' unimplemented";
317 }
318
319 die "No inversion map for $prop_name" unless defined $invmap
320 && ref $invmap eq 'ARRAY'
321 && $count;
322
323 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
324 print $out_fh " /* for $charset */" if $charset;
325 print $out_fh "\n";
326
327 # The main body are the scalars passed in to this routine.
328 for my $i (0 .. $count - 1) {
329 my $element = $invmap->[$i];
02f811dd
KW
330 my $full_element_name = prop_value_aliases($prop_name, $element);
331 $element = $full_element_name if defined $full_element_name;
332 $element = $name_prefix . $element;
99f21fb9
KW
333 print $out_fh "\t$element";
334 print $out_fh "," if $i < $count - 1;
335 print $out_fh "\n";
336 }
337 print $out_fh "};\n";
99f21fb9
KW
338}
339
5a7e5385 340sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
341
342 # Returns an inversion list constructed from the sorted input array of
343 # code points
344
345 my $list_ref = shift;
346
99f21fb9
KW
347 return unless @$list_ref;
348
a02047bf
KW
349 # Initialize to just the first element
350 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
351
352 # For each succeeding element, if it extends the previous range, adjust
353 # up, otherwise add it.
354 for my $i (1 .. @$list_ref - 1) {
355 if ($invlist[-1] == $list_ref->[$i]) {
356 $invlist[-1]++;
357 }
358 else {
359 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
360 }
361 }
362 return @invlist;
363}
364
365# Read in the Case Folding rules, and construct arrays of code points for the
366# properties we need.
367my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
368die "Could not find inversion map for Case_Folding" unless defined $format;
369die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
370 unless $format eq 'al'
371 || $format eq 'a';
a02047bf
KW
372my @has_multi_char_fold;
373my @is_non_final_fold;
374
375for my $i (0 .. @$folds_ref - 1) {
376 next unless ref $folds_ref->[$i]; # Skip single-char folds
377 push @has_multi_char_fold, $cp_ref->[$i];
378
b6a6e956 379 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
380 # position
381 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
382 push @is_non_final_fold, $folds_ref->[$i][$j]
383 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
384 }
385}
386
a02047bf
KW
387sub _Perl_Non_Final_Folds {
388 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 389 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
390}
391
99f21fb9
KW
392sub prop_name_for_cmp ($) { # Sort helper
393 my $name = shift;
394
395 # Returns the input lowercased, with non-alphas removed, as well as
396 # everything starting with a comma
397
398 $name =~ s/,.*//;
399 $name =~ s/[[:^alpha:]]//g;
400 return lc $name;
401}
402
892d8259 403sub UpperLatin1 {
5a7e5385 404 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
405}
406
9d9177be
KW
407output_invlist("Latin1", [ 0, 256 ]);
408output_invlist("AboveLatin1", [ 256 ]);
409
bffc0129 410end_file_pound_if;
43b443dd 411
3f427fd9
KW
412# We construct lists for all the POSIX and backslash sequence character
413# classes in two forms:
414# 1) ones which match only in the ASCII range
415# 2) ones which match either in the Latin1 range, or the entire Unicode range
416#
417# These get compiled in, and hence affect the memory footprint of every Perl
418# program, even those not using Unicode. To minimize the size, currently
419# the Latin1 version is generated for the beyond ASCII range except for those
420# lists that are quite small for the entire range, such as for \s, which is 22
421# UVs long plus 4 UVs (currently) for the header.
422#
423# To save even more memory, the ASCII versions could be derived from the
424# larger ones at runtime, saving some memory (minus the expense of the machine
425# instructions to do so), but these are all small anyway, so their total is
426# about 100 UVs.
427#
428# In the list of properties below that get generated, the L1 prefix is a fake
429# property that means just the Latin1 range of the full property (whose name
430# has an X prefix instead of L1).
a02047bf
KW
431#
432# An initial & means to use the subroutine from this file instead of an
433# official inversion list.
3f427fd9 434
0c4ecf42
KW
435for my $charset (get_supported_code_pages()) {
436 print $out_fh "\n" . get_conditional_compile_line_start($charset);
437
99f21fb9
KW
438 @a2n = @{get_a2n($charset)};
439 no warnings 'qw';
440 # Ignore non-alpha in sort
441 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
442 ASCII
443 Cased
444 VertSpace
445 XPerlSpace
446 XPosixAlnum
447 XPosixAlpha
448 XPosixBlank
449 XPosixCntrl
450 XPosixDigit
451 XPosixGraph
452 XPosixLower
453 XPosixPrint
454 XPosixPunct
455 XPosixSpace
456 XPosixUpper
457 XPosixWord
458 XPosixXDigit
459 _Perl_Any_Folds
460 &NonL1_Perl_Non_Final_Folds
461 _Perl_Folds_To_Multi_Char
462 &UpperLatin1
463 _Perl_IDStart
464 _Perl_IDCont
02f811dd 465 _Perl_GCB,EDGE
bf4268fa 466 _Perl_SB,EDGE
190d69bb 467 _Perl_WB,EDGE,UNKNOWN
1c8c3428 468 )
0f5e3c71
KW
469 ) {
470
471 # For the Latin1 properties, we change to use the eXtended version of the
472 # base property, then go through the result and get rid of everything not
473 # in Latin1 (above 255). Actually, we retain the element for the range
474 # that crosses the 255/256 boundary if it is one that matches the
475 # property. For example, in the Word property, there is a range of code
476 # points that start at U+00F8 and goes through U+02C1. Instead of
477 # artificially cutting that off at 256 because 256 is the first code point
478 # above Latin1, we let the range go to its natural ending. That gives us
479 # extra information with no added space taken. But if the range that
480 # crosses the boundary is one that doesn't match the property, we don't
481 # start a new range above 255, as that could be construed as going to
482 # infinity. For example, the Upper property doesn't include the character
483 # at 255, but does include the one at 256. We don't include the 256 one.
484 my $prop_name = $prop;
485 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
486 my $extra_enums = "";
487 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
488 my $lookup_prop = $prop_name;
489 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
490 or $lookup_prop =~ s/^L1//);
491 my $nonl1_only = 0;
492 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 493 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
494
495 my @invlist;
99f21fb9
KW
496 my @invmap;
497 my $map_format;
498 my $map_default;
499 my $maps_to_code_point;
500 my $to_adjust;
0f5e3c71
KW
501 if ($is_local_sub) {
502 @invlist = eval $lookup_prop;
503 }
504 else {
505 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 506 if (! @invlist) {
99f21fb9 507
ad85f59a
KW
508 # If couldn't find a non-empty inversion list, see if it is
509 # instead an inversion map
510 my ($list_ref, $map_ref, $format, $default)
99f21fb9 511 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
512 if (! $list_ref) {
513 # An empty return here could mean an unknown property, or
514 # merely that the original inversion list is empty. Call
515 # in scalar context to differentiate
516 my $count = prop_invlist($lookup_prop,
517 '_perl_core_internal_ok');
518 die "Could not find inversion list for '$lookup_prop'"
519 unless defined $count;
520 }
521 else {
18b852b3
KW
522 @invlist = @$list_ref;
523 @invmap = @$map_ref;
524 $map_format = $format;
525 $map_default = $default;
526 $maps_to_code_point = $map_format =~ /x/;
527 $to_adjust = $map_format =~ /a/;
ad85f59a 528 }
99f21fb9 529 }
0f5e3c71 530 }
ad85f59a
KW
531
532
533 # Short-circuit an empty inversion list.
534 if (! @invlist) {
535 output_invlist($prop_name, \@invlist, $charset);
536 next;
537 }
ceb1de32 538
99f21fb9
KW
539 # Re-order the Unicode code points to native ones for this platform.
540 # This is only needed for code points below 256, because native code
541 # points are only in that range. For inversion maps of properties
542 # where the mappings are adjusted (format =~ /a/), this reordering
543 # could mess up the adjustment pattern that was in the input, so that
544 # has to be dealt with.
545 #
546 # And inversion maps that map to code points need to eventually have
547 # all those code points remapped to native, and it's better to do that
548 # here, going through the whole list not just those below 256. This
549 # is because some inversion maps have adjustments (format =~ /a/)
550 # which may be affected by the reordering. This code needs to be done
551 # both for when we are translating the inversion lists for < 256, and
552 # for the inversion maps for everything. By doing both in this loop,
553 # we can share that code.
554 #
555 # So, we go through everything for an inversion map to code points;
556 # otherwise, we can skip any remapping at all if we are going to
557 # output only the above-Latin1 values, or if the range spans the whole
558 # of 0..256, as the remap will also include all of 0..256 (256 not
559 # 255 because a re-ordering could cause 256 to need to be in the same
560 # range as 255.)
561 if ((@invmap && $maps_to_code_point)
562 || (! $nonl1_only || ($invlist[0] < 256
563 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 564 {
fb4554ea 565
99f21fb9 566 if (! @invmap) { # Straight inversion list
fb4554ea
KW
567 # Look at all the ranges that start before 257.
568 my @latin1_list;
569 while (@invlist) {
570 last if $invlist[0] > 256;
571 my $upper = @invlist > 1
572 ? $invlist[1] - 1 # In range
8a6c81cf
KW
573
574 # To infinity. You may want to stop much much
575 # earlier; going this high may expose perl
576 # deficiencies with very large numbers.
577 : $Unicode::UCD::MAX_CP;
fb4554ea 578 for my $j ($invlist[0] .. $upper) {
99f21fb9 579 push @latin1_list, a2n($j);
0f5e3c71 580 }
fb4554ea
KW
581
582 shift @invlist; # Shift off the range that's in the list
583 shift @invlist; # Shift off the range not in the list
0c4ecf42 584 }
fb4554ea
KW
585
586 # Here @invlist contains all the ranges in the original that start
587 # at code points above 256, and @latin1_list contains all the
588 # native code points for ranges that start with a Unicode code
589 # point below 257. We sort the latter and convert it to inversion
590 # list format. Then simply prepend it to the list of the higher
591 # code points.
592 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 593 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 594 unshift @invlist, @latin1_list;
99f21fb9
KW
595 }
596 else { # Is an inversion map
597
598 # This is a similar procedure as plain inversion list, but has
599 # multiple buckets. A plain inversion list just has two
600 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
601 # pretty much can ignore the 2nd bucket, as it is completely
602 # defined by the 1st. But here, what we do is create buckets
603 # which contain the code points that map to each, translated
604 # to native and turned into an inversion list. Thus each
605 # bucket is an inversion list of native code points that map
606 # to it or don't map to it. We use these to create an
607 # inversion map for the whole property.
608
609 # As mentioned earlier, we use this procedure to not just
610 # remap the inversion list to native values, but also the maps
611 # of code points to native ones. In the latter case we have
612 # to look at the whole of the inversion map (or at least to
613 # above Unicode; as the maps of code points above that should
614 # all be to the default).
615 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
616
617 my %mapped_lists; # A hash whose keys are the buckets.
618 while (@invlist) {
619 last if $invlist[0] > $upper_limit;
620
621 # This shouldn't actually happen, as prop_invmap() returns
622 # an extra element at the end that is beyond $upper_limit
623 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
624
625 my $bucket;
626
627 # A hash key can't be a ref (we are only expecting arrays
628 # of scalars here), so convert any such to a string that
629 # will be converted back later (using a vertical tab as
630 # the separator). Even if the mapping is to code points,
631 # we don't translate to native here because the code
632 # output_map() calls to output these arrays assumes the
633 # input is Unicode, not native.
634 if (ref $invmap[0]) {
635 $bucket = join "\cK", @{$invmap[0]};
636 }
637 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
638
639 # Do convert to native for maps to single code points.
640 # There are some properties that have a few outlier
641 # maps that aren't code points, so the above test
642 # skips those.
643 $bucket = a2n($invmap[0]);
644 } else {
645 $bucket = $invmap[0];
646 }
647
648 # We now have the bucket that all code points in the range
649 # map to, though possibly they need to be adjusted. Go
650 # through the range and put each translated code point in
651 # it into its bucket.
652 my $base_map = $invmap[0];
653 for my $j ($invlist[0] .. $invlist[1] - 1) {
654 if ($to_adjust
655 # The 1st code point doesn't need adjusting
656 && $j > $invlist[0]
657
658 # Skip any non-numeric maps: these are outliers
659 # that aren't code points.
660 && $base_map =~ $numeric_re
661
662 # 'ne' because the default can be a string
663 && $base_map ne $map_default)
664 {
665 # We adjust, by incrementing each the bucket and
666 # the map. For code point maps, translate to
667 # native
668 $base_map++;
669 $bucket = ($maps_to_code_point)
670 ? a2n($base_map)
671 : $base_map;
672 }
673
674 # Add the native code point to the bucket for the
675 # current map
676 push @{$mapped_lists{$bucket}}, a2n($j);
677 } # End of loop through all code points in the range
678
679 # Get ready for the next range
680 shift @invlist;
681 shift @invmap;
682 } # End of loop through all ranges in the map.
683
684 # Here, @invlist and @invmap retain all the ranges from the
685 # originals that start with code points above $upper_limit.
686 # Each bucket in %mapped_lists contains all the code points
687 # that map to that bucket. If the bucket is for a map to a
688 # single code point is a single code point, the bucket has
689 # been converted to native. If something else (including
690 # multiple code points), no conversion is done.
691 #
692 # Now we recreate the inversion map into %xlated, but this
693 # time for the native character set.
694 my %xlated;
695 foreach my $bucket (keys %mapped_lists) {
696
697 # Sort and convert this bucket to an inversion list. The
698 # result will be that ranges that start with even-numbered
699 # indexes will be for code points that map to this bucket;
700 # odd ones map to some other bucket, and are discarded
701 # below.
702 @{$mapped_lists{$bucket}}
703 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
704 @{$mapped_lists{$bucket}}
705 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
706
707 # Add each even-numbered range in the bucket to %xlated;
708 # so that the keys of %xlated become the range start code
709 # points, and the values are their corresponding maps.
710 while (@{$mapped_lists{$bucket}}) {
711 my $range_start = $mapped_lists{$bucket}->[0];
712 if ($bucket =~ /\cK/) {
713 @{$xlated{$range_start}} = split /\cK/, $bucket;
714 }
715 else {
716 $xlated{$range_start} = $bucket;
717 }
718 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
719 shift @{$mapped_lists{$bucket}}; # Get ready for next
720 # iteration
721 }
722 } # End of loop through all the buckets.
723
724 # Here %xlated's keys are the range starts of all the code
725 # points in the inversion map. Construct an inversion list
726 # from them.
727 my @new_invlist = sort { $a <=> $b } keys %xlated;
728
729 # If the list is adjusted, we want to munge this list so that
730 # we only have one entry for where consecutive code points map
731 # to consecutive values. We just skip the subsequent entries
732 # where this is the case.
733 if ($to_adjust) {
734 my @temp;
735 for my $i (0 .. @new_invlist - 1) {
736 next if $i > 0
737 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
738 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
739 && $xlated{$new_invlist[$i]} =~ $numeric_re
740 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
741 push @temp, $new_invlist[$i];
742 }
743 @new_invlist = @temp;
744 }
745
746 # The inversion map comes from %xlated's values. We can
747 # unshift each onto the front of the untouched portion, in
748 # reverse order of the portion we did process.
749 foreach my $start (reverse @new_invlist) {
750 unshift @invmap, $xlated{$start};
751 }
752
753 # Finally prepend the inversion list we have just constructed to the
754 # one that contains anything we didn't process.
755 unshift @invlist, @new_invlist;
756 }
757 }
758
759 # prop_invmap() returns an extra final entry, which we can now
760 # discard.
761 if (@invmap) {
762 pop @invlist;
763 pop @invmap;
ceb1de32 764 }
0f5e3c71
KW
765
766 if ($l1_only) {
99f21fb9 767 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
768 for my $i (0 .. @invlist - 1 - 1) {
769 if ($invlist[$i] > 255) {
770
771 # In an inversion list, even-numbered elements give the code
772 # points that begin ranges that match the property;
773 # odd-numbered give ones that begin ranges that don't match.
774 # If $i is odd, we are at the first code point above 255 that
775 # doesn't match, which means the range it is ending does
776 # match, and crosses the 255/256 boundary. We want to include
777 # this ending point, so increment $i, so the splice below
778 # includes it. Conversely, if $i is even, it is the first
779 # code point above 255 that matches, which means there was no
780 # matching range that crossed the boundary, and we don't want
781 # to include this code point, so splice before it.
782 $i++ if $i % 2 != 0;
783
784 # Remove everything past this.
785 splice @invlist, $i;
99f21fb9 786 splice @invmap, $i if @invmap;
0f5e3c71
KW
787 last;
788 }
0c4ecf42
KW
789 }
790 }
0f5e3c71
KW
791 elsif ($nonl1_only) {
792 my $found_nonl1 = 0;
793 for my $i (0 .. @invlist - 1 - 1) {
794 next if $invlist[$i] < 256;
795
796 # Here, we have the first element in the array that indicates an
797 # element above Latin1. Get rid of all previous ones.
798 splice @invlist, 0, $i;
99f21fb9 799 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
800
801 # If this one's index is not divisible by 2, it means that this
802 # element is inverting away from being in the list, which means
99f21fb9
KW
803 # all code points from 256 to this one are in this list (or
804 # map to the default for inversion maps)
805 if ($i % 2 != 0) {
806 unshift @invlist, 256;
807 unshift @invmap, $map_default if @invmap;
808 }
0f5e3c71 809 $found_nonl1 = 1;
3f427fd9
KW
810 last;
811 }
0f5e3c71 812 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 813 }
3f427fd9 814
0f5e3c71 815 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 816 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 817 }
bffc0129 818 end_file_pound_if;
0c4ecf42 819 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
820}
821
2308ab83 822my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
823my @sources = ($0, qw(lib/unicore/mktables
824 lib/Unicode/UCD.pm
825 regen/charset_translations.pl
826 ));
9a3da3ad
FC
827{
828 # Depend on mktables’ own sources. It’s a shorter list of files than
829 # those that Unicode::UCD uses.
2308ab83
KW
830 if (! open my $mktables_list, $sources_list) {
831
832 # This should force a rebuild once $sources_list exists
833 push @sources, $sources_list;
834 }
835 else {
836 while(<$mktables_list>) {
837 last if /===/;
838 chomp;
839 push @sources, "lib/unicore/$_" if /^[^#]/;
840 }
9a3da3ad
FC
841 }
842}
843read_only_bottom_close_and_rename($out_fh, \@sources)