This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/ebcdic.pl: Output tables in hex
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
82 sb => [
83 'ATerm',
84 'Close',
85 'CR',
86 'Extend',
87 'Format',
88 'LF',
89 'Lower',
90 'Numeric',
91 'OLetter',
92 'Other',
93 'SContinue',
94 'Sep',
95 'Sp',
96 'STerm',
97 'Upper',
98 ],
99 wb => [
100 'ALetter',
101 'CR',
102 'Double_Quote',
103 'Extend',
104 'ExtendNumLet',
105 'Format',
106 'Hebrew_Letter',
107 'Katakana',
108 'LF',
109 'MidLetter',
110 'MidNum',
111 'MidNumLet',
112 'Newline',
113 'Numeric',
114 'Other',
115 'Regional_Indicator',
116 'Single_Quote',
117 'UNKNOWN',
118 ],
119);
120
99f21fb9
KW
121my @a2n;
122
123sub uniques {
124 # Returns non-duplicated input values. From "Perl Best Practices:
125 # Encapsulated Cleverness". p. 455 in first edition.
126
127 my %seen;
128 return grep { ! $seen{$_}++ } @_;
129}
130
131sub a2n($) {
132 my $cp = shift;
133
134 # Returns the input Unicode code point translated to native.
135
136 return $cp if $cp !~ $numeric_re || $cp > 255;
137 return $a2n[$cp];
138}
139
bffc0129
KW
140sub end_file_pound_if {
141 if ($in_file_pound_if) {
142 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
143 $in_file_pound_if = 0;
144 }
145}
146
147sub switch_pound_if ($$) {
148 my $name = shift;
149 my $new_pound_if = shift;
150
151 # Switch to new #if given by the 2nd argument. If there is an override
152 # for this, it instead switches to that. The 1st argument is the
153 # static's name, used to look up the overrides
154
155 if (exists $exceptions_to_where_to_define{$name}) {
156 $new_pound_if = $exceptions_to_where_to_define{$name};
157 }
158
159 # Exit current #if if the new one is different from the old
160 if ($in_file_pound_if
161 && $in_file_pound_if !~ /$new_pound_if/)
162 {
163 end_file_pound_if;
164 }
165
166 # Enter new #if, if not already in it.
167 if (! $in_file_pound_if) {
168 $in_file_pound_if = "defined($new_pound_if)";
169 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
170 }
171}
172
0c4ecf42 173sub output_invlist ($$;$) {
9d9177be
KW
174 my $name = shift;
175 my $invlist = shift; # Reference to inversion list array
0c4ecf42 176 my $charset = shift // ""; # name of character set for comment
9d9177be 177
76d3994c 178 die "No inversion list for $name" unless defined $invlist
ad85f59a 179 && ref $invlist eq 'ARRAY';
76d3994c 180
9d9177be
KW
181 # Output the inversion list $invlist using the name $name for it.
182 # It is output in the exact internal form for inversion lists.
183
a0316a6c
KW
184 # Is the last element of the header 0, or 1 ?
185 my $zero_or_one = 0;
ad85f59a 186 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 187 unshift @$invlist, 0;
9d9177be
KW
188 $zero_or_one = 1;
189 }
0a07b44b 190 my $count = @$invlist;
9d9177be 191
bffc0129 192 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 193
0c4ecf42
KW
194 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
195 print $out_fh " /* for $charset */" if $charset;
196 print $out_fh "\n";
9d9177be 197
a0316a6c 198 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
199 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
200 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
201 ",\t/* 0 if the list starts at 0;",
202 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
203
204 # The main body are the UVs passed in to this routine. Do the final
205 # element separately
47d53124
KW
206 for my $i (0 .. @$invlist - 1) {
207 printf $out_fh "\t0x%X", $invlist->[$i];
208 print $out_fh "," if $i < @$invlist - 1;
209 print $out_fh "\n";
9d9177be
KW
210 }
211
9d9177be
KW
212 print $out_fh "};\n";
213}
214
99f21fb9
KW
215sub output_invmap ($$$$$$$) {
216 my $name = shift;
217 my $invmap = shift; # Reference to inversion map array
218 my $prop_name = shift;
219 my $input_format = shift; # The inversion map's format
220 my $default = shift; # The property value for code points who
221 # otherwise don't have a value specified.
222 my $extra_enums = shift; # comma-separated list of our additions to the
223 # property's standard possible values
224 my $charset = shift // ""; # name of character set for comment
225
226 # Output the inversion map $invmap for property $prop_name, but use $name
227 # as the actual data structure's name.
228
229 my $count = @$invmap;
230
231 my $output_format;
232 my $declaration_type;
233 my %enums;
234 my $name_prefix;
235
236 if ($input_format eq 's') {
02f811dd
KW
237 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
238 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
99f21fb9
KW
239 my @enums = prop_values($prop_name);
240 if (! @enums) {
241 die "Only enum properties are currently handled; '$prop_name' isn't one";
242 }
243 else {
244
f79a09fc 245 # Convert short names to long
99f21fb9 246 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
247
248 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
249 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
250 if @expected_enums < @enums;
251
252 # Remove the enums found in the input from the ones we expect
253 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
254 splice(@expected_enums, $i, 1)
255 if grep { $expected_enums[$i] eq $_ } @enums;
256 }
257
258 # The ones remaining must be because we're using an older
259 # Unicode version. Add them to the list.
260 push @enums, @expected_enums;
261
262 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
263 push @enums, split /,/, $extra_enums if $extra_enums ne "";
264 @enums = sort @enums;
265
266 # Assign a value to each element of the enum. The default
267 # value always gets 0; the others are arbitrarily assigned.
268 my $enum_val = 0;
02f811dd
KW
269 my $canonical_default = prop_value_aliases($prop_name, $default);
270 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
271 $enums{$default} = $enum_val++;
272 for my $enum (@enums) {
273 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
274 }
275 }
276
bffc0129
KW
277 # Inversion map stuff is currently used only by regexec
278 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
279 {
280
99f21fb9
KW
281 # The short names tend to be two lower case letters, but it looks
282 # better for those if they are upper. XXX
283 $short_name = uc($short_name) if length($short_name) < 3
284 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 285 $name_prefix = "${short_name}_";
99f21fb9
KW
286 my $enum_count = keys %enums;
287 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
288
289 print $out_fh "\ntypedef enum {\n";
290 print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
291 delete $enums{$default};
292 foreach my $enum (sort { $a cmp $b } keys %enums) {
293 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
294 print $out_fh "," if $enums{$enum} < $enum_count - 1;
295 print $out_fh "\n";
296 }
297 $declaration_type = "${name_prefix}enum";
298 print $out_fh "} $declaration_type;\n";
299
300 $output_format = "${name_prefix}%s";
301 }
302 }
303 else {
304 die "'$input_format' invmap() format for '$prop_name' unimplemented";
305 }
306
307 die "No inversion map for $prop_name" unless defined $invmap
308 && ref $invmap eq 'ARRAY'
309 && $count;
310
311 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
312 print $out_fh " /* for $charset */" if $charset;
313 print $out_fh "\n";
314
315 # The main body are the scalars passed in to this routine.
316 for my $i (0 .. $count - 1) {
317 my $element = $invmap->[$i];
02f811dd
KW
318 my $full_element_name = prop_value_aliases($prop_name, $element);
319 $element = $full_element_name if defined $full_element_name;
320 $element = $name_prefix . $element;
99f21fb9
KW
321 print $out_fh "\t$element";
322 print $out_fh "," if $i < $count - 1;
323 print $out_fh "\n";
324 }
325 print $out_fh "};\n";
99f21fb9
KW
326}
327
5a7e5385 328sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
329
330 # Returns an inversion list constructed from the sorted input array of
331 # code points
332
333 my $list_ref = shift;
334
99f21fb9
KW
335 return unless @$list_ref;
336
a02047bf
KW
337 # Initialize to just the first element
338 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
339
340 # For each succeeding element, if it extends the previous range, adjust
341 # up, otherwise add it.
342 for my $i (1 .. @$list_ref - 1) {
343 if ($invlist[-1] == $list_ref->[$i]) {
344 $invlist[-1]++;
345 }
346 else {
347 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
348 }
349 }
350 return @invlist;
351}
352
353# Read in the Case Folding rules, and construct arrays of code points for the
354# properties we need.
355my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
356die "Could not find inversion map for Case_Folding" unless defined $format;
357die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
358 unless $format eq 'al'
359 || $format eq 'a';
a02047bf
KW
360my @has_multi_char_fold;
361my @is_non_final_fold;
362
363for my $i (0 .. @$folds_ref - 1) {
364 next unless ref $folds_ref->[$i]; # Skip single-char folds
365 push @has_multi_char_fold, $cp_ref->[$i];
366
b6a6e956 367 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
368 # position
369 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
370 push @is_non_final_fold, $folds_ref->[$i][$j]
371 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
372 }
373}
374
a02047bf
KW
375sub _Perl_Non_Final_Folds {
376 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 377 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
378}
379
99f21fb9
KW
380sub prop_name_for_cmp ($) { # Sort helper
381 my $name = shift;
382
383 # Returns the input lowercased, with non-alphas removed, as well as
384 # everything starting with a comma
385
386 $name =~ s/,.*//;
387 $name =~ s/[[:^alpha:]]//g;
388 return lc $name;
389}
390
892d8259 391sub UpperLatin1 {
5a7e5385 392 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
393}
394
9d9177be
KW
395output_invlist("Latin1", [ 0, 256 ]);
396output_invlist("AboveLatin1", [ 256 ]);
397
bffc0129 398end_file_pound_if;
43b443dd 399
3f427fd9
KW
400# We construct lists for all the POSIX and backslash sequence character
401# classes in two forms:
402# 1) ones which match only in the ASCII range
403# 2) ones which match either in the Latin1 range, or the entire Unicode range
404#
405# These get compiled in, and hence affect the memory footprint of every Perl
406# program, even those not using Unicode. To minimize the size, currently
407# the Latin1 version is generated for the beyond ASCII range except for those
408# lists that are quite small for the entire range, such as for \s, which is 22
409# UVs long plus 4 UVs (currently) for the header.
410#
411# To save even more memory, the ASCII versions could be derived from the
412# larger ones at runtime, saving some memory (minus the expense of the machine
413# instructions to do so), but these are all small anyway, so their total is
414# about 100 UVs.
415#
416# In the list of properties below that get generated, the L1 prefix is a fake
417# property that means just the Latin1 range of the full property (whose name
418# has an X prefix instead of L1).
a02047bf
KW
419#
420# An initial & means to use the subroutine from this file instead of an
421# official inversion list.
3f427fd9 422
0c4ecf42
KW
423for my $charset (get_supported_code_pages()) {
424 print $out_fh "\n" . get_conditional_compile_line_start($charset);
425
99f21fb9
KW
426 @a2n = @{get_a2n($charset)};
427 no warnings 'qw';
428 # Ignore non-alpha in sort
429 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
430 ASCII
431 Cased
432 VertSpace
433 XPerlSpace
434 XPosixAlnum
435 XPosixAlpha
436 XPosixBlank
437 XPosixCntrl
438 XPosixDigit
439 XPosixGraph
440 XPosixLower
441 XPosixPrint
442 XPosixPunct
443 XPosixSpace
444 XPosixUpper
445 XPosixWord
446 XPosixXDigit
447 _Perl_Any_Folds
448 &NonL1_Perl_Non_Final_Folds
449 _Perl_Folds_To_Multi_Char
450 &UpperLatin1
451 _Perl_IDStart
452 _Perl_IDCont
02f811dd 453 _Perl_GCB,EDGE
bf4268fa
KW
454 _Perl_WB,EDGE,UNKNOWN
455 _Perl_SB,EDGE
1c8c3428 456 )
0f5e3c71
KW
457 ) {
458
459 # For the Latin1 properties, we change to use the eXtended version of the
460 # base property, then go through the result and get rid of everything not
461 # in Latin1 (above 255). Actually, we retain the element for the range
462 # that crosses the 255/256 boundary if it is one that matches the
463 # property. For example, in the Word property, there is a range of code
464 # points that start at U+00F8 and goes through U+02C1. Instead of
465 # artificially cutting that off at 256 because 256 is the first code point
466 # above Latin1, we let the range go to its natural ending. That gives us
467 # extra information with no added space taken. But if the range that
468 # crosses the boundary is one that doesn't match the property, we don't
469 # start a new range above 255, as that could be construed as going to
470 # infinity. For example, the Upper property doesn't include the character
471 # at 255, but does include the one at 256. We don't include the 256 one.
472 my $prop_name = $prop;
473 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
474 my $extra_enums = "";
475 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
476 my $lookup_prop = $prop_name;
477 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
478 or $lookup_prop =~ s/^L1//);
479 my $nonl1_only = 0;
480 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 481 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
482
483 my @invlist;
99f21fb9
KW
484 my @invmap;
485 my $map_format;
486 my $map_default;
487 my $maps_to_code_point;
488 my $to_adjust;
0f5e3c71
KW
489 if ($is_local_sub) {
490 @invlist = eval $lookup_prop;
491 }
492 else {
493 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 494 if (! @invlist) {
99f21fb9 495
ad85f59a
KW
496 # If couldn't find a non-empty inversion list, see if it is
497 # instead an inversion map
498 my ($list_ref, $map_ref, $format, $default)
99f21fb9 499 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
500 if (! $list_ref) {
501 # An empty return here could mean an unknown property, or
502 # merely that the original inversion list is empty. Call
503 # in scalar context to differentiate
504 my $count = prop_invlist($lookup_prop,
505 '_perl_core_internal_ok');
506 die "Could not find inversion list for '$lookup_prop'"
507 unless defined $count;
508 }
509 else {
18b852b3
KW
510 @invlist = @$list_ref;
511 @invmap = @$map_ref;
512 $map_format = $format;
513 $map_default = $default;
514 $maps_to_code_point = $map_format =~ /x/;
515 $to_adjust = $map_format =~ /a/;
ad85f59a 516 }
99f21fb9 517 }
0f5e3c71 518 }
ad85f59a
KW
519
520
521 # Short-circuit an empty inversion list.
522 if (! @invlist) {
523 output_invlist($prop_name, \@invlist, $charset);
524 next;
525 }
ceb1de32 526
99f21fb9
KW
527 # Re-order the Unicode code points to native ones for this platform.
528 # This is only needed for code points below 256, because native code
529 # points are only in that range. For inversion maps of properties
530 # where the mappings are adjusted (format =~ /a/), this reordering
531 # could mess up the adjustment pattern that was in the input, so that
532 # has to be dealt with.
533 #
534 # And inversion maps that map to code points need to eventually have
535 # all those code points remapped to native, and it's better to do that
536 # here, going through the whole list not just those below 256. This
537 # is because some inversion maps have adjustments (format =~ /a/)
538 # which may be affected by the reordering. This code needs to be done
539 # both for when we are translating the inversion lists for < 256, and
540 # for the inversion maps for everything. By doing both in this loop,
541 # we can share that code.
542 #
543 # So, we go through everything for an inversion map to code points;
544 # otherwise, we can skip any remapping at all if we are going to
545 # output only the above-Latin1 values, or if the range spans the whole
546 # of 0..256, as the remap will also include all of 0..256 (256 not
547 # 255 because a re-ordering could cause 256 to need to be in the same
548 # range as 255.)
549 if ((@invmap && $maps_to_code_point)
550 || (! $nonl1_only || ($invlist[0] < 256
551 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 552 {
fb4554ea 553
99f21fb9 554 if (! @invmap) { # Straight inversion list
fb4554ea
KW
555 # Look at all the ranges that start before 257.
556 my @latin1_list;
557 while (@invlist) {
558 last if $invlist[0] > 256;
559 my $upper = @invlist > 1
560 ? $invlist[1] - 1 # In range
8a6c81cf
KW
561
562 # To infinity. You may want to stop much much
563 # earlier; going this high may expose perl
564 # deficiencies with very large numbers.
565 : $Unicode::UCD::MAX_CP;
fb4554ea 566 for my $j ($invlist[0] .. $upper) {
99f21fb9 567 push @latin1_list, a2n($j);
0f5e3c71 568 }
fb4554ea
KW
569
570 shift @invlist; # Shift off the range that's in the list
571 shift @invlist; # Shift off the range not in the list
0c4ecf42 572 }
fb4554ea
KW
573
574 # Here @invlist contains all the ranges in the original that start
575 # at code points above 256, and @latin1_list contains all the
576 # native code points for ranges that start with a Unicode code
577 # point below 257. We sort the latter and convert it to inversion
578 # list format. Then simply prepend it to the list of the higher
579 # code points.
580 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 581 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 582 unshift @invlist, @latin1_list;
99f21fb9
KW
583 }
584 else { # Is an inversion map
585
586 # This is a similar procedure as plain inversion list, but has
587 # multiple buckets. A plain inversion list just has two
588 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
589 # pretty much can ignore the 2nd bucket, as it is completely
590 # defined by the 1st. But here, what we do is create buckets
591 # which contain the code points that map to each, translated
592 # to native and turned into an inversion list. Thus each
593 # bucket is an inversion list of native code points that map
594 # to it or don't map to it. We use these to create an
595 # inversion map for the whole property.
596
597 # As mentioned earlier, we use this procedure to not just
598 # remap the inversion list to native values, but also the maps
599 # of code points to native ones. In the latter case we have
600 # to look at the whole of the inversion map (or at least to
601 # above Unicode; as the maps of code points above that should
602 # all be to the default).
603 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
604
605 my %mapped_lists; # A hash whose keys are the buckets.
606 while (@invlist) {
607 last if $invlist[0] > $upper_limit;
608
609 # This shouldn't actually happen, as prop_invmap() returns
610 # an extra element at the end that is beyond $upper_limit
611 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
612
613 my $bucket;
614
615 # A hash key can't be a ref (we are only expecting arrays
616 # of scalars here), so convert any such to a string that
617 # will be converted back later (using a vertical tab as
618 # the separator). Even if the mapping is to code points,
619 # we don't translate to native here because the code
620 # output_map() calls to output these arrays assumes the
621 # input is Unicode, not native.
622 if (ref $invmap[0]) {
623 $bucket = join "\cK", @{$invmap[0]};
624 }
625 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
626
627 # Do convert to native for maps to single code points.
628 # There are some properties that have a few outlier
629 # maps that aren't code points, so the above test
630 # skips those.
631 $bucket = a2n($invmap[0]);
632 } else {
633 $bucket = $invmap[0];
634 }
635
636 # We now have the bucket that all code points in the range
637 # map to, though possibly they need to be adjusted. Go
638 # through the range and put each translated code point in
639 # it into its bucket.
640 my $base_map = $invmap[0];
641 for my $j ($invlist[0] .. $invlist[1] - 1) {
642 if ($to_adjust
643 # The 1st code point doesn't need adjusting
644 && $j > $invlist[0]
645
646 # Skip any non-numeric maps: these are outliers
647 # that aren't code points.
648 && $base_map =~ $numeric_re
649
650 # 'ne' because the default can be a string
651 && $base_map ne $map_default)
652 {
653 # We adjust, by incrementing each the bucket and
654 # the map. For code point maps, translate to
655 # native
656 $base_map++;
657 $bucket = ($maps_to_code_point)
658 ? a2n($base_map)
659 : $base_map;
660 }
661
662 # Add the native code point to the bucket for the
663 # current map
664 push @{$mapped_lists{$bucket}}, a2n($j);
665 } # End of loop through all code points in the range
666
667 # Get ready for the next range
668 shift @invlist;
669 shift @invmap;
670 } # End of loop through all ranges in the map.
671
672 # Here, @invlist and @invmap retain all the ranges from the
673 # originals that start with code points above $upper_limit.
674 # Each bucket in %mapped_lists contains all the code points
675 # that map to that bucket. If the bucket is for a map to a
676 # single code point is a single code point, the bucket has
677 # been converted to native. If something else (including
678 # multiple code points), no conversion is done.
679 #
680 # Now we recreate the inversion map into %xlated, but this
681 # time for the native character set.
682 my %xlated;
683 foreach my $bucket (keys %mapped_lists) {
684
685 # Sort and convert this bucket to an inversion list. The
686 # result will be that ranges that start with even-numbered
687 # indexes will be for code points that map to this bucket;
688 # odd ones map to some other bucket, and are discarded
689 # below.
690 @{$mapped_lists{$bucket}}
691 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
692 @{$mapped_lists{$bucket}}
693 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
694
695 # Add each even-numbered range in the bucket to %xlated;
696 # so that the keys of %xlated become the range start code
697 # points, and the values are their corresponding maps.
698 while (@{$mapped_lists{$bucket}}) {
699 my $range_start = $mapped_lists{$bucket}->[0];
700 if ($bucket =~ /\cK/) {
701 @{$xlated{$range_start}} = split /\cK/, $bucket;
702 }
703 else {
704 $xlated{$range_start} = $bucket;
705 }
706 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
707 shift @{$mapped_lists{$bucket}}; # Get ready for next
708 # iteration
709 }
710 } # End of loop through all the buckets.
711
712 # Here %xlated's keys are the range starts of all the code
713 # points in the inversion map. Construct an inversion list
714 # from them.
715 my @new_invlist = sort { $a <=> $b } keys %xlated;
716
717 # If the list is adjusted, we want to munge this list so that
718 # we only have one entry for where consecutive code points map
719 # to consecutive values. We just skip the subsequent entries
720 # where this is the case.
721 if ($to_adjust) {
722 my @temp;
723 for my $i (0 .. @new_invlist - 1) {
724 next if $i > 0
725 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
726 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
727 && $xlated{$new_invlist[$i]} =~ $numeric_re
728 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
729 push @temp, $new_invlist[$i];
730 }
731 @new_invlist = @temp;
732 }
733
734 # The inversion map comes from %xlated's values. We can
735 # unshift each onto the front of the untouched portion, in
736 # reverse order of the portion we did process.
737 foreach my $start (reverse @new_invlist) {
738 unshift @invmap, $xlated{$start};
739 }
740
741 # Finally prepend the inversion list we have just constructed to the
742 # one that contains anything we didn't process.
743 unshift @invlist, @new_invlist;
744 }
745 }
746
747 # prop_invmap() returns an extra final entry, which we can now
748 # discard.
749 if (@invmap) {
750 pop @invlist;
751 pop @invmap;
ceb1de32 752 }
0f5e3c71
KW
753
754 if ($l1_only) {
99f21fb9 755 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
756 for my $i (0 .. @invlist - 1 - 1) {
757 if ($invlist[$i] > 255) {
758
759 # In an inversion list, even-numbered elements give the code
760 # points that begin ranges that match the property;
761 # odd-numbered give ones that begin ranges that don't match.
762 # If $i is odd, we are at the first code point above 255 that
763 # doesn't match, which means the range it is ending does
764 # match, and crosses the 255/256 boundary. We want to include
765 # this ending point, so increment $i, so the splice below
766 # includes it. Conversely, if $i is even, it is the first
767 # code point above 255 that matches, which means there was no
768 # matching range that crossed the boundary, and we don't want
769 # to include this code point, so splice before it.
770 $i++ if $i % 2 != 0;
771
772 # Remove everything past this.
773 splice @invlist, $i;
99f21fb9 774 splice @invmap, $i if @invmap;
0f5e3c71
KW
775 last;
776 }
0c4ecf42
KW
777 }
778 }
0f5e3c71
KW
779 elsif ($nonl1_only) {
780 my $found_nonl1 = 0;
781 for my $i (0 .. @invlist - 1 - 1) {
782 next if $invlist[$i] < 256;
783
784 # Here, we have the first element in the array that indicates an
785 # element above Latin1. Get rid of all previous ones.
786 splice @invlist, 0, $i;
99f21fb9 787 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
788
789 # If this one's index is not divisible by 2, it means that this
790 # element is inverting away from being in the list, which means
99f21fb9
KW
791 # all code points from 256 to this one are in this list (or
792 # map to the default for inversion maps)
793 if ($i % 2 != 0) {
794 unshift @invlist, 256;
795 unshift @invmap, $map_default if @invmap;
796 }
0f5e3c71 797 $found_nonl1 = 1;
3f427fd9
KW
798 last;
799 }
0f5e3c71 800 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 801 }
3f427fd9 802
0f5e3c71 803 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 804 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 805 }
bffc0129 806 end_file_pound_if;
0c4ecf42 807 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
808}
809
2308ab83 810my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
811my @sources = ($0, qw(lib/unicore/mktables
812 lib/Unicode/UCD.pm
813 regen/charset_translations.pl
814 ));
9a3da3ad
FC
815{
816 # Depend on mktables’ own sources. It’s a shorter list of files than
817 # those that Unicode::UCD uses.
2308ab83
KW
818 if (! open my $mktables_list, $sources_list) {
819
820 # This should force a rebuild once $sources_list exists
821 push @sources, $sources_list;
822 }
823 else {
824 while(<$mktables_list>) {
825 last if /===/;
826 chomp;
827 push @sources, "lib/unicore/$_" if /^[^#]/;
828 }
9a3da3ad
FC
829 }
830}
831read_only_bottom_close_and_rename($out_fh, \@sources)