This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make tables for Perl-tailored Unicode Line_Break property
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
f79a09fc
KW
59# This hash contains the properties with enums that have hard-coded references
60# to them in C code. Its only use is to make sure that if perl is compiled
61# with an older Unicode data set, that all the enum values the code is
62# expecting will still be in the enum typedef. Thus the code doesn't have to
63# change. The Unicode version won't have any code points that have these enum
64# values, so the code that handles them will not get exercised. This is far
65# better than having to #ifdef things.
66my %hard_coded_enums =
67 ( gcb => [
68 'Control',
69 'CR',
70 'Extend',
71 'L',
72 'LF',
73 'LV',
74 'LVT',
75 'Other',
76 'Prepend',
77 'Regional_Indicator',
78 'SpacingMark',
79 'T',
80 'V',
81 ],
ca8226cf
KW
82 lb => [
83 'Alphabetic',
84 'Break_After',
85 'Break_Before',
86 'Break_Both',
87 'Break_Symbols',
88 'Carriage_Return',
89 'Close_Parenthesis',
90 'Close_Punctuation',
91 'Combining_Mark',
92 'Contingent_Break',
93 'Exclamation',
94 'Glue',
95 'H2',
96 'H3',
97 'Hebrew_Letter',
98 'Hyphen',
99 'Ideographic',
100 'Infix_Numeric',
101 'Inseparable',
102 'JL',
103 'JT',
104 'JV',
105 'Line_Feed',
106 'Mandatory_Break',
107 'Next_Line',
108 'Nonstarter',
109 'Numeric',
110 'Open_Punctuation',
111 'Postfix_Numeric',
112 'Prefix_Numeric',
113 'Quotation',
114 'Regional_Indicator',
115 'Space',
116 'Word_Joiner',
117 'ZWSpace',
118 ],
f79a09fc
KW
119 sb => [
120 'ATerm',
121 'Close',
122 'CR',
123 'Extend',
124 'Format',
125 'LF',
126 'Lower',
127 'Numeric',
128 'OLetter',
129 'Other',
130 'SContinue',
131 'Sep',
132 'Sp',
133 'STerm',
134 'Upper',
135 ],
136 wb => [
137 'ALetter',
138 'CR',
139 'Double_Quote',
140 'Extend',
141 'ExtendNumLet',
142 'Format',
143 'Hebrew_Letter',
144 'Katakana',
145 'LF',
146 'MidLetter',
147 'MidNum',
148 'MidNumLet',
149 'Newline',
150 'Numeric',
151 'Other',
f1f6961f 152 'Perl_Tailored_HSpace',
f79a09fc
KW
153 'Regional_Indicator',
154 'Single_Quote',
f79a09fc
KW
155 ],
156);
157
99f21fb9
KW
158my @a2n;
159
160sub uniques {
161 # Returns non-duplicated input values. From "Perl Best Practices:
162 # Encapsulated Cleverness". p. 455 in first edition.
163
164 my %seen;
165 return grep { ! $seen{$_}++ } @_;
166}
167
168sub a2n($) {
169 my $cp = shift;
170
171 # Returns the input Unicode code point translated to native.
172
173 return $cp if $cp !~ $numeric_re || $cp > 255;
174 return $a2n[$cp];
175}
176
bffc0129
KW
177sub end_file_pound_if {
178 if ($in_file_pound_if) {
179 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
180 $in_file_pound_if = 0;
181 }
182}
183
184sub switch_pound_if ($$) {
185 my $name = shift;
186 my $new_pound_if = shift;
187
188 # Switch to new #if given by the 2nd argument. If there is an override
189 # for this, it instead switches to that. The 1st argument is the
190 # static's name, used to look up the overrides
191
192 if (exists $exceptions_to_where_to_define{$name}) {
193 $new_pound_if = $exceptions_to_where_to_define{$name};
194 }
195
196 # Exit current #if if the new one is different from the old
197 if ($in_file_pound_if
198 && $in_file_pound_if !~ /$new_pound_if/)
199 {
200 end_file_pound_if;
201 }
202
203 # Enter new #if, if not already in it.
204 if (! $in_file_pound_if) {
205 $in_file_pound_if = "defined($new_pound_if)";
206 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
207 }
208}
209
0c4ecf42 210sub output_invlist ($$;$) {
9d9177be
KW
211 my $name = shift;
212 my $invlist = shift; # Reference to inversion list array
0c4ecf42 213 my $charset = shift // ""; # name of character set for comment
9d9177be 214
76d3994c 215 die "No inversion list for $name" unless defined $invlist
ad85f59a 216 && ref $invlist eq 'ARRAY';
76d3994c 217
9d9177be
KW
218 # Output the inversion list $invlist using the name $name for it.
219 # It is output in the exact internal form for inversion lists.
220
a0316a6c
KW
221 # Is the last element of the header 0, or 1 ?
222 my $zero_or_one = 0;
ad85f59a 223 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 224 unshift @$invlist, 0;
9d9177be
KW
225 $zero_or_one = 1;
226 }
0a07b44b 227 my $count = @$invlist;
9d9177be 228
bffc0129 229 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 230
0c4ecf42
KW
231 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
232 print $out_fh " /* for $charset */" if $charset;
233 print $out_fh "\n";
9d9177be 234
a0316a6c 235 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
236 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
237 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
238 ",\t/* 0 if the list starts at 0;",
239 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
240
241 # The main body are the UVs passed in to this routine. Do the final
242 # element separately
47d53124
KW
243 for my $i (0 .. @$invlist - 1) {
244 printf $out_fh "\t0x%X", $invlist->[$i];
245 print $out_fh "," if $i < @$invlist - 1;
246 print $out_fh "\n";
9d9177be
KW
247 }
248
9d9177be
KW
249 print $out_fh "};\n";
250}
251
99f21fb9
KW
252sub output_invmap ($$$$$$$) {
253 my $name = shift;
254 my $invmap = shift; # Reference to inversion map array
255 my $prop_name = shift;
256 my $input_format = shift; # The inversion map's format
257 my $default = shift; # The property value for code points who
258 # otherwise don't have a value specified.
259 my $extra_enums = shift; # comma-separated list of our additions to the
260 # property's standard possible values
261 my $charset = shift // ""; # name of character set for comment
262
263 # Output the inversion map $invmap for property $prop_name, but use $name
264 # as the actual data structure's name.
265
266 my $count = @$invmap;
267
268 my $output_format;
269 my $declaration_type;
270 my %enums;
271 my $name_prefix;
272
273 if ($input_format eq 's') {
b83e6484 274 my $orig_prop_name = $prop_name;
02f811dd
KW
275 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
276 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
b83e6484
KW
277 my @enums;
278 if ($orig_prop_name eq $prop_name) {
279 @enums = prop_values($prop_name);
280 }
281 else {
282 @enums = uniques(@$invmap);
283 }
99f21fb9
KW
284 if (! @enums) {
285 die "Only enum properties are currently handled; '$prop_name' isn't one";
286 }
287 else {
288
f79a09fc 289 # Convert short names to long
99f21fb9 290 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
f79a09fc
KW
291
292 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
293 die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version'
294 if @expected_enums < @enums;
295
296 # Remove the enums found in the input from the ones we expect
297 for (my $i = @expected_enums - 1; $i >= 0; $i--) {
298 splice(@expected_enums, $i, 1)
299 if grep { $expected_enums[$i] eq $_ } @enums;
300 }
301
302 # The ones remaining must be because we're using an older
303 # Unicode version. Add them to the list.
304 push @enums, @expected_enums;
305
306 # Add in the extra values coded into this program, and sort.
99f21fb9
KW
307 @enums = sort @enums;
308
6dc80864
KW
309 # The internal enums comes last.
310 push @enums, split /,/, $extra_enums if $extra_enums ne "";
311
99f21fb9
KW
312 # Assign a value to each element of the enum. The default
313 # value always gets 0; the others are arbitrarily assigned.
314 my $enum_val = 0;
02f811dd
KW
315 my $canonical_default = prop_value_aliases($prop_name, $default);
316 $default = $canonical_default if defined $canonical_default;
99f21fb9
KW
317 $enums{$default} = $enum_val++;
318 for my $enum (@enums) {
319 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
320 }
321 }
322
bffc0129
KW
323 # Inversion map stuff is currently used only by regexec
324 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
325 {
326
99f21fb9
KW
327 # The short names tend to be two lower case letters, but it looks
328 # better for those if they are upper. XXX
329 $short_name = uc($short_name) if length($short_name) < 3
330 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
85e5f08b 331 $name_prefix = "${short_name}_";
99f21fb9
KW
332 my $enum_count = keys %enums;
333 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
334
335 print $out_fh "\ntypedef enum {\n";
6dc80864
KW
336 my @enum_list;
337 foreach my $enum (keys %enums) {
338 $enum_list[$enums{$enum}] = $enum;
339 }
340 foreach my $i (0 .. @enum_list - 1) {
341 my $name = $enum_list[$i];
342 print $out_fh "\t${name_prefix}$name = $i";
343 print $out_fh "," if $i < $enum_count - 1;
344 print $out_fh "\n";
99f21fb9
KW
345 }
346 $declaration_type = "${name_prefix}enum";
347 print $out_fh "} $declaration_type;\n";
348
349 $output_format = "${name_prefix}%s";
350 }
351 }
352 else {
353 die "'$input_format' invmap() format for '$prop_name' unimplemented";
354 }
355
356 die "No inversion map for $prop_name" unless defined $invmap
357 && ref $invmap eq 'ARRAY'
358 && $count;
359
360 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
361 print $out_fh " /* for $charset */" if $charset;
362 print $out_fh "\n";
363
364 # The main body are the scalars passed in to this routine.
365 for my $i (0 .. $count - 1) {
366 my $element = $invmap->[$i];
02f811dd
KW
367 my $full_element_name = prop_value_aliases($prop_name, $element);
368 $element = $full_element_name if defined $full_element_name;
369 $element = $name_prefix . $element;
99f21fb9
KW
370 print $out_fh "\t$element";
371 print $out_fh "," if $i < $count - 1;
372 print $out_fh "\n";
373 }
374 print $out_fh "};\n";
99f21fb9
KW
375}
376
5a7e5385 377sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
378
379 # Returns an inversion list constructed from the sorted input array of
380 # code points
381
382 my $list_ref = shift;
383
99f21fb9
KW
384 return unless @$list_ref;
385
a02047bf
KW
386 # Initialize to just the first element
387 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
388
389 # For each succeeding element, if it extends the previous range, adjust
390 # up, otherwise add it.
391 for my $i (1 .. @$list_ref - 1) {
392 if ($invlist[-1] == $list_ref->[$i]) {
393 $invlist[-1]++;
394 }
395 else {
396 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
397 }
398 }
399 return @invlist;
400}
401
402# Read in the Case Folding rules, and construct arrays of code points for the
403# properties we need.
404my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
405die "Could not find inversion map for Case_Folding" unless defined $format;
406die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
407 unless $format eq 'al'
408 || $format eq 'a';
a02047bf
KW
409my @has_multi_char_fold;
410my @is_non_final_fold;
411
412for my $i (0 .. @$folds_ref - 1) {
413 next unless ref $folds_ref->[$i]; # Skip single-char folds
414 push @has_multi_char_fold, $cp_ref->[$i];
415
b6a6e956 416 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
417 # position
418 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
419 push @is_non_final_fold, $folds_ref->[$i][$j]
420 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
421 }
422}
423
a02047bf
KW
424sub _Perl_Non_Final_Folds {
425 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 426 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
427}
428
99f21fb9
KW
429sub prop_name_for_cmp ($) { # Sort helper
430 my $name = shift;
431
432 # Returns the input lowercased, with non-alphas removed, as well as
433 # everything starting with a comma
434
435 $name =~ s/,.*//;
436 $name =~ s/[[:^alpha:]]//g;
437 return lc $name;
438}
439
892d8259 440sub UpperLatin1 {
5a7e5385 441 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
442}
443
9d9177be
KW
444output_invlist("Latin1", [ 0, 256 ]);
445output_invlist("AboveLatin1", [ 256 ]);
446
bffc0129 447end_file_pound_if;
43b443dd 448
3f427fd9
KW
449# We construct lists for all the POSIX and backslash sequence character
450# classes in two forms:
451# 1) ones which match only in the ASCII range
452# 2) ones which match either in the Latin1 range, or the entire Unicode range
453#
454# These get compiled in, and hence affect the memory footprint of every Perl
455# program, even those not using Unicode. To minimize the size, currently
456# the Latin1 version is generated for the beyond ASCII range except for those
457# lists that are quite small for the entire range, such as for \s, which is 22
458# UVs long plus 4 UVs (currently) for the header.
459#
460# To save even more memory, the ASCII versions could be derived from the
461# larger ones at runtime, saving some memory (minus the expense of the machine
462# instructions to do so), but these are all small anyway, so their total is
463# about 100 UVs.
464#
465# In the list of properties below that get generated, the L1 prefix is a fake
466# property that means just the Latin1 range of the full property (whose name
467# has an X prefix instead of L1).
a02047bf
KW
468#
469# An initial & means to use the subroutine from this file instead of an
470# official inversion list.
3f427fd9 471
0c4ecf42
KW
472for my $charset (get_supported_code_pages()) {
473 print $out_fh "\n" . get_conditional_compile_line_start($charset);
474
99f21fb9
KW
475 @a2n = @{get_a2n($charset)};
476 no warnings 'qw';
477 # Ignore non-alpha in sort
478 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
479 ASCII
480 Cased
481 VertSpace
482 XPerlSpace
483 XPosixAlnum
484 XPosixAlpha
485 XPosixBlank
486 XPosixCntrl
487 XPosixDigit
488 XPosixGraph
489 XPosixLower
490 XPosixPrint
491 XPosixPunct
492 XPosixSpace
493 XPosixUpper
494 XPosixWord
495 XPosixXDigit
496 _Perl_Any_Folds
497 &NonL1_Perl_Non_Final_Folds
498 _Perl_Folds_To_Multi_Char
499 &UpperLatin1
500 _Perl_IDStart
501 _Perl_IDCont
02f811dd 502 _Perl_GCB,EDGE
ca8226cf 503 _Perl_LB,EDGE
bf4268fa 504 _Perl_SB,EDGE
190d69bb 505 _Perl_WB,EDGE,UNKNOWN
1c8c3428 506 )
0f5e3c71
KW
507 ) {
508
509 # For the Latin1 properties, we change to use the eXtended version of the
510 # base property, then go through the result and get rid of everything not
511 # in Latin1 (above 255). Actually, we retain the element for the range
512 # that crosses the 255/256 boundary if it is one that matches the
513 # property. For example, in the Word property, there is a range of code
514 # points that start at U+00F8 and goes through U+02C1. Instead of
515 # artificially cutting that off at 256 because 256 is the first code point
516 # above Latin1, we let the range go to its natural ending. That gives us
517 # extra information with no added space taken. But if the range that
518 # crosses the boundary is one that doesn't match the property, we don't
519 # start a new range above 255, as that could be construed as going to
520 # infinity. For example, the Upper property doesn't include the character
521 # at 255, but does include the one at 256. We don't include the 256 one.
522 my $prop_name = $prop;
523 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
524 my $extra_enums = "";
525 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
526 my $lookup_prop = $prop_name;
527 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
528 or $lookup_prop =~ s/^L1//);
529 my $nonl1_only = 0;
530 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 531 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
532
533 my @invlist;
99f21fb9
KW
534 my @invmap;
535 my $map_format;
536 my $map_default;
537 my $maps_to_code_point;
538 my $to_adjust;
0f5e3c71
KW
539 if ($is_local_sub) {
540 @invlist = eval $lookup_prop;
541 }
542 else {
543 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 544 if (! @invlist) {
99f21fb9 545
ad85f59a
KW
546 # If couldn't find a non-empty inversion list, see if it is
547 # instead an inversion map
548 my ($list_ref, $map_ref, $format, $default)
99f21fb9 549 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
550 if (! $list_ref) {
551 # An empty return here could mean an unknown property, or
552 # merely that the original inversion list is empty. Call
553 # in scalar context to differentiate
554 my $count = prop_invlist($lookup_prop,
555 '_perl_core_internal_ok');
556 die "Could not find inversion list for '$lookup_prop'"
557 unless defined $count;
558 }
559 else {
18b852b3
KW
560 @invlist = @$list_ref;
561 @invmap = @$map_ref;
562 $map_format = $format;
563 $map_default = $default;
564 $maps_to_code_point = $map_format =~ /x/;
565 $to_adjust = $map_format =~ /a/;
ad85f59a 566 }
99f21fb9 567 }
0f5e3c71 568 }
ad85f59a
KW
569
570
571 # Short-circuit an empty inversion list.
572 if (! @invlist) {
573 output_invlist($prop_name, \@invlist, $charset);
574 next;
575 }
ceb1de32 576
99f21fb9
KW
577 # Re-order the Unicode code points to native ones for this platform.
578 # This is only needed for code points below 256, because native code
579 # points are only in that range. For inversion maps of properties
580 # where the mappings are adjusted (format =~ /a/), this reordering
581 # could mess up the adjustment pattern that was in the input, so that
582 # has to be dealt with.
583 #
584 # And inversion maps that map to code points need to eventually have
585 # all those code points remapped to native, and it's better to do that
586 # here, going through the whole list not just those below 256. This
587 # is because some inversion maps have adjustments (format =~ /a/)
588 # which may be affected by the reordering. This code needs to be done
589 # both for when we are translating the inversion lists for < 256, and
590 # for the inversion maps for everything. By doing both in this loop,
591 # we can share that code.
592 #
593 # So, we go through everything for an inversion map to code points;
594 # otherwise, we can skip any remapping at all if we are going to
595 # output only the above-Latin1 values, or if the range spans the whole
596 # of 0..256, as the remap will also include all of 0..256 (256 not
597 # 255 because a re-ordering could cause 256 to need to be in the same
598 # range as 255.)
599 if ((@invmap && $maps_to_code_point)
600 || (! $nonl1_only || ($invlist[0] < 256
601 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 602 {
fb4554ea 603
99f21fb9 604 if (! @invmap) { # Straight inversion list
fb4554ea
KW
605 # Look at all the ranges that start before 257.
606 my @latin1_list;
607 while (@invlist) {
608 last if $invlist[0] > 256;
609 my $upper = @invlist > 1
610 ? $invlist[1] - 1 # In range
8a6c81cf
KW
611
612 # To infinity. You may want to stop much much
613 # earlier; going this high may expose perl
614 # deficiencies with very large numbers.
615 : $Unicode::UCD::MAX_CP;
fb4554ea 616 for my $j ($invlist[0] .. $upper) {
99f21fb9 617 push @latin1_list, a2n($j);
0f5e3c71 618 }
fb4554ea
KW
619
620 shift @invlist; # Shift off the range that's in the list
621 shift @invlist; # Shift off the range not in the list
0c4ecf42 622 }
fb4554ea
KW
623
624 # Here @invlist contains all the ranges in the original that start
625 # at code points above 256, and @latin1_list contains all the
626 # native code points for ranges that start with a Unicode code
627 # point below 257. We sort the latter and convert it to inversion
628 # list format. Then simply prepend it to the list of the higher
629 # code points.
630 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 631 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 632 unshift @invlist, @latin1_list;
99f21fb9
KW
633 }
634 else { # Is an inversion map
635
636 # This is a similar procedure as plain inversion list, but has
637 # multiple buckets. A plain inversion list just has two
638 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
639 # pretty much can ignore the 2nd bucket, as it is completely
640 # defined by the 1st. But here, what we do is create buckets
641 # which contain the code points that map to each, translated
642 # to native and turned into an inversion list. Thus each
643 # bucket is an inversion list of native code points that map
644 # to it or don't map to it. We use these to create an
645 # inversion map for the whole property.
646
647 # As mentioned earlier, we use this procedure to not just
648 # remap the inversion list to native values, but also the maps
649 # of code points to native ones. In the latter case we have
650 # to look at the whole of the inversion map (or at least to
651 # above Unicode; as the maps of code points above that should
652 # all be to the default).
653 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
654
655 my %mapped_lists; # A hash whose keys are the buckets.
656 while (@invlist) {
657 last if $invlist[0] > $upper_limit;
658
659 # This shouldn't actually happen, as prop_invmap() returns
660 # an extra element at the end that is beyond $upper_limit
661 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
662
663 my $bucket;
664
665 # A hash key can't be a ref (we are only expecting arrays
666 # of scalars here), so convert any such to a string that
667 # will be converted back later (using a vertical tab as
668 # the separator). Even if the mapping is to code points,
669 # we don't translate to native here because the code
670 # output_map() calls to output these arrays assumes the
671 # input is Unicode, not native.
672 if (ref $invmap[0]) {
673 $bucket = join "\cK", @{$invmap[0]};
674 }
675 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
676
677 # Do convert to native for maps to single code points.
678 # There are some properties that have a few outlier
679 # maps that aren't code points, so the above test
680 # skips those.
681 $bucket = a2n($invmap[0]);
682 } else {
683 $bucket = $invmap[0];
684 }
685
686 # We now have the bucket that all code points in the range
687 # map to, though possibly they need to be adjusted. Go
688 # through the range and put each translated code point in
689 # it into its bucket.
690 my $base_map = $invmap[0];
691 for my $j ($invlist[0] .. $invlist[1] - 1) {
692 if ($to_adjust
693 # The 1st code point doesn't need adjusting
694 && $j > $invlist[0]
695
696 # Skip any non-numeric maps: these are outliers
697 # that aren't code points.
698 && $base_map =~ $numeric_re
699
700 # 'ne' because the default can be a string
701 && $base_map ne $map_default)
702 {
703 # We adjust, by incrementing each the bucket and
704 # the map. For code point maps, translate to
705 # native
706 $base_map++;
707 $bucket = ($maps_to_code_point)
708 ? a2n($base_map)
709 : $base_map;
710 }
711
712 # Add the native code point to the bucket for the
713 # current map
714 push @{$mapped_lists{$bucket}}, a2n($j);
715 } # End of loop through all code points in the range
716
717 # Get ready for the next range
718 shift @invlist;
719 shift @invmap;
720 } # End of loop through all ranges in the map.
721
722 # Here, @invlist and @invmap retain all the ranges from the
723 # originals that start with code points above $upper_limit.
724 # Each bucket in %mapped_lists contains all the code points
725 # that map to that bucket. If the bucket is for a map to a
726 # single code point is a single code point, the bucket has
727 # been converted to native. If something else (including
728 # multiple code points), no conversion is done.
729 #
730 # Now we recreate the inversion map into %xlated, but this
731 # time for the native character set.
732 my %xlated;
733 foreach my $bucket (keys %mapped_lists) {
734
735 # Sort and convert this bucket to an inversion list. The
736 # result will be that ranges that start with even-numbered
737 # indexes will be for code points that map to this bucket;
738 # odd ones map to some other bucket, and are discarded
739 # below.
740 @{$mapped_lists{$bucket}}
741 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
742 @{$mapped_lists{$bucket}}
743 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
744
745 # Add each even-numbered range in the bucket to %xlated;
746 # so that the keys of %xlated become the range start code
747 # points, and the values are their corresponding maps.
748 while (@{$mapped_lists{$bucket}}) {
749 my $range_start = $mapped_lists{$bucket}->[0];
750 if ($bucket =~ /\cK/) {
751 @{$xlated{$range_start}} = split /\cK/, $bucket;
752 }
753 else {
754 $xlated{$range_start} = $bucket;
755 }
756 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
757 shift @{$mapped_lists{$bucket}}; # Get ready for next
758 # iteration
759 }
760 } # End of loop through all the buckets.
761
762 # Here %xlated's keys are the range starts of all the code
763 # points in the inversion map. Construct an inversion list
764 # from them.
765 my @new_invlist = sort { $a <=> $b } keys %xlated;
766
767 # If the list is adjusted, we want to munge this list so that
768 # we only have one entry for where consecutive code points map
769 # to consecutive values. We just skip the subsequent entries
770 # where this is the case.
771 if ($to_adjust) {
772 my @temp;
773 for my $i (0 .. @new_invlist - 1) {
774 next if $i > 0
775 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
776 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
777 && $xlated{$new_invlist[$i]} =~ $numeric_re
778 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
779 push @temp, $new_invlist[$i];
780 }
781 @new_invlist = @temp;
782 }
783
784 # The inversion map comes from %xlated's values. We can
785 # unshift each onto the front of the untouched portion, in
786 # reverse order of the portion we did process.
787 foreach my $start (reverse @new_invlist) {
788 unshift @invmap, $xlated{$start};
789 }
790
791 # Finally prepend the inversion list we have just constructed to the
792 # one that contains anything we didn't process.
793 unshift @invlist, @new_invlist;
794 }
795 }
796
797 # prop_invmap() returns an extra final entry, which we can now
798 # discard.
799 if (@invmap) {
800 pop @invlist;
801 pop @invmap;
ceb1de32 802 }
0f5e3c71
KW
803
804 if ($l1_only) {
99f21fb9 805 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
806 for my $i (0 .. @invlist - 1 - 1) {
807 if ($invlist[$i] > 255) {
808
809 # In an inversion list, even-numbered elements give the code
810 # points that begin ranges that match the property;
811 # odd-numbered give ones that begin ranges that don't match.
812 # If $i is odd, we are at the first code point above 255 that
813 # doesn't match, which means the range it is ending does
814 # match, and crosses the 255/256 boundary. We want to include
815 # this ending point, so increment $i, so the splice below
816 # includes it. Conversely, if $i is even, it is the first
817 # code point above 255 that matches, which means there was no
818 # matching range that crossed the boundary, and we don't want
819 # to include this code point, so splice before it.
820 $i++ if $i % 2 != 0;
821
822 # Remove everything past this.
823 splice @invlist, $i;
99f21fb9 824 splice @invmap, $i if @invmap;
0f5e3c71
KW
825 last;
826 }
0c4ecf42
KW
827 }
828 }
0f5e3c71
KW
829 elsif ($nonl1_only) {
830 my $found_nonl1 = 0;
831 for my $i (0 .. @invlist - 1 - 1) {
832 next if $invlist[$i] < 256;
833
834 # Here, we have the first element in the array that indicates an
835 # element above Latin1. Get rid of all previous ones.
836 splice @invlist, 0, $i;
99f21fb9 837 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
838
839 # If this one's index is not divisible by 2, it means that this
840 # element is inverting away from being in the list, which means
99f21fb9
KW
841 # all code points from 256 to this one are in this list (or
842 # map to the default for inversion maps)
843 if ($i % 2 != 0) {
844 unshift @invlist, 256;
845 unshift @invmap, $map_default if @invmap;
846 }
0f5e3c71 847 $found_nonl1 = 1;
3f427fd9
KW
848 last;
849 }
0f5e3c71 850 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 851 }
3f427fd9 852
0f5e3c71 853 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 854 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 855 }
bffc0129 856 end_file_pound_if;
0c4ecf42 857 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
858}
859
2308ab83 860my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
861my @sources = ($0, qw(lib/unicore/mktables
862 lib/Unicode/UCD.pm
863 regen/charset_translations.pl
864 ));
9a3da3ad
FC
865{
866 # Depend on mktables’ own sources. It’s a shorter list of files than
867 # those that Unicode::UCD uses.
2308ab83
KW
868 if (! open my $mktables_list, $sources_list) {
869
870 # This should force a rebuild once $sources_list exists
871 push @sources, $sources_list;
872 }
873 else {
874 while(<$mktables_list>) {
875 last if /===/;
876 chomp;
877 push @sources, "lib/unicore/$_" if /^[^#]/;
878 }
9a3da3ad
FC
879 }
880}
881read_only_bottom_close_and_rename($out_fh, \@sources)