This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Add tables for \b{sb}
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
bffc0129
KW
45# The symbols generated by this program are all currently defined only in a
46# single dot c each. The code knows where most of them go, but this hash
47# gives overrides for the exceptions to the typical place
48my %exceptions_to_where_to_define =
49 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
50 AboveLatin1 => 'PERL_IN_REGCOMP_C',
51 Latin1 => 'PERL_IN_REGCOMP_C',
52 UpperLatin1 => 'PERL_IN_REGCOMP_C',
53 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
54 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
55 _Perl_IDCont => 'PERL_IN_UTF8_C',
56 _Perl_IDStart => 'PERL_IN_UTF8_C',
57 );
015bb97c 58
99f21fb9
KW
59my @a2n;
60
61sub uniques {
62 # Returns non-duplicated input values. From "Perl Best Practices:
63 # Encapsulated Cleverness". p. 455 in first edition.
64
65 my %seen;
66 return grep { ! $seen{$_}++ } @_;
67}
68
69sub a2n($) {
70 my $cp = shift;
71
72 # Returns the input Unicode code point translated to native.
73
74 return $cp if $cp !~ $numeric_re || $cp > 255;
75 return $a2n[$cp];
76}
77
bffc0129
KW
78sub end_file_pound_if {
79 if ($in_file_pound_if) {
80 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
81 $in_file_pound_if = 0;
82 }
83}
84
85sub switch_pound_if ($$) {
86 my $name = shift;
87 my $new_pound_if = shift;
88
89 # Switch to new #if given by the 2nd argument. If there is an override
90 # for this, it instead switches to that. The 1st argument is the
91 # static's name, used to look up the overrides
92
93 if (exists $exceptions_to_where_to_define{$name}) {
94 $new_pound_if = $exceptions_to_where_to_define{$name};
95 }
96
97 # Exit current #if if the new one is different from the old
98 if ($in_file_pound_if
99 && $in_file_pound_if !~ /$new_pound_if/)
100 {
101 end_file_pound_if;
102 }
103
104 # Enter new #if, if not already in it.
105 if (! $in_file_pound_if) {
106 $in_file_pound_if = "defined($new_pound_if)";
107 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
108 }
109}
110
0c4ecf42 111sub output_invlist ($$;$) {
9d9177be
KW
112 my $name = shift;
113 my $invlist = shift; # Reference to inversion list array
0c4ecf42 114 my $charset = shift // ""; # name of character set for comment
9d9177be 115
76d3994c
KW
116 die "No inversion list for $name" unless defined $invlist
117 && ref $invlist eq 'ARRAY'
118 && @$invlist;
119
9d9177be
KW
120 # Output the inversion list $invlist using the name $name for it.
121 # It is output in the exact internal form for inversion lists.
122
a0316a6c
KW
123 # Is the last element of the header 0, or 1 ?
124 my $zero_or_one = 0;
a0316a6c
KW
125 if ($invlist->[0] != 0) {
126 unshift @$invlist, 0;
9d9177be
KW
127 $zero_or_one = 1;
128 }
0a07b44b 129 my $count = @$invlist;
9d9177be 130
bffc0129 131 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 132
0c4ecf42
KW
133 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
134 print $out_fh " /* for $charset */" if $charset;
135 print $out_fh "\n";
9d9177be 136
a0316a6c 137 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
138 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
139 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
140 ",\t/* 0 if the list starts at 0;",
141 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
142
143 # The main body are the UVs passed in to this routine. Do the final
144 # element separately
47d53124
KW
145 for my $i (0 .. @$invlist - 1) {
146 printf $out_fh "\t0x%X", $invlist->[$i];
147 print $out_fh "," if $i < @$invlist - 1;
148 print $out_fh "\n";
9d9177be
KW
149 }
150
9d9177be
KW
151 print $out_fh "};\n";
152}
153
99f21fb9
KW
154sub output_invmap ($$$$$$$) {
155 my $name = shift;
156 my $invmap = shift; # Reference to inversion map array
157 my $prop_name = shift;
158 my $input_format = shift; # The inversion map's format
159 my $default = shift; # The property value for code points who
160 # otherwise don't have a value specified.
161 my $extra_enums = shift; # comma-separated list of our additions to the
162 # property's standard possible values
163 my $charset = shift // ""; # name of character set for comment
164
165 # Output the inversion map $invmap for property $prop_name, but use $name
166 # as the actual data structure's name.
167
168 my $count = @$invmap;
169
170 my $output_format;
171 my $declaration_type;
172 my %enums;
173 my $name_prefix;
174
175 if ($input_format eq 's') {
176 $prop_name = (prop_aliases($prop_name))[1]; # Get full name
177 my @enums = prop_values($prop_name);
178 if (! @enums) {
179 die "Only enum properties are currently handled; '$prop_name' isn't one";
180 }
181 else {
182
183 # Convert short names to long, add in the extras, and sort.
184 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
185 push @enums, split /,/, $extra_enums if $extra_enums ne "";
186 @enums = sort @enums;
187
188 # Assign a value to each element of the enum. The default
189 # value always gets 0; the others are arbitrarily assigned.
190 my $enum_val = 0;
191 $default = prop_value_aliases($prop_name, $default);
192 $enums{$default} = $enum_val++;
193 for my $enum (@enums) {
194 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
195 }
196 }
197
bffc0129
KW
198 # Inversion map stuff is currently used only by regexec
199 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
99f21fb9
KW
200 {
201
202 my $short_name = (prop_aliases($prop_name))[0];
203
204 # The short names tend to be two lower case letters, but it looks
205 # better for those if they are upper. XXX
206 $short_name = uc($short_name) if length($short_name) < 3
207 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
208 $name_prefix = "PL_${short_name}_";
209 my $enum_count = keys %enums;
210 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
211
212 print $out_fh "\ntypedef enum {\n";
213 print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
214 delete $enums{$default};
215 foreach my $enum (sort { $a cmp $b } keys %enums) {
216 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
217 print $out_fh "," if $enums{$enum} < $enum_count - 1;
218 print $out_fh "\n";
219 }
220 $declaration_type = "${name_prefix}enum";
221 print $out_fh "} $declaration_type;\n";
222
223 $output_format = "${name_prefix}%s";
224 }
225 }
226 else {
227 die "'$input_format' invmap() format for '$prop_name' unimplemented";
228 }
229
230 die "No inversion map for $prop_name" unless defined $invmap
231 && ref $invmap eq 'ARRAY'
232 && $count;
233
234 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
235 print $out_fh " /* for $charset */" if $charset;
236 print $out_fh "\n";
237
238 # The main body are the scalars passed in to this routine.
239 for my $i (0 .. $count - 1) {
240 my $element = $invmap->[$i];
241 $element = $name_prefix . prop_value_aliases($prop_name, $element);
242 print $out_fh "\t$element";
243 print $out_fh "," if $i < $count - 1;
244 print $out_fh "\n";
245 }
246 print $out_fh "};\n";
99f21fb9
KW
247}
248
5a7e5385 249sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
250
251 # Returns an inversion list constructed from the sorted input array of
252 # code points
253
254 my $list_ref = shift;
255
99f21fb9
KW
256 return unless @$list_ref;
257
a02047bf
KW
258 # Initialize to just the first element
259 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
260
261 # For each succeeding element, if it extends the previous range, adjust
262 # up, otherwise add it.
263 for my $i (1 .. @$list_ref - 1) {
264 if ($invlist[-1] == $list_ref->[$i]) {
265 $invlist[-1]++;
266 }
267 else {
268 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
269 }
270 }
271 return @invlist;
272}
273
274# Read in the Case Folding rules, and construct arrays of code points for the
275# properties we need.
276my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
277die "Could not find inversion map for Case_Folding" unless defined $format;
278die "Incorrect format '$format' for Case_Folding inversion map"
279 unless $format eq 'al';
280my @has_multi_char_fold;
281my @is_non_final_fold;
282
283for my $i (0 .. @$folds_ref - 1) {
284 next unless ref $folds_ref->[$i]; # Skip single-char folds
285 push @has_multi_char_fold, $cp_ref->[$i];
286
b6a6e956 287 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
288 # position
289 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
290 push @is_non_final_fold, $folds_ref->[$i][$j]
291 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
292 }
293}
294
a02047bf
KW
295sub _Perl_Non_Final_Folds {
296 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 297 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
298}
299
99f21fb9
KW
300sub prop_name_for_cmp ($) { # Sort helper
301 my $name = shift;
302
303 # Returns the input lowercased, with non-alphas removed, as well as
304 # everything starting with a comma
305
306 $name =~ s/,.*//;
307 $name =~ s/[[:^alpha:]]//g;
308 return lc $name;
309}
310
892d8259 311sub UpperLatin1 {
5a7e5385 312 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
313}
314
9d9177be
KW
315output_invlist("Latin1", [ 0, 256 ]);
316output_invlist("AboveLatin1", [ 256 ]);
317
bffc0129 318end_file_pound_if;
43b443dd 319
3f427fd9
KW
320# We construct lists for all the POSIX and backslash sequence character
321# classes in two forms:
322# 1) ones which match only in the ASCII range
323# 2) ones which match either in the Latin1 range, or the entire Unicode range
324#
325# These get compiled in, and hence affect the memory footprint of every Perl
326# program, even those not using Unicode. To minimize the size, currently
327# the Latin1 version is generated for the beyond ASCII range except for those
328# lists that are quite small for the entire range, such as for \s, which is 22
329# UVs long plus 4 UVs (currently) for the header.
330#
331# To save even more memory, the ASCII versions could be derived from the
332# larger ones at runtime, saving some memory (minus the expense of the machine
333# instructions to do so), but these are all small anyway, so their total is
334# about 100 UVs.
335#
336# In the list of properties below that get generated, the L1 prefix is a fake
337# property that means just the Latin1 range of the full property (whose name
338# has an X prefix instead of L1).
a02047bf
KW
339#
340# An initial & means to use the subroutine from this file instead of an
341# official inversion list.
3f427fd9 342
0c4ecf42
KW
343for my $charset (get_supported_code_pages()) {
344 print $out_fh "\n" . get_conditional_compile_line_start($charset);
345
99f21fb9
KW
346 @a2n = @{get_a2n($charset)};
347 no warnings 'qw';
348 # Ignore non-alpha in sort
349 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
350 ASCII
351 Cased
352 VertSpace
353 XPerlSpace
354 XPosixAlnum
355 XPosixAlpha
356 XPosixBlank
357 XPosixCntrl
358 XPosixDigit
359 XPosixGraph
360 XPosixLower
361 XPosixPrint
362 XPosixPunct
363 XPosixSpace
364 XPosixUpper
365 XPosixWord
366 XPosixXDigit
367 _Perl_Any_Folds
368 &NonL1_Perl_Non_Final_Folds
369 _Perl_Folds_To_Multi_Char
370 &UpperLatin1
371 _Perl_IDStart
372 _Perl_IDCont
0e0b9356 373 Grapheme_Cluster_Break,EDGE
53255578 374 Word_Break,EDGE,UNKNOWN
d3e558c4 375 Sentence_Break,EDGE
1c8c3428 376 )
0f5e3c71
KW
377 ) {
378
379 # For the Latin1 properties, we change to use the eXtended version of the
380 # base property, then go through the result and get rid of everything not
381 # in Latin1 (above 255). Actually, we retain the element for the range
382 # that crosses the 255/256 boundary if it is one that matches the
383 # property. For example, in the Word property, there is a range of code
384 # points that start at U+00F8 and goes through U+02C1. Instead of
385 # artificially cutting that off at 256 because 256 is the first code point
386 # above Latin1, we let the range go to its natural ending. That gives us
387 # extra information with no added space taken. But if the range that
388 # crosses the boundary is one that doesn't match the property, we don't
389 # start a new range above 255, as that could be construed as going to
390 # infinity. For example, the Upper property doesn't include the character
391 # at 255, but does include the one at 256. We don't include the 256 one.
392 my $prop_name = $prop;
393 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
394 my $extra_enums = "";
395 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
396 my $lookup_prop = $prop_name;
397 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
398 or $lookup_prop =~ s/^L1//);
399 my $nonl1_only = 0;
400 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 401 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
402
403 my @invlist;
99f21fb9
KW
404 my @invmap;
405 my $map_format;
406 my $map_default;
407 my $maps_to_code_point;
408 my $to_adjust;
0f5e3c71
KW
409 if ($is_local_sub) {
410 @invlist = eval $lookup_prop;
411 }
412 else {
413 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9
KW
414 if (! @invlist) {
415 my ($list_ref, $map_ref, $format, $default);
416
417 ($list_ref, $map_ref, $format, $default)
418 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
419 die "Could not find inversion list for '$lookup_prop'" unless $list_ref;
420 @invlist = @$list_ref;
421 @invmap = @$map_ref;
422 $map_format = $format;
423 $map_default = $default;
424 $maps_to_code_point = $map_format =~ /x/;
425 $to_adjust = $map_format =~ /a/;
426 }
0f5e3c71
KW
427 }
428 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
ceb1de32 429
99f21fb9
KW
430 # Re-order the Unicode code points to native ones for this platform.
431 # This is only needed for code points below 256, because native code
432 # points are only in that range. For inversion maps of properties
433 # where the mappings are adjusted (format =~ /a/), this reordering
434 # could mess up the adjustment pattern that was in the input, so that
435 # has to be dealt with.
436 #
437 # And inversion maps that map to code points need to eventually have
438 # all those code points remapped to native, and it's better to do that
439 # here, going through the whole list not just those below 256. This
440 # is because some inversion maps have adjustments (format =~ /a/)
441 # which may be affected by the reordering. This code needs to be done
442 # both for when we are translating the inversion lists for < 256, and
443 # for the inversion maps for everything. By doing both in this loop,
444 # we can share that code.
445 #
446 # So, we go through everything for an inversion map to code points;
447 # otherwise, we can skip any remapping at all if we are going to
448 # output only the above-Latin1 values, or if the range spans the whole
449 # of 0..256, as the remap will also include all of 0..256 (256 not
450 # 255 because a re-ordering could cause 256 to need to be in the same
451 # range as 255.)
452 if ((@invmap && $maps_to_code_point)
453 || (! $nonl1_only || ($invlist[0] < 256
454 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 455 {
fb4554ea 456
99f21fb9 457 if (! @invmap) { # Straight inversion list
fb4554ea
KW
458 # Look at all the ranges that start before 257.
459 my @latin1_list;
460 while (@invlist) {
461 last if $invlist[0] > 256;
462 my $upper = @invlist > 1
463 ? $invlist[1] - 1 # In range
8a6c81cf
KW
464
465 # To infinity. You may want to stop much much
466 # earlier; going this high may expose perl
467 # deficiencies with very large numbers.
468 : $Unicode::UCD::MAX_CP;
fb4554ea 469 for my $j ($invlist[0] .. $upper) {
99f21fb9 470 push @latin1_list, a2n($j);
0f5e3c71 471 }
fb4554ea
KW
472
473 shift @invlist; # Shift off the range that's in the list
474 shift @invlist; # Shift off the range not in the list
0c4ecf42 475 }
fb4554ea
KW
476
477 # Here @invlist contains all the ranges in the original that start
478 # at code points above 256, and @latin1_list contains all the
479 # native code points for ranges that start with a Unicode code
480 # point below 257. We sort the latter and convert it to inversion
481 # list format. Then simply prepend it to the list of the higher
482 # code points.
483 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 484 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 485 unshift @invlist, @latin1_list;
99f21fb9
KW
486 }
487 else { # Is an inversion map
488
489 # This is a similar procedure as plain inversion list, but has
490 # multiple buckets. A plain inversion list just has two
491 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
492 # pretty much can ignore the 2nd bucket, as it is completely
493 # defined by the 1st. But here, what we do is create buckets
494 # which contain the code points that map to each, translated
495 # to native and turned into an inversion list. Thus each
496 # bucket is an inversion list of native code points that map
497 # to it or don't map to it. We use these to create an
498 # inversion map for the whole property.
499
500 # As mentioned earlier, we use this procedure to not just
501 # remap the inversion list to native values, but also the maps
502 # of code points to native ones. In the latter case we have
503 # to look at the whole of the inversion map (or at least to
504 # above Unicode; as the maps of code points above that should
505 # all be to the default).
506 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
507
508 my %mapped_lists; # A hash whose keys are the buckets.
509 while (@invlist) {
510 last if $invlist[0] > $upper_limit;
511
512 # This shouldn't actually happen, as prop_invmap() returns
513 # an extra element at the end that is beyond $upper_limit
514 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
515
516 my $bucket;
517
518 # A hash key can't be a ref (we are only expecting arrays
519 # of scalars here), so convert any such to a string that
520 # will be converted back later (using a vertical tab as
521 # the separator). Even if the mapping is to code points,
522 # we don't translate to native here because the code
523 # output_map() calls to output these arrays assumes the
524 # input is Unicode, not native.
525 if (ref $invmap[0]) {
526 $bucket = join "\cK", @{$invmap[0]};
527 }
528 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
529
530 # Do convert to native for maps to single code points.
531 # There are some properties that have a few outlier
532 # maps that aren't code points, so the above test
533 # skips those.
534 $bucket = a2n($invmap[0]);
535 } else {
536 $bucket = $invmap[0];
537 }
538
539 # We now have the bucket that all code points in the range
540 # map to, though possibly they need to be adjusted. Go
541 # through the range and put each translated code point in
542 # it into its bucket.
543 my $base_map = $invmap[0];
544 for my $j ($invlist[0] .. $invlist[1] - 1) {
545 if ($to_adjust
546 # The 1st code point doesn't need adjusting
547 && $j > $invlist[0]
548
549 # Skip any non-numeric maps: these are outliers
550 # that aren't code points.
551 && $base_map =~ $numeric_re
552
553 # 'ne' because the default can be a string
554 && $base_map ne $map_default)
555 {
556 # We adjust, by incrementing each the bucket and
557 # the map. For code point maps, translate to
558 # native
559 $base_map++;
560 $bucket = ($maps_to_code_point)
561 ? a2n($base_map)
562 : $base_map;
563 }
564
565 # Add the native code point to the bucket for the
566 # current map
567 push @{$mapped_lists{$bucket}}, a2n($j);
568 } # End of loop through all code points in the range
569
570 # Get ready for the next range
571 shift @invlist;
572 shift @invmap;
573 } # End of loop through all ranges in the map.
574
575 # Here, @invlist and @invmap retain all the ranges from the
576 # originals that start with code points above $upper_limit.
577 # Each bucket in %mapped_lists contains all the code points
578 # that map to that bucket. If the bucket is for a map to a
579 # single code point is a single code point, the bucket has
580 # been converted to native. If something else (including
581 # multiple code points), no conversion is done.
582 #
583 # Now we recreate the inversion map into %xlated, but this
584 # time for the native character set.
585 my %xlated;
586 foreach my $bucket (keys %mapped_lists) {
587
588 # Sort and convert this bucket to an inversion list. The
589 # result will be that ranges that start with even-numbered
590 # indexes will be for code points that map to this bucket;
591 # odd ones map to some other bucket, and are discarded
592 # below.
593 @{$mapped_lists{$bucket}}
594 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
595 @{$mapped_lists{$bucket}}
596 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
597
598 # Add each even-numbered range in the bucket to %xlated;
599 # so that the keys of %xlated become the range start code
600 # points, and the values are their corresponding maps.
601 while (@{$mapped_lists{$bucket}}) {
602 my $range_start = $mapped_lists{$bucket}->[0];
603 if ($bucket =~ /\cK/) {
604 @{$xlated{$range_start}} = split /\cK/, $bucket;
605 }
606 else {
607 $xlated{$range_start} = $bucket;
608 }
609 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
610 shift @{$mapped_lists{$bucket}}; # Get ready for next
611 # iteration
612 }
613 } # End of loop through all the buckets.
614
615 # Here %xlated's keys are the range starts of all the code
616 # points in the inversion map. Construct an inversion list
617 # from them.
618 my @new_invlist = sort { $a <=> $b } keys %xlated;
619
620 # If the list is adjusted, we want to munge this list so that
621 # we only have one entry for where consecutive code points map
622 # to consecutive values. We just skip the subsequent entries
623 # where this is the case.
624 if ($to_adjust) {
625 my @temp;
626 for my $i (0 .. @new_invlist - 1) {
627 next if $i > 0
628 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
629 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
630 && $xlated{$new_invlist[$i]} =~ $numeric_re
631 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
632 push @temp, $new_invlist[$i];
633 }
634 @new_invlist = @temp;
635 }
636
637 # The inversion map comes from %xlated's values. We can
638 # unshift each onto the front of the untouched portion, in
639 # reverse order of the portion we did process.
640 foreach my $start (reverse @new_invlist) {
641 unshift @invmap, $xlated{$start};
642 }
643
644 # Finally prepend the inversion list we have just constructed to the
645 # one that contains anything we didn't process.
646 unshift @invlist, @new_invlist;
647 }
648 }
649
650 # prop_invmap() returns an extra final entry, which we can now
651 # discard.
652 if (@invmap) {
653 pop @invlist;
654 pop @invmap;
ceb1de32 655 }
0f5e3c71
KW
656
657 if ($l1_only) {
99f21fb9 658 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
659 for my $i (0 .. @invlist - 1 - 1) {
660 if ($invlist[$i] > 255) {
661
662 # In an inversion list, even-numbered elements give the code
663 # points that begin ranges that match the property;
664 # odd-numbered give ones that begin ranges that don't match.
665 # If $i is odd, we are at the first code point above 255 that
666 # doesn't match, which means the range it is ending does
667 # match, and crosses the 255/256 boundary. We want to include
668 # this ending point, so increment $i, so the splice below
669 # includes it. Conversely, if $i is even, it is the first
670 # code point above 255 that matches, which means there was no
671 # matching range that crossed the boundary, and we don't want
672 # to include this code point, so splice before it.
673 $i++ if $i % 2 != 0;
674
675 # Remove everything past this.
676 splice @invlist, $i;
99f21fb9 677 splice @invmap, $i if @invmap;
0f5e3c71
KW
678 last;
679 }
0c4ecf42
KW
680 }
681 }
0f5e3c71
KW
682 elsif ($nonl1_only) {
683 my $found_nonl1 = 0;
684 for my $i (0 .. @invlist - 1 - 1) {
685 next if $invlist[$i] < 256;
686
687 # Here, we have the first element in the array that indicates an
688 # element above Latin1. Get rid of all previous ones.
689 splice @invlist, 0, $i;
99f21fb9 690 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
691
692 # If this one's index is not divisible by 2, it means that this
693 # element is inverting away from being in the list, which means
99f21fb9
KW
694 # all code points from 256 to this one are in this list (or
695 # map to the default for inversion maps)
696 if ($i % 2 != 0) {
697 unshift @invlist, 256;
698 unshift @invmap, $map_default if @invmap;
699 }
0f5e3c71 700 $found_nonl1 = 1;
3f427fd9
KW
701 last;
702 }
0f5e3c71 703 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 704 }
3f427fd9 705
0f5e3c71 706 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 707 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 708 }
bffc0129 709 end_file_pound_if;
0c4ecf42 710 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
711}
712
9a3da3ad
FC
713my @sources = ($0, "lib/Unicode/UCD.pm");
714{
715 # Depend on mktables’ own sources. It’s a shorter list of files than
716 # those that Unicode::UCD uses.
717 open my $mktables_list, "lib/unicore/mktables.lst"
718 or die "$0 cannot open lib/unicore/mktables.lst: $!";
719 while(<$mktables_list>) {
720 last if /===/;
721 chomp;
722 push @sources, "lib/unicore/$_" if /^[^#]/;
723 }
724}
725read_only_bottom_close_and_rename($out_fh, \@sources)