This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Add capability for line break props
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
9d9177be 11require 'regen/regen_lib.pl';
0c4ecf42 12require 'regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
43b443dd
KW
41my $is_in_ifndef_ext_re = 0;
42
9d9177be
KW
43print $out_fh "/* See the generating file for comments */\n\n";
44
015bb97c
CB
45my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
46
99f21fb9
KW
47my @a2n;
48
49sub uniques {
50 # Returns non-duplicated input values. From "Perl Best Practices:
51 # Encapsulated Cleverness". p. 455 in first edition.
52
53 my %seen;
54 return grep { ! $seen{$_}++ } @_;
55}
56
57sub a2n($) {
58 my $cp = shift;
59
60 # Returns the input Unicode code point translated to native.
61
62 return $cp if $cp !~ $numeric_re || $cp > 255;
63 return $a2n[$cp];
64}
65
43b443dd
KW
66sub end_ifndef_ext_re {
67 if ($is_in_ifndef_ext_re) {
68 print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
69 $is_in_ifndef_ext_re = 0;
70 }
71}
72
0c4ecf42 73sub output_invlist ($$;$) {
9d9177be
KW
74 my $name = shift;
75 my $invlist = shift; # Reference to inversion list array
0c4ecf42 76 my $charset = shift // ""; # name of character set for comment
9d9177be 77
76d3994c
KW
78 die "No inversion list for $name" unless defined $invlist
79 && ref $invlist eq 'ARRAY'
80 && @$invlist;
81
9d9177be
KW
82 # Output the inversion list $invlist using the name $name for it.
83 # It is output in the exact internal form for inversion lists.
84
a0316a6c
KW
85 # Is the last element of the header 0, or 1 ?
86 my $zero_or_one = 0;
a0316a6c
KW
87 if ($invlist->[0] != 0) {
88 unshift @$invlist, 0;
9d9177be
KW
89 $zero_or_one = 1;
90 }
0a07b44b 91 my $count = @$invlist;
9d9177be 92
43b443dd
KW
93 if ($is_in_ifndef_ext_re) {
94 if (exists $include_in_ext_re{$name}) {
95 end_ifndef_ext_re;
96 }
97 }
98 elsif (! exists $include_in_ext_re{$name}) {
99 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
100 $is_in_ifndef_ext_re = 1;
101 }
102
0c4ecf42
KW
103 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
104 print $out_fh " /* for $charset */" if $charset;
105 print $out_fh "\n";
9d9177be 106
a0316a6c 107 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
108 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
109 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
110 ",\t/* 0 if the list starts at 0;",
111 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
112
113 # The main body are the UVs passed in to this routine. Do the final
114 # element separately
47d53124
KW
115 for my $i (0 .. @$invlist - 1) {
116 printf $out_fh "\t0x%X", $invlist->[$i];
117 print $out_fh "," if $i < @$invlist - 1;
118 print $out_fh "\n";
9d9177be
KW
119 }
120
9d9177be
KW
121 print $out_fh "};\n";
122}
123
99f21fb9
KW
124sub output_invmap ($$$$$$$) {
125 my $name = shift;
126 my $invmap = shift; # Reference to inversion map array
127 my $prop_name = shift;
128 my $input_format = shift; # The inversion map's format
129 my $default = shift; # The property value for code points who
130 # otherwise don't have a value specified.
131 my $extra_enums = shift; # comma-separated list of our additions to the
132 # property's standard possible values
133 my $charset = shift // ""; # name of character set for comment
134
135 # Output the inversion map $invmap for property $prop_name, but use $name
136 # as the actual data structure's name.
137
138 my $count = @$invmap;
139
140 my $output_format;
141 my $declaration_type;
142 my %enums;
143 my $name_prefix;
144
145 if ($input_format eq 's') {
146 $prop_name = (prop_aliases($prop_name))[1]; # Get full name
147 my @enums = prop_values($prop_name);
148 if (! @enums) {
149 die "Only enum properties are currently handled; '$prop_name' isn't one";
150 }
151 else {
152
153 # Convert short names to long, add in the extras, and sort.
154 @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums;
155 push @enums, split /,/, $extra_enums if $extra_enums ne "";
156 @enums = sort @enums;
157
158 # Assign a value to each element of the enum. The default
159 # value always gets 0; the others are arbitrarily assigned.
160 my $enum_val = 0;
161 $default = prop_value_aliases($prop_name, $default);
162 $enums{$default} = $enum_val++;
163 for my $enum (@enums) {
164 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
165 }
166 }
167
168 end_ifndef_ext_re;
169 {
170
171 my $short_name = (prop_aliases($prop_name))[0];
172
173 # The short names tend to be two lower case letters, but it looks
174 # better for those if they are upper. XXX
175 $short_name = uc($short_name) if length($short_name) < 3
176 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
177 $name_prefix = "PL_${short_name}_";
178 my $enum_count = keys %enums;
179 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
180
181 print $out_fh "\ntypedef enum {\n";
182 print $out_fh "\t${name_prefix}$default = $enums{$default},\n";
183 delete $enums{$default};
184 foreach my $enum (sort { $a cmp $b } keys %enums) {
185 print $out_fh "\t${name_prefix}$enum = $enums{$enum}";
186 print $out_fh "," if $enums{$enum} < $enum_count - 1;
187 print $out_fh "\n";
188 }
189 $declaration_type = "${name_prefix}enum";
190 print $out_fh "} $declaration_type;\n";
191
192 $output_format = "${name_prefix}%s";
193 }
194 }
195 else {
196 die "'$input_format' invmap() format for '$prop_name' unimplemented";
197 }
198
199 die "No inversion map for $prop_name" unless defined $invmap
200 && ref $invmap eq 'ARRAY'
201 && $count;
202
203 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
204 print $out_fh " /* for $charset */" if $charset;
205 print $out_fh "\n";
206
207 # The main body are the scalars passed in to this routine.
208 for my $i (0 .. $count - 1) {
209 my $element = $invmap->[$i];
210 $element = $name_prefix . prop_value_aliases($prop_name, $element);
211 print $out_fh "\t$element";
212 print $out_fh "," if $i < $count - 1;
213 print $out_fh "\n";
214 }
215 print $out_fh "};\n";
216
217 unless ($is_in_ifndef_ext_re) {
218 print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n";
219 $is_in_ifndef_ext_re = 1;
220 }
221}
222
5a7e5385 223sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
224
225 # Returns an inversion list constructed from the sorted input array of
226 # code points
227
228 my $list_ref = shift;
229
99f21fb9
KW
230 return unless @$list_ref;
231
a02047bf
KW
232 # Initialize to just the first element
233 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
234
235 # For each succeeding element, if it extends the previous range, adjust
236 # up, otherwise add it.
237 for my $i (1 .. @$list_ref - 1) {
238 if ($invlist[-1] == $list_ref->[$i]) {
239 $invlist[-1]++;
240 }
241 else {
242 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
243 }
244 }
245 return @invlist;
246}
247
248# Read in the Case Folding rules, and construct arrays of code points for the
249# properties we need.
250my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
251die "Could not find inversion map for Case_Folding" unless defined $format;
252die "Incorrect format '$format' for Case_Folding inversion map"
253 unless $format eq 'al';
254my @has_multi_char_fold;
255my @is_non_final_fold;
256
257for my $i (0 .. @$folds_ref - 1) {
258 next unless ref $folds_ref->[$i]; # Skip single-char folds
259 push @has_multi_char_fold, $cp_ref->[$i];
260
b6a6e956 261 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
262 # position
263 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
264 push @is_non_final_fold, $folds_ref->[$i][$j]
265 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
266 }
267}
268
a02047bf
KW
269sub _Perl_Non_Final_Folds {
270 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 271 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
272}
273
99f21fb9
KW
274sub prop_name_for_cmp ($) { # Sort helper
275 my $name = shift;
276
277 # Returns the input lowercased, with non-alphas removed, as well as
278 # everything starting with a comma
279
280 $name =~ s/,.*//;
281 $name =~ s/[[:^alpha:]]//g;
282 return lc $name;
283}
284
892d8259 285sub UpperLatin1 {
5a7e5385 286 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
287}
288
9d9177be
KW
289output_invlist("Latin1", [ 0, 256 ]);
290output_invlist("AboveLatin1", [ 256 ]);
291
43b443dd
KW
292end_ifndef_ext_re;
293
3f427fd9
KW
294# We construct lists for all the POSIX and backslash sequence character
295# classes in two forms:
296# 1) ones which match only in the ASCII range
297# 2) ones which match either in the Latin1 range, or the entire Unicode range
298#
299# These get compiled in, and hence affect the memory footprint of every Perl
300# program, even those not using Unicode. To minimize the size, currently
301# the Latin1 version is generated for the beyond ASCII range except for those
302# lists that are quite small for the entire range, such as for \s, which is 22
303# UVs long plus 4 UVs (currently) for the header.
304#
305# To save even more memory, the ASCII versions could be derived from the
306# larger ones at runtime, saving some memory (minus the expense of the machine
307# instructions to do so), but these are all small anyway, so their total is
308# about 100 UVs.
309#
310# In the list of properties below that get generated, the L1 prefix is a fake
311# property that means just the Latin1 range of the full property (whose name
312# has an X prefix instead of L1).
a02047bf
KW
313#
314# An initial & means to use the subroutine from this file instead of an
315# official inversion list.
3f427fd9 316
0c4ecf42
KW
317for my $charset (get_supported_code_pages()) {
318 print $out_fh "\n" . get_conditional_compile_line_start($charset);
319
99f21fb9
KW
320 @a2n = @{get_a2n($charset)};
321 no warnings 'qw';
322 # Ignore non-alpha in sort
323 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1c8c3428
KW
324 ASCII
325 Cased
326 VertSpace
327 XPerlSpace
328 XPosixAlnum
329 XPosixAlpha
330 XPosixBlank
331 XPosixCntrl
332 XPosixDigit
333 XPosixGraph
334 XPosixLower
335 XPosixPrint
336 XPosixPunct
337 XPosixSpace
338 XPosixUpper
339 XPosixWord
340 XPosixXDigit
341 _Perl_Any_Folds
342 &NonL1_Perl_Non_Final_Folds
343 _Perl_Folds_To_Multi_Char
344 &UpperLatin1
345 _Perl_IDStart
346 _Perl_IDCont
347 )
0f5e3c71
KW
348 ) {
349
350 # For the Latin1 properties, we change to use the eXtended version of the
351 # base property, then go through the result and get rid of everything not
352 # in Latin1 (above 255). Actually, we retain the element for the range
353 # that crosses the 255/256 boundary if it is one that matches the
354 # property. For example, in the Word property, there is a range of code
355 # points that start at U+00F8 and goes through U+02C1. Instead of
356 # artificially cutting that off at 256 because 256 is the first code point
357 # above Latin1, we let the range go to its natural ending. That gives us
358 # extra information with no added space taken. But if the range that
359 # crosses the boundary is one that doesn't match the property, we don't
360 # start a new range above 255, as that could be construed as going to
361 # infinity. For example, the Upper property doesn't include the character
362 # at 255, but does include the one at 256. We don't include the 256 one.
363 my $prop_name = $prop;
364 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
365 my $extra_enums = "";
366 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
367 my $lookup_prop = $prop_name;
368 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
369 or $lookup_prop =~ s/^L1//);
370 my $nonl1_only = 0;
371 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 372 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
373
374 my @invlist;
99f21fb9
KW
375 my @invmap;
376 my $map_format;
377 my $map_default;
378 my $maps_to_code_point;
379 my $to_adjust;
0f5e3c71
KW
380 if ($is_local_sub) {
381 @invlist = eval $lookup_prop;
382 }
383 else {
384 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9
KW
385 if (! @invlist) {
386 my ($list_ref, $map_ref, $format, $default);
387
388 ($list_ref, $map_ref, $format, $default)
389 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
390 die "Could not find inversion list for '$lookup_prop'" unless $list_ref;
391 @invlist = @$list_ref;
392 @invmap = @$map_ref;
393 $map_format = $format;
394 $map_default = $default;
395 $maps_to_code_point = $map_format =~ /x/;
396 $to_adjust = $map_format =~ /a/;
397 }
0f5e3c71
KW
398 }
399 die "Could not find inversion list for '$lookup_prop'" unless @invlist;
ceb1de32 400
99f21fb9
KW
401 # Re-order the Unicode code points to native ones for this platform.
402 # This is only needed for code points below 256, because native code
403 # points are only in that range. For inversion maps of properties
404 # where the mappings are adjusted (format =~ /a/), this reordering
405 # could mess up the adjustment pattern that was in the input, so that
406 # has to be dealt with.
407 #
408 # And inversion maps that map to code points need to eventually have
409 # all those code points remapped to native, and it's better to do that
410 # here, going through the whole list not just those below 256. This
411 # is because some inversion maps have adjustments (format =~ /a/)
412 # which may be affected by the reordering. This code needs to be done
413 # both for when we are translating the inversion lists for < 256, and
414 # for the inversion maps for everything. By doing both in this loop,
415 # we can share that code.
416 #
417 # So, we go through everything for an inversion map to code points;
418 # otherwise, we can skip any remapping at all if we are going to
419 # output only the above-Latin1 values, or if the range spans the whole
420 # of 0..256, as the remap will also include all of 0..256 (256 not
421 # 255 because a re-ordering could cause 256 to need to be in the same
422 # range as 255.)
423 if ((@invmap && $maps_to_code_point)
424 || (! $nonl1_only || ($invlist[0] < 256
425 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 426 {
fb4554ea 427
99f21fb9 428 if (! @invmap) { # Straight inversion list
fb4554ea
KW
429 # Look at all the ranges that start before 257.
430 my @latin1_list;
431 while (@invlist) {
432 last if $invlist[0] > 256;
433 my $upper = @invlist > 1
434 ? $invlist[1] - 1 # In range
8a6c81cf
KW
435
436 # To infinity. You may want to stop much much
437 # earlier; going this high may expose perl
438 # deficiencies with very large numbers.
439 : $Unicode::UCD::MAX_CP;
fb4554ea 440 for my $j ($invlist[0] .. $upper) {
99f21fb9 441 push @latin1_list, a2n($j);
0f5e3c71 442 }
fb4554ea
KW
443
444 shift @invlist; # Shift off the range that's in the list
445 shift @invlist; # Shift off the range not in the list
0c4ecf42 446 }
fb4554ea
KW
447
448 # Here @invlist contains all the ranges in the original that start
449 # at code points above 256, and @latin1_list contains all the
450 # native code points for ranges that start with a Unicode code
451 # point below 257. We sort the latter and convert it to inversion
452 # list format. Then simply prepend it to the list of the higher
453 # code points.
454 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 455 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 456 unshift @invlist, @latin1_list;
99f21fb9
KW
457 }
458 else { # Is an inversion map
459
460 # This is a similar procedure as plain inversion list, but has
461 # multiple buckets. A plain inversion list just has two
462 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
463 # pretty much can ignore the 2nd bucket, as it is completely
464 # defined by the 1st. But here, what we do is create buckets
465 # which contain the code points that map to each, translated
466 # to native and turned into an inversion list. Thus each
467 # bucket is an inversion list of native code points that map
468 # to it or don't map to it. We use these to create an
469 # inversion map for the whole property.
470
471 # As mentioned earlier, we use this procedure to not just
472 # remap the inversion list to native values, but also the maps
473 # of code points to native ones. In the latter case we have
474 # to look at the whole of the inversion map (or at least to
475 # above Unicode; as the maps of code points above that should
476 # all be to the default).
477 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
478
479 my %mapped_lists; # A hash whose keys are the buckets.
480 while (@invlist) {
481 last if $invlist[0] > $upper_limit;
482
483 # This shouldn't actually happen, as prop_invmap() returns
484 # an extra element at the end that is beyond $upper_limit
485 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
486
487 my $bucket;
488
489 # A hash key can't be a ref (we are only expecting arrays
490 # of scalars here), so convert any such to a string that
491 # will be converted back later (using a vertical tab as
492 # the separator). Even if the mapping is to code points,
493 # we don't translate to native here because the code
494 # output_map() calls to output these arrays assumes the
495 # input is Unicode, not native.
496 if (ref $invmap[0]) {
497 $bucket = join "\cK", @{$invmap[0]};
498 }
499 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
500
501 # Do convert to native for maps to single code points.
502 # There are some properties that have a few outlier
503 # maps that aren't code points, so the above test
504 # skips those.
505 $bucket = a2n($invmap[0]);
506 } else {
507 $bucket = $invmap[0];
508 }
509
510 # We now have the bucket that all code points in the range
511 # map to, though possibly they need to be adjusted. Go
512 # through the range and put each translated code point in
513 # it into its bucket.
514 my $base_map = $invmap[0];
515 for my $j ($invlist[0] .. $invlist[1] - 1) {
516 if ($to_adjust
517 # The 1st code point doesn't need adjusting
518 && $j > $invlist[0]
519
520 # Skip any non-numeric maps: these are outliers
521 # that aren't code points.
522 && $base_map =~ $numeric_re
523
524 # 'ne' because the default can be a string
525 && $base_map ne $map_default)
526 {
527 # We adjust, by incrementing each the bucket and
528 # the map. For code point maps, translate to
529 # native
530 $base_map++;
531 $bucket = ($maps_to_code_point)
532 ? a2n($base_map)
533 : $base_map;
534 }
535
536 # Add the native code point to the bucket for the
537 # current map
538 push @{$mapped_lists{$bucket}}, a2n($j);
539 } # End of loop through all code points in the range
540
541 # Get ready for the next range
542 shift @invlist;
543 shift @invmap;
544 } # End of loop through all ranges in the map.
545
546 # Here, @invlist and @invmap retain all the ranges from the
547 # originals that start with code points above $upper_limit.
548 # Each bucket in %mapped_lists contains all the code points
549 # that map to that bucket. If the bucket is for a map to a
550 # single code point is a single code point, the bucket has
551 # been converted to native. If something else (including
552 # multiple code points), no conversion is done.
553 #
554 # Now we recreate the inversion map into %xlated, but this
555 # time for the native character set.
556 my %xlated;
557 foreach my $bucket (keys %mapped_lists) {
558
559 # Sort and convert this bucket to an inversion list. The
560 # result will be that ranges that start with even-numbered
561 # indexes will be for code points that map to this bucket;
562 # odd ones map to some other bucket, and are discarded
563 # below.
564 @{$mapped_lists{$bucket}}
565 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
566 @{$mapped_lists{$bucket}}
567 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
568
569 # Add each even-numbered range in the bucket to %xlated;
570 # so that the keys of %xlated become the range start code
571 # points, and the values are their corresponding maps.
572 while (@{$mapped_lists{$bucket}}) {
573 my $range_start = $mapped_lists{$bucket}->[0];
574 if ($bucket =~ /\cK/) {
575 @{$xlated{$range_start}} = split /\cK/, $bucket;
576 }
577 else {
578 $xlated{$range_start} = $bucket;
579 }
580 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
581 shift @{$mapped_lists{$bucket}}; # Get ready for next
582 # iteration
583 }
584 } # End of loop through all the buckets.
585
586 # Here %xlated's keys are the range starts of all the code
587 # points in the inversion map. Construct an inversion list
588 # from them.
589 my @new_invlist = sort { $a <=> $b } keys %xlated;
590
591 # If the list is adjusted, we want to munge this list so that
592 # we only have one entry for where consecutive code points map
593 # to consecutive values. We just skip the subsequent entries
594 # where this is the case.
595 if ($to_adjust) {
596 my @temp;
597 for my $i (0 .. @new_invlist - 1) {
598 next if $i > 0
599 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
600 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
601 && $xlated{$new_invlist[$i]} =~ $numeric_re
602 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
603 push @temp, $new_invlist[$i];
604 }
605 @new_invlist = @temp;
606 }
607
608 # The inversion map comes from %xlated's values. We can
609 # unshift each onto the front of the untouched portion, in
610 # reverse order of the portion we did process.
611 foreach my $start (reverse @new_invlist) {
612 unshift @invmap, $xlated{$start};
613 }
614
615 # Finally prepend the inversion list we have just constructed to the
616 # one that contains anything we didn't process.
617 unshift @invlist, @new_invlist;
618 }
619 }
620
621 # prop_invmap() returns an extra final entry, which we can now
622 # discard.
623 if (@invmap) {
624 pop @invlist;
625 pop @invmap;
ceb1de32 626 }
0f5e3c71
KW
627
628 if ($l1_only) {
99f21fb9 629 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
630 for my $i (0 .. @invlist - 1 - 1) {
631 if ($invlist[$i] > 255) {
632
633 # In an inversion list, even-numbered elements give the code
634 # points that begin ranges that match the property;
635 # odd-numbered give ones that begin ranges that don't match.
636 # If $i is odd, we are at the first code point above 255 that
637 # doesn't match, which means the range it is ending does
638 # match, and crosses the 255/256 boundary. We want to include
639 # this ending point, so increment $i, so the splice below
640 # includes it. Conversely, if $i is even, it is the first
641 # code point above 255 that matches, which means there was no
642 # matching range that crossed the boundary, and we don't want
643 # to include this code point, so splice before it.
644 $i++ if $i % 2 != 0;
645
646 # Remove everything past this.
647 splice @invlist, $i;
99f21fb9 648 splice @invmap, $i if @invmap;
0f5e3c71
KW
649 last;
650 }
0c4ecf42
KW
651 }
652 }
0f5e3c71
KW
653 elsif ($nonl1_only) {
654 my $found_nonl1 = 0;
655 for my $i (0 .. @invlist - 1 - 1) {
656 next if $invlist[$i] < 256;
657
658 # Here, we have the first element in the array that indicates an
659 # element above Latin1. Get rid of all previous ones.
660 splice @invlist, 0, $i;
99f21fb9 661 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
662
663 # If this one's index is not divisible by 2, it means that this
664 # element is inverting away from being in the list, which means
99f21fb9
KW
665 # all code points from 256 to this one are in this list (or
666 # map to the default for inversion maps)
667 if ($i % 2 != 0) {
668 unshift @invlist, 256;
669 unshift @invmap, $map_default if @invmap;
670 }
0f5e3c71 671 $found_nonl1 = 1;
3f427fd9
KW
672 last;
673 }
0f5e3c71 674 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 675 }
3f427fd9 676
0f5e3c71 677 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 678 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 679 }
43b443dd 680 end_ifndef_ext_re;
0c4ecf42 681 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
682}
683
9a3da3ad
FC
684my @sources = ($0, "lib/Unicode/UCD.pm");
685{
686 # Depend on mktables’ own sources. It’s a shorter list of files than
687 # those that Unicode::UCD uses.
688 open my $mktables_list, "lib/unicore/mktables.lst"
689 or die "$0 cannot open lib/unicore/mktables.lst: $!";
690 while(<$mktables_list>) {
691 last if /===/;
692 chomp;
693 push @sources, "lib/unicore/$_" if /^[^#]/;
694 }
695}
696read_only_bottom_close_and_rename($out_fh, \@sources)