Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
10 | ); | |
9d9177be | 11 | require 'regen/regen_lib.pl'; |
0c4ecf42 | 12 | require 'regen/charset_translations.pl'; |
9d9177be KW |
13 | |
14 | # This program outputs charclass_invlists.h, which contains various inversion | |
15 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 | # light-weight fast wrapper to make them usable at run-time. | |
18 | ||
19 | # As such, this code knows about the internal structure of these lists, and | |
20 | # any change made to that has to be done here as well. A random number stored | |
21 | # in the headers is used to minimize the possibility of things getting | |
22 | # out-of-sync, or the wrong data structure being passed. Currently that | |
23 | # random number is: | |
99f21fb9 KW |
24 | |
25 | # charclass_invlists.h now also has a partial implementation of inversion | |
26 | # maps; enough to generate tables for the line break properties, such as GCB | |
27 | ||
0a07b44b | 28 | my $VERSION_DATA_STRUCTURE_TYPE = 148565664; |
9d9177be | 29 | |
99f21fb9 KW |
30 | # integer or float |
31 | my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax; | |
32 | ||
33 | # Matches valid C language enum names: begins with ASCII alphabetic, then any | |
34 | # ASCII \w | |
35 | my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; | |
36 | ||
9d9177be KW |
37 | my $out_fh = open_new('charclass_invlists.h', '>', |
38 | {style => '*', by => $0, | |
39 | from => "Unicode::UCD"}); | |
40 | ||
bffc0129 | 41 | my $in_file_pound_if = 0; |
43b443dd | 42 | |
9d9177be KW |
43 | print $out_fh "/* See the generating file for comments */\n\n"; |
44 | ||
bffc0129 KW |
45 | # The symbols generated by this program are all currently defined only in a |
46 | # single dot c each. The code knows where most of them go, but this hash | |
47 | # gives overrides for the exceptions to the typical place | |
48 | my %exceptions_to_where_to_define = | |
49 | ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C', | |
50 | AboveLatin1 => 'PERL_IN_REGCOMP_C', | |
51 | Latin1 => 'PERL_IN_REGCOMP_C', | |
52 | UpperLatin1 => 'PERL_IN_REGCOMP_C', | |
53 | _Perl_Any_Folds => 'PERL_IN_REGCOMP_C', | |
54 | _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C', | |
55 | _Perl_IDCont => 'PERL_IN_UTF8_C', | |
56 | _Perl_IDStart => 'PERL_IN_UTF8_C', | |
57 | ); | |
015bb97c | 58 | |
f79a09fc KW |
59 | # This hash contains the properties with enums that have hard-coded references |
60 | # to them in C code. Its only use is to make sure that if perl is compiled | |
61 | # with an older Unicode data set, that all the enum values the code is | |
62 | # expecting will still be in the enum typedef. Thus the code doesn't have to | |
63 | # change. The Unicode version won't have any code points that have these enum | |
64 | # values, so the code that handles them will not get exercised. This is far | |
65 | # better than having to #ifdef things. | |
66 | my %hard_coded_enums = | |
67 | ( gcb => [ | |
68 | 'Control', | |
69 | 'CR', | |
70 | 'Extend', | |
71 | 'L', | |
72 | 'LF', | |
73 | 'LV', | |
74 | 'LVT', | |
75 | 'Other', | |
76 | 'Prepend', | |
77 | 'Regional_Indicator', | |
78 | 'SpacingMark', | |
79 | 'T', | |
80 | 'V', | |
81 | ], | |
82 | sb => [ | |
83 | 'ATerm', | |
84 | 'Close', | |
85 | 'CR', | |
86 | 'Extend', | |
87 | 'Format', | |
88 | 'LF', | |
89 | 'Lower', | |
90 | 'Numeric', | |
91 | 'OLetter', | |
92 | 'Other', | |
93 | 'SContinue', | |
94 | 'Sep', | |
95 | 'Sp', | |
96 | 'STerm', | |
97 | 'Upper', | |
98 | ], | |
99 | wb => [ | |
100 | 'ALetter', | |
101 | 'CR', | |
102 | 'Double_Quote', | |
103 | 'Extend', | |
104 | 'ExtendNumLet', | |
105 | 'Format', | |
106 | 'Hebrew_Letter', | |
107 | 'Katakana', | |
108 | 'LF', | |
109 | 'MidLetter', | |
110 | 'MidNum', | |
111 | 'MidNumLet', | |
112 | 'Newline', | |
113 | 'Numeric', | |
114 | 'Other', | |
115 | 'Regional_Indicator', | |
116 | 'Single_Quote', | |
117 | 'UNKNOWN', | |
118 | ], | |
119 | ); | |
120 | ||
99f21fb9 KW |
121 | my @a2n; |
122 | ||
123 | sub uniques { | |
124 | # Returns non-duplicated input values. From "Perl Best Practices: | |
125 | # Encapsulated Cleverness". p. 455 in first edition. | |
126 | ||
127 | my %seen; | |
128 | return grep { ! $seen{$_}++ } @_; | |
129 | } | |
130 | ||
131 | sub a2n($) { | |
132 | my $cp = shift; | |
133 | ||
134 | # Returns the input Unicode code point translated to native. | |
135 | ||
136 | return $cp if $cp !~ $numeric_re || $cp > 255; | |
137 | return $a2n[$cp]; | |
138 | } | |
139 | ||
bffc0129 KW |
140 | sub end_file_pound_if { |
141 | if ($in_file_pound_if) { | |
142 | print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; | |
143 | $in_file_pound_if = 0; | |
144 | } | |
145 | } | |
146 | ||
147 | sub switch_pound_if ($$) { | |
148 | my $name = shift; | |
149 | my $new_pound_if = shift; | |
150 | ||
151 | # Switch to new #if given by the 2nd argument. If there is an override | |
152 | # for this, it instead switches to that. The 1st argument is the | |
153 | # static's name, used to look up the overrides | |
154 | ||
155 | if (exists $exceptions_to_where_to_define{$name}) { | |
156 | $new_pound_if = $exceptions_to_where_to_define{$name}; | |
157 | } | |
158 | ||
159 | # Exit current #if if the new one is different from the old | |
160 | if ($in_file_pound_if | |
161 | && $in_file_pound_if !~ /$new_pound_if/) | |
162 | { | |
163 | end_file_pound_if; | |
164 | } | |
165 | ||
166 | # Enter new #if, if not already in it. | |
167 | if (! $in_file_pound_if) { | |
168 | $in_file_pound_if = "defined($new_pound_if)"; | |
169 | print $out_fh "\n#if $in_file_pound_if\n"; | |
43b443dd KW |
170 | } |
171 | } | |
172 | ||
0c4ecf42 | 173 | sub output_invlist ($$;$) { |
9d9177be KW |
174 | my $name = shift; |
175 | my $invlist = shift; # Reference to inversion list array | |
0c4ecf42 | 176 | my $charset = shift // ""; # name of character set for comment |
9d9177be | 177 | |
76d3994c KW |
178 | die "No inversion list for $name" unless defined $invlist |
179 | && ref $invlist eq 'ARRAY' | |
180 | && @$invlist; | |
181 | ||
9d9177be KW |
182 | # Output the inversion list $invlist using the name $name for it. |
183 | # It is output in the exact internal form for inversion lists. | |
184 | ||
a0316a6c KW |
185 | # Is the last element of the header 0, or 1 ? |
186 | my $zero_or_one = 0; | |
a0316a6c KW |
187 | if ($invlist->[0] != 0) { |
188 | unshift @$invlist, 0; | |
9d9177be KW |
189 | $zero_or_one = 1; |
190 | } | |
0a07b44b | 191 | my $count = @$invlist; |
9d9177be | 192 | |
bffc0129 | 193 | switch_pound_if ($name, 'PERL_IN_PERL_C'); |
43b443dd | 194 | |
0c4ecf42 KW |
195 | print $out_fh "\nstatic const UV ${name}_invlist[] = {"; |
196 | print $out_fh " /* for $charset */" if $charset; | |
197 | print $out_fh "\n"; | |
9d9177be | 198 | |
a0316a6c | 199 | print $out_fh "\t$count,\t/* Number of elements */\n"; |
9d9177be KW |
200 | print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; |
201 | print $out_fh "\t", $zero_or_one, | |
a0316a6c KW |
202 | ",\t/* 0 if the list starts at 0;", |
203 | "\n\t\t 1 if it starts at the element beyond 0 */\n"; | |
9d9177be KW |
204 | |
205 | # The main body are the UVs passed in to this routine. Do the final | |
206 | # element separately | |
47d53124 KW |
207 | for my $i (0 .. @$invlist - 1) { |
208 | printf $out_fh "\t0x%X", $invlist->[$i]; | |
209 | print $out_fh "," if $i < @$invlist - 1; | |
210 | print $out_fh "\n"; | |
9d9177be KW |
211 | } |
212 | ||
9d9177be KW |
213 | print $out_fh "};\n"; |
214 | } | |
215 | ||
99f21fb9 KW |
216 | sub output_invmap ($$$$$$$) { |
217 | my $name = shift; | |
218 | my $invmap = shift; # Reference to inversion map array | |
219 | my $prop_name = shift; | |
220 | my $input_format = shift; # The inversion map's format | |
221 | my $default = shift; # The property value for code points who | |
222 | # otherwise don't have a value specified. | |
223 | my $extra_enums = shift; # comma-separated list of our additions to the | |
224 | # property's standard possible values | |
225 | my $charset = shift // ""; # name of character set for comment | |
226 | ||
227 | # Output the inversion map $invmap for property $prop_name, but use $name | |
228 | # as the actual data structure's name. | |
229 | ||
230 | my $count = @$invmap; | |
231 | ||
232 | my $output_format; | |
233 | my $declaration_type; | |
234 | my %enums; | |
235 | my $name_prefix; | |
236 | ||
237 | if ($input_format eq 's') { | |
238 | $prop_name = (prop_aliases($prop_name))[1]; # Get full name | |
f79a09fc | 239 | my $short_name = (prop_aliases($prop_name))[0]; |
99f21fb9 KW |
240 | my @enums = prop_values($prop_name); |
241 | if (! @enums) { | |
242 | die "Only enum properties are currently handled; '$prop_name' isn't one"; | |
243 | } | |
244 | else { | |
245 | ||
f79a09fc | 246 | # Convert short names to long |
99f21fb9 | 247 | @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums; |
f79a09fc KW |
248 | |
249 | my @expected_enums = @{$hard_coded_enums{lc $short_name}}; | |
250 | die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version' | |
251 | if @expected_enums < @enums; | |
252 | ||
253 | # Remove the enums found in the input from the ones we expect | |
254 | for (my $i = @expected_enums - 1; $i >= 0; $i--) { | |
255 | splice(@expected_enums, $i, 1) | |
256 | if grep { $expected_enums[$i] eq $_ } @enums; | |
257 | } | |
258 | ||
259 | # The ones remaining must be because we're using an older | |
260 | # Unicode version. Add them to the list. | |
261 | push @enums, @expected_enums; | |
262 | ||
263 | # Add in the extra values coded into this program, and sort. | |
99f21fb9 KW |
264 | push @enums, split /,/, $extra_enums if $extra_enums ne ""; |
265 | @enums = sort @enums; | |
266 | ||
267 | # Assign a value to each element of the enum. The default | |
268 | # value always gets 0; the others are arbitrarily assigned. | |
269 | my $enum_val = 0; | |
270 | $default = prop_value_aliases($prop_name, $default); | |
271 | $enums{$default} = $enum_val++; | |
272 | for my $enum (@enums) { | |
273 | $enums{$enum} = $enum_val++ unless exists $enums{$enum}; | |
274 | } | |
275 | } | |
276 | ||
bffc0129 KW |
277 | # Inversion map stuff is currently used only by regexec |
278 | switch_pound_if($name, 'PERL_IN_REGEXEC_C'); | |
99f21fb9 KW |
279 | { |
280 | ||
99f21fb9 KW |
281 | # The short names tend to be two lower case letters, but it looks |
282 | # better for those if they are upper. XXX | |
283 | $short_name = uc($short_name) if length($short_name) < 3 | |
284 | || substr($short_name, 0, 1) =~ /[[:lower:]]/; | |
85e5f08b | 285 | $name_prefix = "${short_name}_"; |
99f21fb9 KW |
286 | my $enum_count = keys %enums; |
287 | print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n"; | |
288 | ||
289 | print $out_fh "\ntypedef enum {\n"; | |
290 | print $out_fh "\t${name_prefix}$default = $enums{$default},\n"; | |
291 | delete $enums{$default}; | |
292 | foreach my $enum (sort { $a cmp $b } keys %enums) { | |
293 | print $out_fh "\t${name_prefix}$enum = $enums{$enum}"; | |
294 | print $out_fh "," if $enums{$enum} < $enum_count - 1; | |
295 | print $out_fh "\n"; | |
296 | } | |
297 | $declaration_type = "${name_prefix}enum"; | |
298 | print $out_fh "} $declaration_type;\n"; | |
299 | ||
300 | $output_format = "${name_prefix}%s"; | |
301 | } | |
302 | } | |
303 | else { | |
304 | die "'$input_format' invmap() format for '$prop_name' unimplemented"; | |
305 | } | |
306 | ||
307 | die "No inversion map for $prop_name" unless defined $invmap | |
308 | && ref $invmap eq 'ARRAY' | |
309 | && $count; | |
310 | ||
311 | print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {"; | |
312 | print $out_fh " /* for $charset */" if $charset; | |
313 | print $out_fh "\n"; | |
314 | ||
315 | # The main body are the scalars passed in to this routine. | |
316 | for my $i (0 .. $count - 1) { | |
317 | my $element = $invmap->[$i]; | |
318 | $element = $name_prefix . prop_value_aliases($prop_name, $element); | |
319 | print $out_fh "\t$element"; | |
320 | print $out_fh "," if $i < $count - 1; | |
321 | print $out_fh "\n"; | |
322 | } | |
323 | print $out_fh "};\n"; | |
99f21fb9 KW |
324 | } |
325 | ||
5a7e5385 | 326 | sub mk_invlist_from_sorted_cp_list { |
a02047bf KW |
327 | |
328 | # Returns an inversion list constructed from the sorted input array of | |
329 | # code points | |
330 | ||
331 | my $list_ref = shift; | |
332 | ||
99f21fb9 KW |
333 | return unless @$list_ref; |
334 | ||
a02047bf KW |
335 | # Initialize to just the first element |
336 | my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); | |
337 | ||
338 | # For each succeeding element, if it extends the previous range, adjust | |
339 | # up, otherwise add it. | |
340 | for my $i (1 .. @$list_ref - 1) { | |
341 | if ($invlist[-1] == $list_ref->[$i]) { | |
342 | $invlist[-1]++; | |
343 | } | |
344 | else { | |
345 | push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; | |
346 | } | |
347 | } | |
348 | return @invlist; | |
349 | } | |
350 | ||
351 | # Read in the Case Folding rules, and construct arrays of code points for the | |
352 | # properties we need. | |
353 | my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); | |
354 | die "Could not find inversion map for Case_Folding" unless defined $format; | |
355 | die "Incorrect format '$format' for Case_Folding inversion map" | |
356 | unless $format eq 'al'; | |
357 | my @has_multi_char_fold; | |
358 | my @is_non_final_fold; | |
359 | ||
360 | for my $i (0 .. @$folds_ref - 1) { | |
361 | next unless ref $folds_ref->[$i]; # Skip single-char folds | |
362 | push @has_multi_char_fold, $cp_ref->[$i]; | |
363 | ||
b6a6e956 | 364 | # Add to the non-finals list each code point that is in a non-final |
a02047bf KW |
365 | # position |
366 | for my $j (0 .. @{$folds_ref->[$i]} - 2) { | |
367 | push @is_non_final_fold, $folds_ref->[$i][$j] | |
368 | unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold; | |
369 | } | |
370 | } | |
371 | ||
a02047bf KW |
372 | sub _Perl_Non_Final_Folds { |
373 | @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; | |
5a7e5385 | 374 | return mk_invlist_from_sorted_cp_list(\@is_non_final_fold); |
a02047bf KW |
375 | } |
376 | ||
99f21fb9 KW |
377 | sub prop_name_for_cmp ($) { # Sort helper |
378 | my $name = shift; | |
379 | ||
380 | # Returns the input lowercased, with non-alphas removed, as well as | |
381 | # everything starting with a comma | |
382 | ||
383 | $name =~ s/,.*//; | |
384 | $name =~ s/[[:^alpha:]]//g; | |
385 | return lc $name; | |
386 | } | |
387 | ||
892d8259 | 388 | sub UpperLatin1 { |
5a7e5385 | 389 | return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); |
892d8259 KW |
390 | } |
391 | ||
9d9177be KW |
392 | output_invlist("Latin1", [ 0, 256 ]); |
393 | output_invlist("AboveLatin1", [ 256 ]); | |
394 | ||
bffc0129 | 395 | end_file_pound_if; |
43b443dd | 396 | |
3f427fd9 KW |
397 | # We construct lists for all the POSIX and backslash sequence character |
398 | # classes in two forms: | |
399 | # 1) ones which match only in the ASCII range | |
400 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
401 | # | |
402 | # These get compiled in, and hence affect the memory footprint of every Perl | |
403 | # program, even those not using Unicode. To minimize the size, currently | |
404 | # the Latin1 version is generated for the beyond ASCII range except for those | |
405 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
406 | # UVs long plus 4 UVs (currently) for the header. | |
407 | # | |
408 | # To save even more memory, the ASCII versions could be derived from the | |
409 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
410 | # instructions to do so), but these are all small anyway, so their total is | |
411 | # about 100 UVs. | |
412 | # | |
413 | # In the list of properties below that get generated, the L1 prefix is a fake | |
414 | # property that means just the Latin1 range of the full property (whose name | |
415 | # has an X prefix instead of L1). | |
a02047bf KW |
416 | # |
417 | # An initial & means to use the subroutine from this file instead of an | |
418 | # official inversion list. | |
3f427fd9 | 419 | |
0c4ecf42 KW |
420 | for my $charset (get_supported_code_pages()) { |
421 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
422 | ||
99f21fb9 KW |
423 | @a2n = @{get_a2n($charset)}; |
424 | no warnings 'qw'; | |
425 | # Ignore non-alpha in sort | |
426 | for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
1c8c3428 KW |
427 | ASCII |
428 | Cased | |
429 | VertSpace | |
430 | XPerlSpace | |
431 | XPosixAlnum | |
432 | XPosixAlpha | |
433 | XPosixBlank | |
434 | XPosixCntrl | |
435 | XPosixDigit | |
436 | XPosixGraph | |
437 | XPosixLower | |
438 | XPosixPrint | |
439 | XPosixPunct | |
440 | XPosixSpace | |
441 | XPosixUpper | |
442 | XPosixWord | |
443 | XPosixXDigit | |
444 | _Perl_Any_Folds | |
445 | &NonL1_Perl_Non_Final_Folds | |
446 | _Perl_Folds_To_Multi_Char | |
447 | &UpperLatin1 | |
448 | _Perl_IDStart | |
449 | _Perl_IDCont | |
0e0b9356 | 450 | Grapheme_Cluster_Break,EDGE |
53255578 | 451 | Word_Break,EDGE,UNKNOWN |
d3e558c4 | 452 | Sentence_Break,EDGE |
1c8c3428 | 453 | ) |
0f5e3c71 KW |
454 | ) { |
455 | ||
456 | # For the Latin1 properties, we change to use the eXtended version of the | |
457 | # base property, then go through the result and get rid of everything not | |
458 | # in Latin1 (above 255). Actually, we retain the element for the range | |
459 | # that crosses the 255/256 boundary if it is one that matches the | |
460 | # property. For example, in the Word property, there is a range of code | |
461 | # points that start at U+00F8 and goes through U+02C1. Instead of | |
462 | # artificially cutting that off at 256 because 256 is the first code point | |
463 | # above Latin1, we let the range go to its natural ending. That gives us | |
464 | # extra information with no added space taken. But if the range that | |
465 | # crosses the boundary is one that doesn't match the property, we don't | |
466 | # start a new range above 255, as that could be construed as going to | |
467 | # infinity. For example, the Upper property doesn't include the character | |
468 | # at 255, but does include the one at 256. We don't include the 256 one. | |
469 | my $prop_name = $prop; | |
470 | my $is_local_sub = $prop_name =~ s/^&//; | |
99f21fb9 KW |
471 | my $extra_enums = ""; |
472 | $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
0f5e3c71 KW |
473 | my $lookup_prop = $prop_name; |
474 | my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ | |
475 | or $lookup_prop =~ s/^L1//); | |
476 | my $nonl1_only = 0; | |
477 | $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
99f21fb9 | 478 | ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; |
0f5e3c71 KW |
479 | |
480 | my @invlist; | |
99f21fb9 KW |
481 | my @invmap; |
482 | my $map_format; | |
483 | my $map_default; | |
484 | my $maps_to_code_point; | |
485 | my $to_adjust; | |
0f5e3c71 KW |
486 | if ($is_local_sub) { |
487 | @invlist = eval $lookup_prop; | |
488 | } | |
489 | else { | |
490 | @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
99f21fb9 KW |
491 | if (! @invlist) { |
492 | my ($list_ref, $map_ref, $format, $default); | |
493 | ||
494 | ($list_ref, $map_ref, $format, $default) | |
495 | = prop_invmap($lookup_prop, '_perl_core_internal_ok'); | |
496 | die "Could not find inversion list for '$lookup_prop'" unless $list_ref; | |
497 | @invlist = @$list_ref; | |
498 | @invmap = @$map_ref; | |
499 | $map_format = $format; | |
500 | $map_default = $default; | |
501 | $maps_to_code_point = $map_format =~ /x/; | |
502 | $to_adjust = $map_format =~ /a/; | |
503 | } | |
0f5e3c71 KW |
504 | } |
505 | die "Could not find inversion list for '$lookup_prop'" unless @invlist; | |
ceb1de32 | 506 | |
99f21fb9 KW |
507 | # Re-order the Unicode code points to native ones for this platform. |
508 | # This is only needed for code points below 256, because native code | |
509 | # points are only in that range. For inversion maps of properties | |
510 | # where the mappings are adjusted (format =~ /a/), this reordering | |
511 | # could mess up the adjustment pattern that was in the input, so that | |
512 | # has to be dealt with. | |
513 | # | |
514 | # And inversion maps that map to code points need to eventually have | |
515 | # all those code points remapped to native, and it's better to do that | |
516 | # here, going through the whole list not just those below 256. This | |
517 | # is because some inversion maps have adjustments (format =~ /a/) | |
518 | # which may be affected by the reordering. This code needs to be done | |
519 | # both for when we are translating the inversion lists for < 256, and | |
520 | # for the inversion maps for everything. By doing both in this loop, | |
521 | # we can share that code. | |
522 | # | |
523 | # So, we go through everything for an inversion map to code points; | |
524 | # otherwise, we can skip any remapping at all if we are going to | |
525 | # output only the above-Latin1 values, or if the range spans the whole | |
526 | # of 0..256, as the remap will also include all of 0..256 (256 not | |
527 | # 255 because a re-ordering could cause 256 to need to be in the same | |
528 | # range as 255.) | |
529 | if ((@invmap && $maps_to_code_point) | |
530 | || (! $nonl1_only || ($invlist[0] < 256 | |
531 | && ! ($invlist[0] == 0 && $invlist[1] > 256)))) | |
ceb1de32 | 532 | { |
fb4554ea | 533 | |
99f21fb9 | 534 | if (! @invmap) { # Straight inversion list |
fb4554ea KW |
535 | # Look at all the ranges that start before 257. |
536 | my @latin1_list; | |
537 | while (@invlist) { | |
538 | last if $invlist[0] > 256; | |
539 | my $upper = @invlist > 1 | |
540 | ? $invlist[1] - 1 # In range | |
8a6c81cf KW |
541 | |
542 | # To infinity. You may want to stop much much | |
543 | # earlier; going this high may expose perl | |
544 | # deficiencies with very large numbers. | |
545 | : $Unicode::UCD::MAX_CP; | |
fb4554ea | 546 | for my $j ($invlist[0] .. $upper) { |
99f21fb9 | 547 | push @latin1_list, a2n($j); |
0f5e3c71 | 548 | } |
fb4554ea KW |
549 | |
550 | shift @invlist; # Shift off the range that's in the list | |
551 | shift @invlist; # Shift off the range not in the list | |
0c4ecf42 | 552 | } |
fb4554ea KW |
553 | |
554 | # Here @invlist contains all the ranges in the original that start | |
555 | # at code points above 256, and @latin1_list contains all the | |
556 | # native code points for ranges that start with a Unicode code | |
557 | # point below 257. We sort the latter and convert it to inversion | |
558 | # list format. Then simply prepend it to the list of the higher | |
559 | # code points. | |
560 | @latin1_list = sort { $a <=> $b } @latin1_list; | |
5a7e5385 | 561 | @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); |
fb4554ea | 562 | unshift @invlist, @latin1_list; |
99f21fb9 KW |
563 | } |
564 | else { # Is an inversion map | |
565 | ||
566 | # This is a similar procedure as plain inversion list, but has | |
567 | # multiple buckets. A plain inversion list just has two | |
568 | # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
569 | # pretty much can ignore the 2nd bucket, as it is completely | |
570 | # defined by the 1st. But here, what we do is create buckets | |
571 | # which contain the code points that map to each, translated | |
572 | # to native and turned into an inversion list. Thus each | |
573 | # bucket is an inversion list of native code points that map | |
574 | # to it or don't map to it. We use these to create an | |
575 | # inversion map for the whole property. | |
576 | ||
577 | # As mentioned earlier, we use this procedure to not just | |
578 | # remap the inversion list to native values, but also the maps | |
579 | # of code points to native ones. In the latter case we have | |
580 | # to look at the whole of the inversion map (or at least to | |
581 | # above Unicode; as the maps of code points above that should | |
582 | # all be to the default). | |
583 | my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256; | |
584 | ||
585 | my %mapped_lists; # A hash whose keys are the buckets. | |
586 | while (@invlist) { | |
587 | last if $invlist[0] > $upper_limit; | |
588 | ||
589 | # This shouldn't actually happen, as prop_invmap() returns | |
590 | # an extra element at the end that is beyond $upper_limit | |
591 | die "inversion map that extends to infinity is unimplemented" unless @invlist > 1; | |
592 | ||
593 | my $bucket; | |
594 | ||
595 | # A hash key can't be a ref (we are only expecting arrays | |
596 | # of scalars here), so convert any such to a string that | |
597 | # will be converted back later (using a vertical tab as | |
598 | # the separator). Even if the mapping is to code points, | |
599 | # we don't translate to native here because the code | |
600 | # output_map() calls to output these arrays assumes the | |
601 | # input is Unicode, not native. | |
602 | if (ref $invmap[0]) { | |
603 | $bucket = join "\cK", @{$invmap[0]}; | |
604 | } | |
605 | elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) { | |
606 | ||
607 | # Do convert to native for maps to single code points. | |
608 | # There are some properties that have a few outlier | |
609 | # maps that aren't code points, so the above test | |
610 | # skips those. | |
611 | $bucket = a2n($invmap[0]); | |
612 | } else { | |
613 | $bucket = $invmap[0]; | |
614 | } | |
615 | ||
616 | # We now have the bucket that all code points in the range | |
617 | # map to, though possibly they need to be adjusted. Go | |
618 | # through the range and put each translated code point in | |
619 | # it into its bucket. | |
620 | my $base_map = $invmap[0]; | |
621 | for my $j ($invlist[0] .. $invlist[1] - 1) { | |
622 | if ($to_adjust | |
623 | # The 1st code point doesn't need adjusting | |
624 | && $j > $invlist[0] | |
625 | ||
626 | # Skip any non-numeric maps: these are outliers | |
627 | # that aren't code points. | |
628 | && $base_map =~ $numeric_re | |
629 | ||
630 | # 'ne' because the default can be a string | |
631 | && $base_map ne $map_default) | |
632 | { | |
633 | # We adjust, by incrementing each the bucket and | |
634 | # the map. For code point maps, translate to | |
635 | # native | |
636 | $base_map++; | |
637 | $bucket = ($maps_to_code_point) | |
638 | ? a2n($base_map) | |
639 | : $base_map; | |
640 | } | |
641 | ||
642 | # Add the native code point to the bucket for the | |
643 | # current map | |
644 | push @{$mapped_lists{$bucket}}, a2n($j); | |
645 | } # End of loop through all code points in the range | |
646 | ||
647 | # Get ready for the next range | |
648 | shift @invlist; | |
649 | shift @invmap; | |
650 | } # End of loop through all ranges in the map. | |
651 | ||
652 | # Here, @invlist and @invmap retain all the ranges from the | |
653 | # originals that start with code points above $upper_limit. | |
654 | # Each bucket in %mapped_lists contains all the code points | |
655 | # that map to that bucket. If the bucket is for a map to a | |
656 | # single code point is a single code point, the bucket has | |
657 | # been converted to native. If something else (including | |
658 | # multiple code points), no conversion is done. | |
659 | # | |
660 | # Now we recreate the inversion map into %xlated, but this | |
661 | # time for the native character set. | |
662 | my %xlated; | |
663 | foreach my $bucket (keys %mapped_lists) { | |
664 | ||
665 | # Sort and convert this bucket to an inversion list. The | |
666 | # result will be that ranges that start with even-numbered | |
667 | # indexes will be for code points that map to this bucket; | |
668 | # odd ones map to some other bucket, and are discarded | |
669 | # below. | |
670 | @{$mapped_lists{$bucket}} | |
671 | = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
672 | @{$mapped_lists{$bucket}} | |
673 | = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); | |
674 | ||
675 | # Add each even-numbered range in the bucket to %xlated; | |
676 | # so that the keys of %xlated become the range start code | |
677 | # points, and the values are their corresponding maps. | |
678 | while (@{$mapped_lists{$bucket}}) { | |
679 | my $range_start = $mapped_lists{$bucket}->[0]; | |
680 | if ($bucket =~ /\cK/) { | |
681 | @{$xlated{$range_start}} = split /\cK/, $bucket; | |
682 | } | |
683 | else { | |
684 | $xlated{$range_start} = $bucket; | |
685 | } | |
686 | shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
687 | shift @{$mapped_lists{$bucket}}; # Get ready for next | |
688 | # iteration | |
689 | } | |
690 | } # End of loop through all the buckets. | |
691 | ||
692 | # Here %xlated's keys are the range starts of all the code | |
693 | # points in the inversion map. Construct an inversion list | |
694 | # from them. | |
695 | my @new_invlist = sort { $a <=> $b } keys %xlated; | |
696 | ||
697 | # If the list is adjusted, we want to munge this list so that | |
698 | # we only have one entry for where consecutive code points map | |
699 | # to consecutive values. We just skip the subsequent entries | |
700 | # where this is the case. | |
701 | if ($to_adjust) { | |
702 | my @temp; | |
703 | for my $i (0 .. @new_invlist - 1) { | |
704 | next if $i > 0 | |
705 | && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
706 | && $xlated{$new_invlist[$i-1]} =~ $numeric_re | |
707 | && $xlated{$new_invlist[$i]} =~ $numeric_re | |
708 | && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]}; | |
709 | push @temp, $new_invlist[$i]; | |
710 | } | |
711 | @new_invlist = @temp; | |
712 | } | |
713 | ||
714 | # The inversion map comes from %xlated's values. We can | |
715 | # unshift each onto the front of the untouched portion, in | |
716 | # reverse order of the portion we did process. | |
717 | foreach my $start (reverse @new_invlist) { | |
718 | unshift @invmap, $xlated{$start}; | |
719 | } | |
720 | ||
721 | # Finally prepend the inversion list we have just constructed to the | |
722 | # one that contains anything we didn't process. | |
723 | unshift @invlist, @new_invlist; | |
724 | } | |
725 | } | |
726 | ||
727 | # prop_invmap() returns an extra final entry, which we can now | |
728 | # discard. | |
729 | if (@invmap) { | |
730 | pop @invlist; | |
731 | pop @invmap; | |
ceb1de32 | 732 | } |
0f5e3c71 KW |
733 | |
734 | if ($l1_only) { | |
99f21fb9 | 735 | die "Unimplemented to do a Latin-1 only inversion map" if @invmap; |
0f5e3c71 KW |
736 | for my $i (0 .. @invlist - 1 - 1) { |
737 | if ($invlist[$i] > 255) { | |
738 | ||
739 | # In an inversion list, even-numbered elements give the code | |
740 | # points that begin ranges that match the property; | |
741 | # odd-numbered give ones that begin ranges that don't match. | |
742 | # If $i is odd, we are at the first code point above 255 that | |
743 | # doesn't match, which means the range it is ending does | |
744 | # match, and crosses the 255/256 boundary. We want to include | |
745 | # this ending point, so increment $i, so the splice below | |
746 | # includes it. Conversely, if $i is even, it is the first | |
747 | # code point above 255 that matches, which means there was no | |
748 | # matching range that crossed the boundary, and we don't want | |
749 | # to include this code point, so splice before it. | |
750 | $i++ if $i % 2 != 0; | |
751 | ||
752 | # Remove everything past this. | |
753 | splice @invlist, $i; | |
99f21fb9 | 754 | splice @invmap, $i if @invmap; |
0f5e3c71 KW |
755 | last; |
756 | } | |
0c4ecf42 KW |
757 | } |
758 | } | |
0f5e3c71 KW |
759 | elsif ($nonl1_only) { |
760 | my $found_nonl1 = 0; | |
761 | for my $i (0 .. @invlist - 1 - 1) { | |
762 | next if $invlist[$i] < 256; | |
763 | ||
764 | # Here, we have the first element in the array that indicates an | |
765 | # element above Latin1. Get rid of all previous ones. | |
766 | splice @invlist, 0, $i; | |
99f21fb9 | 767 | splice @invmap, 0, $i if @invmap; |
0f5e3c71 KW |
768 | |
769 | # If this one's index is not divisible by 2, it means that this | |
770 | # element is inverting away from being in the list, which means | |
99f21fb9 KW |
771 | # all code points from 256 to this one are in this list (or |
772 | # map to the default for inversion maps) | |
773 | if ($i % 2 != 0) { | |
774 | unshift @invlist, 256; | |
775 | unshift @invmap, $map_default if @invmap; | |
776 | } | |
0f5e3c71 | 777 | $found_nonl1 = 1; |
3f427fd9 KW |
778 | last; |
779 | } | |
0f5e3c71 | 780 | die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; |
3f427fd9 | 781 | } |
3f427fd9 | 782 | |
0f5e3c71 | 783 | output_invlist($prop_name, \@invlist, $charset); |
99f21fb9 | 784 | output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap; |
0f5e3c71 | 785 | } |
bffc0129 | 786 | end_file_pound_if; |
0c4ecf42 | 787 | print $out_fh "\n" . get_conditional_compile_line_end(); |
9d9177be KW |
788 | } |
789 | ||
2308ab83 | 790 | my $sources_list = "lib/unicore/mktables.lst"; |
216b41c2 KW |
791 | my @sources = ($0, qw(lib/unicore/mktables |
792 | lib/Unicode/UCD.pm | |
793 | regen/charset_translations.pl | |
794 | )); | |
9a3da3ad FC |
795 | { |
796 | # Depend on mktables’ own sources. It’s a shorter list of files than | |
797 | # those that Unicode::UCD uses. | |
2308ab83 KW |
798 | if (! open my $mktables_list, $sources_list) { |
799 | ||
800 | # This should force a rebuild once $sources_list exists | |
801 | push @sources, $sources_list; | |
802 | } | |
803 | else { | |
804 | while(<$mktables_list>) { | |
805 | last if /===/; | |
806 | chomp; | |
807 | push @sources, "lib/unicore/$_" if /^[^#]/; | |
808 | } | |
9a3da3ad FC |
809 | } |
810 | } | |
811 | read_only_bottom_close_and_rename($out_fh, \@sources) |