Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
10 | ); | |
9d9177be | 11 | require 'regen/regen_lib.pl'; |
0c4ecf42 | 12 | require 'regen/charset_translations.pl'; |
9d9177be KW |
13 | |
14 | # This program outputs charclass_invlists.h, which contains various inversion | |
15 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 | # light-weight fast wrapper to make them usable at run-time. | |
18 | ||
19 | # As such, this code knows about the internal structure of these lists, and | |
20 | # any change made to that has to be done here as well. A random number stored | |
21 | # in the headers is used to minimize the possibility of things getting | |
22 | # out-of-sync, or the wrong data structure being passed. Currently that | |
23 | # random number is: | |
99f21fb9 KW |
24 | |
25 | # charclass_invlists.h now also has a partial implementation of inversion | |
26 | # maps; enough to generate tables for the line break properties, such as GCB | |
27 | ||
0a07b44b | 28 | my $VERSION_DATA_STRUCTURE_TYPE = 148565664; |
9d9177be | 29 | |
99f21fb9 KW |
30 | # integer or float |
31 | my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax; | |
32 | ||
33 | # Matches valid C language enum names: begins with ASCII alphabetic, then any | |
34 | # ASCII \w | |
35 | my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; | |
36 | ||
9d9177be KW |
37 | my $out_fh = open_new('charclass_invlists.h', '>', |
38 | {style => '*', by => $0, | |
39 | from => "Unicode::UCD"}); | |
40 | ||
bffc0129 | 41 | my $in_file_pound_if = 0; |
43b443dd | 42 | |
9d9177be KW |
43 | print $out_fh "/* See the generating file for comments */\n\n"; |
44 | ||
bffc0129 KW |
45 | # The symbols generated by this program are all currently defined only in a |
46 | # single dot c each. The code knows where most of them go, but this hash | |
47 | # gives overrides for the exceptions to the typical place | |
48 | my %exceptions_to_where_to_define = | |
49 | ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C', | |
50 | AboveLatin1 => 'PERL_IN_REGCOMP_C', | |
51 | Latin1 => 'PERL_IN_REGCOMP_C', | |
52 | UpperLatin1 => 'PERL_IN_REGCOMP_C', | |
53 | _Perl_Any_Folds => 'PERL_IN_REGCOMP_C', | |
54 | _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C', | |
55 | _Perl_IDCont => 'PERL_IN_UTF8_C', | |
56 | _Perl_IDStart => 'PERL_IN_UTF8_C', | |
57 | ); | |
015bb97c | 58 | |
f79a09fc KW |
59 | # This hash contains the properties with enums that have hard-coded references |
60 | # to them in C code. Its only use is to make sure that if perl is compiled | |
61 | # with an older Unicode data set, that all the enum values the code is | |
62 | # expecting will still be in the enum typedef. Thus the code doesn't have to | |
63 | # change. The Unicode version won't have any code points that have these enum | |
64 | # values, so the code that handles them will not get exercised. This is far | |
65 | # better than having to #ifdef things. | |
66 | my %hard_coded_enums = | |
67 | ( gcb => [ | |
68 | 'Control', | |
69 | 'CR', | |
70 | 'Extend', | |
71 | 'L', | |
72 | 'LF', | |
73 | 'LV', | |
74 | 'LVT', | |
75 | 'Other', | |
76 | 'Prepend', | |
77 | 'Regional_Indicator', | |
78 | 'SpacingMark', | |
79 | 'T', | |
80 | 'V', | |
81 | ], | |
82 | sb => [ | |
83 | 'ATerm', | |
84 | 'Close', | |
85 | 'CR', | |
86 | 'Extend', | |
87 | 'Format', | |
88 | 'LF', | |
89 | 'Lower', | |
90 | 'Numeric', | |
91 | 'OLetter', | |
92 | 'Other', | |
93 | 'SContinue', | |
94 | 'Sep', | |
95 | 'Sp', | |
96 | 'STerm', | |
97 | 'Upper', | |
98 | ], | |
99 | wb => [ | |
100 | 'ALetter', | |
101 | 'CR', | |
102 | 'Double_Quote', | |
103 | 'Extend', | |
104 | 'ExtendNumLet', | |
105 | 'Format', | |
106 | 'Hebrew_Letter', | |
107 | 'Katakana', | |
108 | 'LF', | |
109 | 'MidLetter', | |
110 | 'MidNum', | |
111 | 'MidNumLet', | |
112 | 'Newline', | |
113 | 'Numeric', | |
114 | 'Other', | |
115 | 'Regional_Indicator', | |
116 | 'Single_Quote', | |
117 | 'UNKNOWN', | |
118 | ], | |
119 | ); | |
120 | ||
99f21fb9 KW |
121 | my @a2n; |
122 | ||
123 | sub uniques { | |
124 | # Returns non-duplicated input values. From "Perl Best Practices: | |
125 | # Encapsulated Cleverness". p. 455 in first edition. | |
126 | ||
127 | my %seen; | |
128 | return grep { ! $seen{$_}++ } @_; | |
129 | } | |
130 | ||
131 | sub a2n($) { | |
132 | my $cp = shift; | |
133 | ||
134 | # Returns the input Unicode code point translated to native. | |
135 | ||
136 | return $cp if $cp !~ $numeric_re || $cp > 255; | |
137 | return $a2n[$cp]; | |
138 | } | |
139 | ||
bffc0129 KW |
140 | sub end_file_pound_if { |
141 | if ($in_file_pound_if) { | |
142 | print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; | |
143 | $in_file_pound_if = 0; | |
144 | } | |
145 | } | |
146 | ||
147 | sub switch_pound_if ($$) { | |
148 | my $name = shift; | |
149 | my $new_pound_if = shift; | |
150 | ||
151 | # Switch to new #if given by the 2nd argument. If there is an override | |
152 | # for this, it instead switches to that. The 1st argument is the | |
153 | # static's name, used to look up the overrides | |
154 | ||
155 | if (exists $exceptions_to_where_to_define{$name}) { | |
156 | $new_pound_if = $exceptions_to_where_to_define{$name}; | |
157 | } | |
158 | ||
159 | # Exit current #if if the new one is different from the old | |
160 | if ($in_file_pound_if | |
161 | && $in_file_pound_if !~ /$new_pound_if/) | |
162 | { | |
163 | end_file_pound_if; | |
164 | } | |
165 | ||
166 | # Enter new #if, if not already in it. | |
167 | if (! $in_file_pound_if) { | |
168 | $in_file_pound_if = "defined($new_pound_if)"; | |
169 | print $out_fh "\n#if $in_file_pound_if\n"; | |
43b443dd KW |
170 | } |
171 | } | |
172 | ||
0c4ecf42 | 173 | sub output_invlist ($$;$) { |
9d9177be KW |
174 | my $name = shift; |
175 | my $invlist = shift; # Reference to inversion list array | |
0c4ecf42 | 176 | my $charset = shift // ""; # name of character set for comment |
9d9177be | 177 | |
76d3994c | 178 | die "No inversion list for $name" unless defined $invlist |
ad85f59a | 179 | && ref $invlist eq 'ARRAY'; |
76d3994c | 180 | |
9d9177be KW |
181 | # Output the inversion list $invlist using the name $name for it. |
182 | # It is output in the exact internal form for inversion lists. | |
183 | ||
a0316a6c KW |
184 | # Is the last element of the header 0, or 1 ? |
185 | my $zero_or_one = 0; | |
ad85f59a | 186 | if (@$invlist && $invlist->[0] != 0) { |
a0316a6c | 187 | unshift @$invlist, 0; |
9d9177be KW |
188 | $zero_or_one = 1; |
189 | } | |
0a07b44b | 190 | my $count = @$invlist; |
9d9177be | 191 | |
bffc0129 | 192 | switch_pound_if ($name, 'PERL_IN_PERL_C'); |
43b443dd | 193 | |
0c4ecf42 KW |
194 | print $out_fh "\nstatic const UV ${name}_invlist[] = {"; |
195 | print $out_fh " /* for $charset */" if $charset; | |
196 | print $out_fh "\n"; | |
9d9177be | 197 | |
a0316a6c | 198 | print $out_fh "\t$count,\t/* Number of elements */\n"; |
9d9177be KW |
199 | print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; |
200 | print $out_fh "\t", $zero_or_one, | |
a0316a6c KW |
201 | ",\t/* 0 if the list starts at 0;", |
202 | "\n\t\t 1 if it starts at the element beyond 0 */\n"; | |
9d9177be KW |
203 | |
204 | # The main body are the UVs passed in to this routine. Do the final | |
205 | # element separately | |
47d53124 KW |
206 | for my $i (0 .. @$invlist - 1) { |
207 | printf $out_fh "\t0x%X", $invlist->[$i]; | |
208 | print $out_fh "," if $i < @$invlist - 1; | |
209 | print $out_fh "\n"; | |
9d9177be KW |
210 | } |
211 | ||
9d9177be KW |
212 | print $out_fh "};\n"; |
213 | } | |
214 | ||
99f21fb9 KW |
215 | sub output_invmap ($$$$$$$) { |
216 | my $name = shift; | |
217 | my $invmap = shift; # Reference to inversion map array | |
218 | my $prop_name = shift; | |
219 | my $input_format = shift; # The inversion map's format | |
220 | my $default = shift; # The property value for code points who | |
221 | # otherwise don't have a value specified. | |
222 | my $extra_enums = shift; # comma-separated list of our additions to the | |
223 | # property's standard possible values | |
224 | my $charset = shift // ""; # name of character set for comment | |
225 | ||
226 | # Output the inversion map $invmap for property $prop_name, but use $name | |
227 | # as the actual data structure's name. | |
228 | ||
229 | my $count = @$invmap; | |
230 | ||
231 | my $output_format; | |
232 | my $declaration_type; | |
233 | my %enums; | |
234 | my $name_prefix; | |
235 | ||
236 | if ($input_format eq 's') { | |
02f811dd KW |
237 | $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name |
238 | my $short_name = (prop_aliases($prop_name))[0] // $prop_name; | |
99f21fb9 KW |
239 | my @enums = prop_values($prop_name); |
240 | if (! @enums) { | |
241 | die "Only enum properties are currently handled; '$prop_name' isn't one"; | |
242 | } | |
243 | else { | |
244 | ||
f79a09fc | 245 | # Convert short names to long |
99f21fb9 | 246 | @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums; |
f79a09fc KW |
247 | |
248 | my @expected_enums = @{$hard_coded_enums{lc $short_name}}; | |
249 | die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version' | |
250 | if @expected_enums < @enums; | |
251 | ||
252 | # Remove the enums found in the input from the ones we expect | |
253 | for (my $i = @expected_enums - 1; $i >= 0; $i--) { | |
254 | splice(@expected_enums, $i, 1) | |
255 | if grep { $expected_enums[$i] eq $_ } @enums; | |
256 | } | |
257 | ||
258 | # The ones remaining must be because we're using an older | |
259 | # Unicode version. Add them to the list. | |
260 | push @enums, @expected_enums; | |
261 | ||
262 | # Add in the extra values coded into this program, and sort. | |
99f21fb9 KW |
263 | push @enums, split /,/, $extra_enums if $extra_enums ne ""; |
264 | @enums = sort @enums; | |
265 | ||
266 | # Assign a value to each element of the enum. The default | |
267 | # value always gets 0; the others are arbitrarily assigned. | |
268 | my $enum_val = 0; | |
02f811dd KW |
269 | my $canonical_default = prop_value_aliases($prop_name, $default); |
270 | $default = $canonical_default if defined $canonical_default; | |
99f21fb9 KW |
271 | $enums{$default} = $enum_val++; |
272 | for my $enum (@enums) { | |
273 | $enums{$enum} = $enum_val++ unless exists $enums{$enum}; | |
274 | } | |
275 | } | |
276 | ||
bffc0129 KW |
277 | # Inversion map stuff is currently used only by regexec |
278 | switch_pound_if($name, 'PERL_IN_REGEXEC_C'); | |
99f21fb9 KW |
279 | { |
280 | ||
99f21fb9 KW |
281 | # The short names tend to be two lower case letters, but it looks |
282 | # better for those if they are upper. XXX | |
283 | $short_name = uc($short_name) if length($short_name) < 3 | |
284 | || substr($short_name, 0, 1) =~ /[[:lower:]]/; | |
85e5f08b | 285 | $name_prefix = "${short_name}_"; |
99f21fb9 KW |
286 | my $enum_count = keys %enums; |
287 | print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n"; | |
288 | ||
289 | print $out_fh "\ntypedef enum {\n"; | |
290 | print $out_fh "\t${name_prefix}$default = $enums{$default},\n"; | |
291 | delete $enums{$default}; | |
292 | foreach my $enum (sort { $a cmp $b } keys %enums) { | |
293 | print $out_fh "\t${name_prefix}$enum = $enums{$enum}"; | |
294 | print $out_fh "," if $enums{$enum} < $enum_count - 1; | |
295 | print $out_fh "\n"; | |
296 | } | |
297 | $declaration_type = "${name_prefix}enum"; | |
298 | print $out_fh "} $declaration_type;\n"; | |
299 | ||
300 | $output_format = "${name_prefix}%s"; | |
301 | } | |
302 | } | |
303 | else { | |
304 | die "'$input_format' invmap() format for '$prop_name' unimplemented"; | |
305 | } | |
306 | ||
307 | die "No inversion map for $prop_name" unless defined $invmap | |
308 | && ref $invmap eq 'ARRAY' | |
309 | && $count; | |
310 | ||
311 | print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {"; | |
312 | print $out_fh " /* for $charset */" if $charset; | |
313 | print $out_fh "\n"; | |
314 | ||
315 | # The main body are the scalars passed in to this routine. | |
316 | for my $i (0 .. $count - 1) { | |
317 | my $element = $invmap->[$i]; | |
02f811dd KW |
318 | my $full_element_name = prop_value_aliases($prop_name, $element); |
319 | $element = $full_element_name if defined $full_element_name; | |
320 | $element = $name_prefix . $element; | |
99f21fb9 KW |
321 | print $out_fh "\t$element"; |
322 | print $out_fh "," if $i < $count - 1; | |
323 | print $out_fh "\n"; | |
324 | } | |
325 | print $out_fh "};\n"; | |
99f21fb9 KW |
326 | } |
327 | ||
5a7e5385 | 328 | sub mk_invlist_from_sorted_cp_list { |
a02047bf KW |
329 | |
330 | # Returns an inversion list constructed from the sorted input array of | |
331 | # code points | |
332 | ||
333 | my $list_ref = shift; | |
334 | ||
99f21fb9 KW |
335 | return unless @$list_ref; |
336 | ||
a02047bf KW |
337 | # Initialize to just the first element |
338 | my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); | |
339 | ||
340 | # For each succeeding element, if it extends the previous range, adjust | |
341 | # up, otherwise add it. | |
342 | for my $i (1 .. @$list_ref - 1) { | |
343 | if ($invlist[-1] == $list_ref->[$i]) { | |
344 | $invlist[-1]++; | |
345 | } | |
346 | else { | |
347 | push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; | |
348 | } | |
349 | } | |
350 | return @invlist; | |
351 | } | |
352 | ||
353 | # Read in the Case Folding rules, and construct arrays of code points for the | |
354 | # properties we need. | |
355 | my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); | |
356 | die "Could not find inversion map for Case_Folding" unless defined $format; | |
357 | die "Incorrect format '$format' for Case_Folding inversion map" | |
347b9066 KW |
358 | unless $format eq 'al' |
359 | || $format eq 'a'; | |
a02047bf KW |
360 | my @has_multi_char_fold; |
361 | my @is_non_final_fold; | |
362 | ||
363 | for my $i (0 .. @$folds_ref - 1) { | |
364 | next unless ref $folds_ref->[$i]; # Skip single-char folds | |
365 | push @has_multi_char_fold, $cp_ref->[$i]; | |
366 | ||
b6a6e956 | 367 | # Add to the non-finals list each code point that is in a non-final |
a02047bf KW |
368 | # position |
369 | for my $j (0 .. @{$folds_ref->[$i]} - 2) { | |
370 | push @is_non_final_fold, $folds_ref->[$i][$j] | |
371 | unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold; | |
372 | } | |
373 | } | |
374 | ||
a02047bf KW |
375 | sub _Perl_Non_Final_Folds { |
376 | @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; | |
5a7e5385 | 377 | return mk_invlist_from_sorted_cp_list(\@is_non_final_fold); |
a02047bf KW |
378 | } |
379 | ||
99f21fb9 KW |
380 | sub prop_name_for_cmp ($) { # Sort helper |
381 | my $name = shift; | |
382 | ||
383 | # Returns the input lowercased, with non-alphas removed, as well as | |
384 | # everything starting with a comma | |
385 | ||
386 | $name =~ s/,.*//; | |
387 | $name =~ s/[[:^alpha:]]//g; | |
388 | return lc $name; | |
389 | } | |
390 | ||
892d8259 | 391 | sub UpperLatin1 { |
5a7e5385 | 392 | return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); |
892d8259 KW |
393 | } |
394 | ||
9d9177be KW |
395 | output_invlist("Latin1", [ 0, 256 ]); |
396 | output_invlist("AboveLatin1", [ 256 ]); | |
397 | ||
bffc0129 | 398 | end_file_pound_if; |
43b443dd | 399 | |
3f427fd9 KW |
400 | # We construct lists for all the POSIX and backslash sequence character |
401 | # classes in two forms: | |
402 | # 1) ones which match only in the ASCII range | |
403 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
404 | # | |
405 | # These get compiled in, and hence affect the memory footprint of every Perl | |
406 | # program, even those not using Unicode. To minimize the size, currently | |
407 | # the Latin1 version is generated for the beyond ASCII range except for those | |
408 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
409 | # UVs long plus 4 UVs (currently) for the header. | |
410 | # | |
411 | # To save even more memory, the ASCII versions could be derived from the | |
412 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
413 | # instructions to do so), but these are all small anyway, so their total is | |
414 | # about 100 UVs. | |
415 | # | |
416 | # In the list of properties below that get generated, the L1 prefix is a fake | |
417 | # property that means just the Latin1 range of the full property (whose name | |
418 | # has an X prefix instead of L1). | |
a02047bf KW |
419 | # |
420 | # An initial & means to use the subroutine from this file instead of an | |
421 | # official inversion list. | |
3f427fd9 | 422 | |
0c4ecf42 KW |
423 | for my $charset (get_supported_code_pages()) { |
424 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
425 | ||
99f21fb9 KW |
426 | @a2n = @{get_a2n($charset)}; |
427 | no warnings 'qw'; | |
428 | # Ignore non-alpha in sort | |
429 | for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
1c8c3428 KW |
430 | ASCII |
431 | Cased | |
432 | VertSpace | |
433 | XPerlSpace | |
434 | XPosixAlnum | |
435 | XPosixAlpha | |
436 | XPosixBlank | |
437 | XPosixCntrl | |
438 | XPosixDigit | |
439 | XPosixGraph | |
440 | XPosixLower | |
441 | XPosixPrint | |
442 | XPosixPunct | |
443 | XPosixSpace | |
444 | XPosixUpper | |
445 | XPosixWord | |
446 | XPosixXDigit | |
447 | _Perl_Any_Folds | |
448 | &NonL1_Perl_Non_Final_Folds | |
449 | _Perl_Folds_To_Multi_Char | |
450 | &UpperLatin1 | |
451 | _Perl_IDStart | |
452 | _Perl_IDCont | |
02f811dd | 453 | _Perl_GCB,EDGE |
bf4268fa KW |
454 | _Perl_WB,EDGE,UNKNOWN |
455 | _Perl_SB,EDGE | |
1c8c3428 | 456 | ) |
0f5e3c71 KW |
457 | ) { |
458 | ||
459 | # For the Latin1 properties, we change to use the eXtended version of the | |
460 | # base property, then go through the result and get rid of everything not | |
461 | # in Latin1 (above 255). Actually, we retain the element for the range | |
462 | # that crosses the 255/256 boundary if it is one that matches the | |
463 | # property. For example, in the Word property, there is a range of code | |
464 | # points that start at U+00F8 and goes through U+02C1. Instead of | |
465 | # artificially cutting that off at 256 because 256 is the first code point | |
466 | # above Latin1, we let the range go to its natural ending. That gives us | |
467 | # extra information with no added space taken. But if the range that | |
468 | # crosses the boundary is one that doesn't match the property, we don't | |
469 | # start a new range above 255, as that could be construed as going to | |
470 | # infinity. For example, the Upper property doesn't include the character | |
471 | # at 255, but does include the one at 256. We don't include the 256 one. | |
472 | my $prop_name = $prop; | |
473 | my $is_local_sub = $prop_name =~ s/^&//; | |
99f21fb9 KW |
474 | my $extra_enums = ""; |
475 | $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
0f5e3c71 KW |
476 | my $lookup_prop = $prop_name; |
477 | my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ | |
478 | or $lookup_prop =~ s/^L1//); | |
479 | my $nonl1_only = 0; | |
480 | $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
99f21fb9 | 481 | ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; |
0f5e3c71 KW |
482 | |
483 | my @invlist; | |
99f21fb9 KW |
484 | my @invmap; |
485 | my $map_format; | |
486 | my $map_default; | |
487 | my $maps_to_code_point; | |
488 | my $to_adjust; | |
0f5e3c71 KW |
489 | if ($is_local_sub) { |
490 | @invlist = eval $lookup_prop; | |
491 | } | |
492 | else { | |
493 | @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
99f21fb9 | 494 | if (! @invlist) { |
99f21fb9 | 495 | |
ad85f59a KW |
496 | # If couldn't find a non-empty inversion list, see if it is |
497 | # instead an inversion map | |
498 | my ($list_ref, $map_ref, $format, $default) | |
99f21fb9 | 499 | = prop_invmap($lookup_prop, '_perl_core_internal_ok'); |
ad85f59a KW |
500 | if (! $list_ref) { |
501 | # An empty return here could mean an unknown property, or | |
502 | # merely that the original inversion list is empty. Call | |
503 | # in scalar context to differentiate | |
504 | my $count = prop_invlist($lookup_prop, | |
505 | '_perl_core_internal_ok'); | |
506 | die "Could not find inversion list for '$lookup_prop'" | |
507 | unless defined $count; | |
508 | } | |
509 | else { | |
18b852b3 KW |
510 | @invlist = @$list_ref; |
511 | @invmap = @$map_ref; | |
512 | $map_format = $format; | |
513 | $map_default = $default; | |
514 | $maps_to_code_point = $map_format =~ /x/; | |
515 | $to_adjust = $map_format =~ /a/; | |
ad85f59a | 516 | } |
99f21fb9 | 517 | } |
0f5e3c71 | 518 | } |
ad85f59a KW |
519 | |
520 | ||
521 | # Short-circuit an empty inversion list. | |
522 | if (! @invlist) { | |
523 | output_invlist($prop_name, \@invlist, $charset); | |
524 | next; | |
525 | } | |
ceb1de32 | 526 | |
99f21fb9 KW |
527 | # Re-order the Unicode code points to native ones for this platform. |
528 | # This is only needed for code points below 256, because native code | |
529 | # points are only in that range. For inversion maps of properties | |
530 | # where the mappings are adjusted (format =~ /a/), this reordering | |
531 | # could mess up the adjustment pattern that was in the input, so that | |
532 | # has to be dealt with. | |
533 | # | |
534 | # And inversion maps that map to code points need to eventually have | |
535 | # all those code points remapped to native, and it's better to do that | |
536 | # here, going through the whole list not just those below 256. This | |
537 | # is because some inversion maps have adjustments (format =~ /a/) | |
538 | # which may be affected by the reordering. This code needs to be done | |
539 | # both for when we are translating the inversion lists for < 256, and | |
540 | # for the inversion maps for everything. By doing both in this loop, | |
541 | # we can share that code. | |
542 | # | |
543 | # So, we go through everything for an inversion map to code points; | |
544 | # otherwise, we can skip any remapping at all if we are going to | |
545 | # output only the above-Latin1 values, or if the range spans the whole | |
546 | # of 0..256, as the remap will also include all of 0..256 (256 not | |
547 | # 255 because a re-ordering could cause 256 to need to be in the same | |
548 | # range as 255.) | |
549 | if ((@invmap && $maps_to_code_point) | |
550 | || (! $nonl1_only || ($invlist[0] < 256 | |
551 | && ! ($invlist[0] == 0 && $invlist[1] > 256)))) | |
ceb1de32 | 552 | { |
fb4554ea | 553 | |
99f21fb9 | 554 | if (! @invmap) { # Straight inversion list |
fb4554ea KW |
555 | # Look at all the ranges that start before 257. |
556 | my @latin1_list; | |
557 | while (@invlist) { | |
558 | last if $invlist[0] > 256; | |
559 | my $upper = @invlist > 1 | |
560 | ? $invlist[1] - 1 # In range | |
8a6c81cf KW |
561 | |
562 | # To infinity. You may want to stop much much | |
563 | # earlier; going this high may expose perl | |
564 | # deficiencies with very large numbers. | |
565 | : $Unicode::UCD::MAX_CP; | |
fb4554ea | 566 | for my $j ($invlist[0] .. $upper) { |
99f21fb9 | 567 | push @latin1_list, a2n($j); |
0f5e3c71 | 568 | } |
fb4554ea KW |
569 | |
570 | shift @invlist; # Shift off the range that's in the list | |
571 | shift @invlist; # Shift off the range not in the list | |
0c4ecf42 | 572 | } |
fb4554ea KW |
573 | |
574 | # Here @invlist contains all the ranges in the original that start | |
575 | # at code points above 256, and @latin1_list contains all the | |
576 | # native code points for ranges that start with a Unicode code | |
577 | # point below 257. We sort the latter and convert it to inversion | |
578 | # list format. Then simply prepend it to the list of the higher | |
579 | # code points. | |
580 | @latin1_list = sort { $a <=> $b } @latin1_list; | |
5a7e5385 | 581 | @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); |
fb4554ea | 582 | unshift @invlist, @latin1_list; |
99f21fb9 KW |
583 | } |
584 | else { # Is an inversion map | |
585 | ||
586 | # This is a similar procedure as plain inversion list, but has | |
587 | # multiple buckets. A plain inversion list just has two | |
588 | # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
589 | # pretty much can ignore the 2nd bucket, as it is completely | |
590 | # defined by the 1st. But here, what we do is create buckets | |
591 | # which contain the code points that map to each, translated | |
592 | # to native and turned into an inversion list. Thus each | |
593 | # bucket is an inversion list of native code points that map | |
594 | # to it or don't map to it. We use these to create an | |
595 | # inversion map for the whole property. | |
596 | ||
597 | # As mentioned earlier, we use this procedure to not just | |
598 | # remap the inversion list to native values, but also the maps | |
599 | # of code points to native ones. In the latter case we have | |
600 | # to look at the whole of the inversion map (or at least to | |
601 | # above Unicode; as the maps of code points above that should | |
602 | # all be to the default). | |
603 | my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256; | |
604 | ||
605 | my %mapped_lists; # A hash whose keys are the buckets. | |
606 | while (@invlist) { | |
607 | last if $invlist[0] > $upper_limit; | |
608 | ||
609 | # This shouldn't actually happen, as prop_invmap() returns | |
610 | # an extra element at the end that is beyond $upper_limit | |
611 | die "inversion map that extends to infinity is unimplemented" unless @invlist > 1; | |
612 | ||
613 | my $bucket; | |
614 | ||
615 | # A hash key can't be a ref (we are only expecting arrays | |
616 | # of scalars here), so convert any such to a string that | |
617 | # will be converted back later (using a vertical tab as | |
618 | # the separator). Even if the mapping is to code points, | |
619 | # we don't translate to native here because the code | |
620 | # output_map() calls to output these arrays assumes the | |
621 | # input is Unicode, not native. | |
622 | if (ref $invmap[0]) { | |
623 | $bucket = join "\cK", @{$invmap[0]}; | |
624 | } | |
625 | elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) { | |
626 | ||
627 | # Do convert to native for maps to single code points. | |
628 | # There are some properties that have a few outlier | |
629 | # maps that aren't code points, so the above test | |
630 | # skips those. | |
631 | $bucket = a2n($invmap[0]); | |
632 | } else { | |
633 | $bucket = $invmap[0]; | |
634 | } | |
635 | ||
636 | # We now have the bucket that all code points in the range | |
637 | # map to, though possibly they need to be adjusted. Go | |
638 | # through the range and put each translated code point in | |
639 | # it into its bucket. | |
640 | my $base_map = $invmap[0]; | |
641 | for my $j ($invlist[0] .. $invlist[1] - 1) { | |
642 | if ($to_adjust | |
643 | # The 1st code point doesn't need adjusting | |
644 | && $j > $invlist[0] | |
645 | ||
646 | # Skip any non-numeric maps: these are outliers | |
647 | # that aren't code points. | |
648 | && $base_map =~ $numeric_re | |
649 | ||
650 | # 'ne' because the default can be a string | |
651 | && $base_map ne $map_default) | |
652 | { | |
653 | # We adjust, by incrementing each the bucket and | |
654 | # the map. For code point maps, translate to | |
655 | # native | |
656 | $base_map++; | |
657 | $bucket = ($maps_to_code_point) | |
658 | ? a2n($base_map) | |
659 | : $base_map; | |
660 | } | |
661 | ||
662 | # Add the native code point to the bucket for the | |
663 | # current map | |
664 | push @{$mapped_lists{$bucket}}, a2n($j); | |
665 | } # End of loop through all code points in the range | |
666 | ||
667 | # Get ready for the next range | |
668 | shift @invlist; | |
669 | shift @invmap; | |
670 | } # End of loop through all ranges in the map. | |
671 | ||
672 | # Here, @invlist and @invmap retain all the ranges from the | |
673 | # originals that start with code points above $upper_limit. | |
674 | # Each bucket in %mapped_lists contains all the code points | |
675 | # that map to that bucket. If the bucket is for a map to a | |
676 | # single code point is a single code point, the bucket has | |
677 | # been converted to native. If something else (including | |
678 | # multiple code points), no conversion is done. | |
679 | # | |
680 | # Now we recreate the inversion map into %xlated, but this | |
681 | # time for the native character set. | |
682 | my %xlated; | |
683 | foreach my $bucket (keys %mapped_lists) { | |
684 | ||
685 | # Sort and convert this bucket to an inversion list. The | |
686 | # result will be that ranges that start with even-numbered | |
687 | # indexes will be for code points that map to this bucket; | |
688 | # odd ones map to some other bucket, and are discarded | |
689 | # below. | |
690 | @{$mapped_lists{$bucket}} | |
691 | = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
692 | @{$mapped_lists{$bucket}} | |
693 | = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); | |
694 | ||
695 | # Add each even-numbered range in the bucket to %xlated; | |
696 | # so that the keys of %xlated become the range start code | |
697 | # points, and the values are their corresponding maps. | |
698 | while (@{$mapped_lists{$bucket}}) { | |
699 | my $range_start = $mapped_lists{$bucket}->[0]; | |
700 | if ($bucket =~ /\cK/) { | |
701 | @{$xlated{$range_start}} = split /\cK/, $bucket; | |
702 | } | |
703 | else { | |
704 | $xlated{$range_start} = $bucket; | |
705 | } | |
706 | shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
707 | shift @{$mapped_lists{$bucket}}; # Get ready for next | |
708 | # iteration | |
709 | } | |
710 | } # End of loop through all the buckets. | |
711 | ||
712 | # Here %xlated's keys are the range starts of all the code | |
713 | # points in the inversion map. Construct an inversion list | |
714 | # from them. | |
715 | my @new_invlist = sort { $a <=> $b } keys %xlated; | |
716 | ||
717 | # If the list is adjusted, we want to munge this list so that | |
718 | # we only have one entry for where consecutive code points map | |
719 | # to consecutive values. We just skip the subsequent entries | |
720 | # where this is the case. | |
721 | if ($to_adjust) { | |
722 | my @temp; | |
723 | for my $i (0 .. @new_invlist - 1) { | |
724 | next if $i > 0 | |
725 | && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
726 | && $xlated{$new_invlist[$i-1]} =~ $numeric_re | |
727 | && $xlated{$new_invlist[$i]} =~ $numeric_re | |
728 | && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]}; | |
729 | push @temp, $new_invlist[$i]; | |
730 | } | |
731 | @new_invlist = @temp; | |
732 | } | |
733 | ||
734 | # The inversion map comes from %xlated's values. We can | |
735 | # unshift each onto the front of the untouched portion, in | |
736 | # reverse order of the portion we did process. | |
737 | foreach my $start (reverse @new_invlist) { | |
738 | unshift @invmap, $xlated{$start}; | |
739 | } | |
740 | ||
741 | # Finally prepend the inversion list we have just constructed to the | |
742 | # one that contains anything we didn't process. | |
743 | unshift @invlist, @new_invlist; | |
744 | } | |
745 | } | |
746 | ||
747 | # prop_invmap() returns an extra final entry, which we can now | |
748 | # discard. | |
749 | if (@invmap) { | |
750 | pop @invlist; | |
751 | pop @invmap; | |
ceb1de32 | 752 | } |
0f5e3c71 KW |
753 | |
754 | if ($l1_only) { | |
99f21fb9 | 755 | die "Unimplemented to do a Latin-1 only inversion map" if @invmap; |
0f5e3c71 KW |
756 | for my $i (0 .. @invlist - 1 - 1) { |
757 | if ($invlist[$i] > 255) { | |
758 | ||
759 | # In an inversion list, even-numbered elements give the code | |
760 | # points that begin ranges that match the property; | |
761 | # odd-numbered give ones that begin ranges that don't match. | |
762 | # If $i is odd, we are at the first code point above 255 that | |
763 | # doesn't match, which means the range it is ending does | |
764 | # match, and crosses the 255/256 boundary. We want to include | |
765 | # this ending point, so increment $i, so the splice below | |
766 | # includes it. Conversely, if $i is even, it is the first | |
767 | # code point above 255 that matches, which means there was no | |
768 | # matching range that crossed the boundary, and we don't want | |
769 | # to include this code point, so splice before it. | |
770 | $i++ if $i % 2 != 0; | |
771 | ||
772 | # Remove everything past this. | |
773 | splice @invlist, $i; | |
99f21fb9 | 774 | splice @invmap, $i if @invmap; |
0f5e3c71 KW |
775 | last; |
776 | } | |
0c4ecf42 KW |
777 | } |
778 | } | |
0f5e3c71 KW |
779 | elsif ($nonl1_only) { |
780 | my $found_nonl1 = 0; | |
781 | for my $i (0 .. @invlist - 1 - 1) { | |
782 | next if $invlist[$i] < 256; | |
783 | ||
784 | # Here, we have the first element in the array that indicates an | |
785 | # element above Latin1. Get rid of all previous ones. | |
786 | splice @invlist, 0, $i; | |
99f21fb9 | 787 | splice @invmap, 0, $i if @invmap; |
0f5e3c71 KW |
788 | |
789 | # If this one's index is not divisible by 2, it means that this | |
790 | # element is inverting away from being in the list, which means | |
99f21fb9 KW |
791 | # all code points from 256 to this one are in this list (or |
792 | # map to the default for inversion maps) | |
793 | if ($i % 2 != 0) { | |
794 | unshift @invlist, 256; | |
795 | unshift @invmap, $map_default if @invmap; | |
796 | } | |
0f5e3c71 | 797 | $found_nonl1 = 1; |
3f427fd9 KW |
798 | last; |
799 | } | |
0f5e3c71 | 800 | die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; |
3f427fd9 | 801 | } |
3f427fd9 | 802 | |
0f5e3c71 | 803 | output_invlist($prop_name, \@invlist, $charset); |
99f21fb9 | 804 | output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap; |
0f5e3c71 | 805 | } |
bffc0129 | 806 | end_file_pound_if; |
0c4ecf42 | 807 | print $out_fh "\n" . get_conditional_compile_line_end(); |
9d9177be KW |
808 | } |
809 | ||
2308ab83 | 810 | my $sources_list = "lib/unicore/mktables.lst"; |
216b41c2 KW |
811 | my @sources = ($0, qw(lib/unicore/mktables |
812 | lib/Unicode/UCD.pm | |
813 | regen/charset_translations.pl | |
814 | )); | |
9a3da3ad FC |
815 | { |
816 | # Depend on mktables’ own sources. It’s a shorter list of files than | |
817 | # those that Unicode::UCD uses. | |
2308ab83 KW |
818 | if (! open my $mktables_list, $sources_list) { |
819 | ||
820 | # This should force a rebuild once $sources_list exists | |
821 | push @sources, $sources_list; | |
822 | } | |
823 | else { | |
824 | while(<$mktables_list>) { | |
825 | last if /===/; | |
826 | chomp; | |
827 | push @sources, "lib/unicore/$_" if /^[^#]/; | |
828 | } | |
9a3da3ad FC |
829 | } |
830 | } | |
831 | read_only_bottom_close_and_rename($out_fh, \@sources) |