Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
463b4a67 | 10 | charprop |
a1c8344d | 11 | num |
2cd613ec | 12 | charblock |
99f21fb9 | 13 | ); |
d1907b94 | 14 | use constant DEBUG => $ENV{DEBUG} // 0; |
3d7c117d MB |
15 | require './regen/regen_lib.pl'; |
16 | require './regen/charset_translations.pl'; | |
048bdb72 | 17 | require './lib/unicore/UCD.pl'; |
64ff2b0f | 18 | require './regen/mph.pl'; |
db95f459 | 19 | use re "/aa"; |
9d9177be | 20 | |
d1907b94 YO |
21 | print "Starting...\n" if DEBUG; |
22 | ||
9d9177be KW |
23 | # This program outputs charclass_invlists.h, which contains various inversion |
24 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
25 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
26 | # light-weight fast wrapper to make them usable at run-time. | |
27 | ||
28 | # As such, this code knows about the internal structure of these lists, and | |
29 | # any change made to that has to be done here as well. A random number stored | |
30 | # in the headers is used to minimize the possibility of things getting | |
31 | # out-of-sync, or the wrong data structure being passed. Currently that | |
32 | # random number is: | |
99f21fb9 | 33 | |
1c7f0e60 KW |
34 | my $VERSION_DATA_STRUCTURE_TYPE = 148565664; |
35 | ||
b72e0f31 KW |
36 | # charclass_invlists.h now also contains inversion maps and enum definitions |
37 | # for those maps that have a finite number of possible values | |
99f21fb9 | 38 | |
98a1b8f7 KW |
39 | # integer or float (no exponent) |
40 | my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x; | |
41 | ||
42 | # Also includes rationals | |
43 | my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x; | |
99f21fb9 | 44 | |
cb2d98ed KW |
45 | # More than one code point may have the same code point as their fold. This |
46 | # gives the maximum number in the current Unicode release. (The folded-to | |
47 | # code point is not included in this count.) Most folds are pairs of code | |
48 | # points, like 'B' and 'b', so this number is at least one. | |
49 | my $max_fold_froms = 1; | |
50 | ||
f4b10e8e | 51 | my %keywords; |
cef72199 | 52 | my $table_name_prefix = "UNI_"; |
4eea95a6 | 53 | |
99f21fb9 KW |
54 | # Matches valid C language enum names: begins with ASCII alphabetic, then any |
55 | # ASCII \w | |
56 | my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; | |
57 | ||
9d9177be | 58 | my $out_fh = open_new('charclass_invlists.h', '>', |
9824c081 | 59 | {style => '*', by => 'regen/mk_invlists.pl', |
9d9177be KW |
60 | from => "Unicode::UCD"}); |
61 | ||
0f8eed22 | 62 | my $in_file_pound_if = ""; |
43b443dd | 63 | |
289ce9cc KW |
64 | my $max_hdr_len = 3; # In headings, how wide a name is allowed? |
65 | ||
9d9177be KW |
66 | print $out_fh "/* See the generating file for comments */\n\n"; |
67 | ||
a405530a KW |
68 | print $out_fh <<'EOF'; |
69 | /* This gives the number of code points that can be in the bitmap of an ANYOF | |
70 | * node. The shift number must currently be one of: 8..12. It can't be less | |
71 | * than 8 (256) because some code relies on it being at least that. Above 12 | |
72 | * (4096), and you start running into warnings that some data structure widths | |
73 | * have been exceeded, though the test suite as of this writing still passes | |
74 | * for up through 16, which is as high as anyone would ever want to go, | |
75 | * encompassing all of the Unicode BMP, and thus including all the economically | |
76 | * important world scripts. At 12 most of them are: including Arabic, | |
77 | * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han, | |
d1907b94 YO |
78 | * Japanese, nor Korean. The regnode sizing data structure in regnodes.h currently |
79 | * uses a U8, and the trie types TRIEC and AHOCORASICKC are larger than U8 for | |
80 | * shift values above 12.) Be sure to benchmark before changing, as larger sizes | |
81 | * do significantly slow down the test suite. */ | |
a405530a KW |
82 | |
83 | EOF | |
84 | ||
85 | my $num_anyof_code_points = '(1 << 8)'; | |
86 | ||
87 | print $out_fh "#define NUM_ANYOF_CODE_POINTS $num_anyof_code_points\n\n"; | |
88 | ||
89 | $num_anyof_code_points = eval $num_anyof_code_points; | |
90 | ||
da477c7d KW |
91 | no warnings 'once'; |
92 | print $out_fh <<"EOF"; | |
93 | /* The precision to use in "%.*e" formats */ | |
94 | #define PL_E_FORMAT_PRECISION $Unicode::UCD::e_precision | |
95 | EOF | |
96 | ||
d74e7480 KW |
97 | # enums that should be made public |
98 | my %public_enums = ( | |
f52cc976 | 99 | _Perl_SCX => 1 |
d74e7480 KW |
100 | ); |
101 | ||
bffc0129 KW |
102 | # The symbols generated by this program are all currently defined only in a |
103 | # single dot c each. The code knows where most of them go, but this hash | |
104 | # gives overrides for the exceptions to the typical place | |
105 | my %exceptions_to_where_to_define = | |
7dddaf74 | 106 | ( |
03d17b6e | 107 | #_Perl_IVCF => 'PERL_IN_REGCOMP_C', |
bffc0129 | 108 | ); |
4761f74a | 109 | |
c0221e16 | 110 | my %where_to_define_enums = (); |
015bb97c | 111 | |
59fc10af KW |
112 | my $applies_to_all_charsets_text = "all charsets"; |
113 | ||
973a28ed KW |
114 | my %gcb_enums; |
115 | my @gcb_short_enums; | |
289ce9cc | 116 | my %gcb_abbreviations; |
6b659339 KW |
117 | my %lb_enums; |
118 | my @lb_short_enums; | |
289ce9cc | 119 | my %lb_abbreviations; |
7e54b87f KW |
120 | my %wb_enums; |
121 | my @wb_short_enums; | |
289ce9cc | 122 | my %wb_abbreviations; |
6b659339 | 123 | |
99f21fb9 KW |
124 | my @a2n; |
125 | ||
394d2d3f KW |
126 | my %prop_name_aliases; |
127 | # Invert this hash so that for each canonical name, we get a list of things | |
128 | # that map to it (excluding itself) | |
048bdb72 KW |
129 | foreach my $name (sort keys %Unicode::UCD::loose_property_name_of) { |
130 | my $canonical = $Unicode::UCD::loose_property_name_of{$name}; | |
394d2d3f KW |
131 | push @{$prop_name_aliases{$canonical}}, $name if $canonical ne $name; |
132 | } | |
133 | ||
2d74dcf2 | 134 | # Output these tables in the same vicinity as each other, so that will get |
1aefa327 KW |
135 | # paged in at about the same time. These are also assumed to be the exact |
136 | # same list as those properties used internally by perl. | |
2d74dcf2 KW |
137 | my %keep_together = ( |
138 | assigned => 1, | |
139 | ascii => 1, | |
7a6f6841 KW |
140 | upper => 1, |
141 | lower => 1, | |
142 | title => 1, | |
2d74dcf2 | 143 | cased => 1, |
7a6f6841 KW |
144 | uppercaseletter => 1, |
145 | lowercaseletter => 1, | |
146 | titlecaseletter => 1, | |
147 | casedletter => 1, | |
2d74dcf2 KW |
148 | vertspace => 1, |
149 | xposixalnum => 1, | |
150 | xposixalpha => 1, | |
151 | xposixblank => 1, | |
152 | xposixcntrl => 1, | |
153 | xposixdigit => 1, | |
154 | xposixgraph => 1, | |
155 | xposixlower => 1, | |
156 | xposixprint => 1, | |
157 | xposixpunct => 1, | |
158 | xposixspace => 1, | |
159 | xposixupper => 1, | |
160 | xposixword => 1, | |
161 | xposixxdigit => 1, | |
162 | posixalnum => 1, | |
163 | posixalpha => 1, | |
164 | posixblank => 1, | |
165 | posixcntrl => 1, | |
166 | posixdigit => 1, | |
167 | posixgraph => 1, | |
168 | posixlower => 1, | |
169 | posixprint => 1, | |
170 | posixpunct => 1, | |
171 | posixspace => 1, | |
172 | posixupper => 1, | |
173 | posixword => 1, | |
174 | posixxdigit => 1, | |
175 | _perl_any_folds => 1, | |
176 | _perl_folds_to_multi_char => 1, | |
441f4830 KW |
177 | _perl_is_in_multi_char_fold => 1, |
178 | _perl_non_final_folds => 1, | |
2d74dcf2 KW |
179 | _perl_idstart => 1, |
180 | _perl_idcont => 1, | |
181 | _perl_charname_begin => 1, | |
182 | _perl_charname_continue => 1, | |
183 | _perl_problematic_locale_foldeds_start => 1, | |
184 | _perl_problematic_locale_folds => 1, | |
185 | _perl_quotemeta => 1, | |
186 | ); | |
1aefa327 | 187 | my %perl_tags; # So can find synonyms of the above properties |
2d74dcf2 | 188 | |
2027d365 KW |
189 | my $unused_table_hdr = 'u'; # Heading for row or column for unused values |
190 | ||
99f21fb9 KW |
191 | sub uniques { |
192 | # Returns non-duplicated input values. From "Perl Best Practices: | |
193 | # Encapsulated Cleverness". p. 455 in first edition. | |
194 | ||
195 | my %seen; | |
196 | return grep { ! $seen{$_}++ } @_; | |
197 | } | |
198 | ||
18072598 KW |
199 | sub caselessly { lc $a cmp lc $b } |
200 | ||
99f21fb9 KW |
201 | sub a2n($) { |
202 | my $cp = shift; | |
203 | ||
204 | # Returns the input Unicode code point translated to native. | |
205 | ||
98a1b8f7 | 206 | return $cp if $cp !~ $integer_or_float_re || $cp > 255; |
99f21fb9 KW |
207 | return $a2n[$cp]; |
208 | } | |
209 | ||
bffc0129 KW |
210 | sub end_file_pound_if { |
211 | if ($in_file_pound_if) { | |
212 | print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; | |
0f8eed22 | 213 | $in_file_pound_if = ""; |
bffc0129 KW |
214 | } |
215 | } | |
216 | ||
48737b77 KW |
217 | sub end_charset_pound_if { |
218 | print $out_fh "\n" . get_conditional_compile_line_end(); | |
219 | } | |
220 | ||
8ec55631 | 221 | sub switch_pound_if ($$;$) { |
bffc0129 KW |
222 | my $name = shift; |
223 | my $new_pound_if = shift; | |
8ec55631 KW |
224 | my $charset = shift; |
225 | ||
62a54bb7 | 226 | my @new_pound_if = ref ($new_pound_if) |
0f8eed22 | 227 | ? sort @$new_pound_if |
62a54bb7 | 228 | : $new_pound_if; |
bffc0129 KW |
229 | |
230 | # Switch to new #if given by the 2nd argument. If there is an override | |
231 | # for this, it instead switches to that. The 1st argument is the | |
0f8eed22 | 232 | # static's name, used only to check if there is an override for this |
8ec55631 KW |
233 | # |
234 | # The 'charset' parmameter, if present, is used to first end the charset | |
235 | # #if if we actually do a switch, and then restart it afterwards. This | |
236 | # code, then assumes that the charset #if's are enclosed in the file ones. | |
bffc0129 KW |
237 | |
238 | if (exists $exceptions_to_where_to_define{$name}) { | |
62a54bb7 | 239 | @new_pound_if = $exceptions_to_where_to_define{$name}; |
bffc0129 KW |
240 | } |
241 | ||
0f8eed22 | 242 | foreach my $element (@new_pound_if) { |
cef72199 KW |
243 | |
244 | # regcomp.c is arranged so that the tables are not compiled in | |
b619f93d KW |
245 | # re_comp.c, but general enums and defines (which take no space) are |
246 | # compiled */ | |
247 | my $no_xsub = 1 if $name !~ /enum|define/ | |
248 | && $element =~ / PERL_IN_ (?: REGCOMP ) _C /x; | |
0f8eed22 | 249 | $element = "defined($element)"; |
cef72199 | 250 | $element = "($element && ! defined(PERL_IN_XSUB_RE))" if $no_xsub; |
bffc0129 | 251 | } |
0f8eed22 | 252 | $new_pound_if = join " || ", @new_pound_if; |
bffc0129 | 253 | |
0f8eed22 KW |
254 | # Change to the new one if different from old |
255 | if ($in_file_pound_if ne $new_pound_if) { | |
256 | ||
8ec55631 KW |
257 | end_charset_pound_if() if defined $charset; |
258 | ||
0f8eed22 KW |
259 | # Exit any current #if |
260 | if ($in_file_pound_if) { | |
261 | end_file_pound_if; | |
62a54bb7 | 262 | } |
0f8eed22 KW |
263 | |
264 | $in_file_pound_if = $new_pound_if; | |
bffc0129 | 265 | print $out_fh "\n#if $in_file_pound_if\n"; |
8ec55631 KW |
266 | |
267 | start_charset_pound_if ($charset, 1) if defined $charset; | |
43b443dd KW |
268 | } |
269 | } | |
270 | ||
48737b77 KW |
271 | sub start_charset_pound_if ($;$) { |
272 | print $out_fh "\n" . get_conditional_compile_line_start(shift, shift); | |
273 | } | |
274 | ||
cef72199 KW |
275 | { # Closure |
276 | my $fh; | |
277 | my $in_doinit = 0; | |
278 | ||
279 | sub output_table_header($$$;$@) { | |
280 | ||
281 | # Output to $fh the heading for a table given by the other inputs | |
282 | ||
283 | $fh = shift; | |
284 | my ($type, # typedef of table, like UV, UV* | |
285 | $name, # name of table | |
286 | $comment, # Optional comment to put on header line | |
287 | @sizes # Optional sizes of each array index. If omitted, | |
288 | # there is a single index whose size is computed by | |
289 | # the C compiler. | |
290 | ) = @_; | |
291 | ||
292 | $type =~ s/ \s+ $ //x; | |
293 | ||
294 | # If a the typedef is a ptr, add in an extra const | |
295 | $type .= " const" if $type =~ / \* $ /x; | |
296 | ||
297 | $comment = "" unless defined $comment; | |
298 | $comment = " /* $comment */" if $comment; | |
299 | ||
300 | my $array_declaration; | |
301 | if (@sizes) { | |
302 | $array_declaration = ""; | |
303 | $array_declaration .= "[$_]" for @sizes; | |
304 | } | |
305 | else { | |
306 | $array_declaration = '[]'; | |
307 | } | |
308 | ||
309 | my $declaration = "$type ${name}$array_declaration"; | |
310 | ||
311 | # Things not matching this are static. Otherwise, it is an external | |
312 | # constant, initialized only under DOINIT. | |
313 | # | |
314 | # (Currently everything is static) | |
315 | if ($in_file_pound_if !~ / PERL_IN_ (?: ) _C /x) { | |
316 | $in_doinit = 0; | |
317 | print $fh "\nstatic const $declaration = {$comment\n"; | |
318 | } | |
319 | else { | |
320 | $in_doinit = 1; | |
321 | print $fh <<EOF; | |
322 | ||
323 | # ifndef DOINIT | |
324 | ||
325 | EXTCONST $declaration; | |
326 | ||
327 | # else | |
328 | ||
329 | EXTCONST $declaration = {$comment | |
330 | EOF | |
331 | } | |
332 | } | |
333 | ||
334 | sub output_table_trailer() { | |
335 | ||
336 | # Close out a table started by output_table_header() | |
337 | ||
338 | print $fh "};\n"; | |
339 | if ($in_doinit) { | |
340 | print $fh "\n# endif /* DOINIT */\n\n"; | |
341 | $in_doinit = 0; | |
342 | } | |
343 | } | |
344 | } # End closure | |
345 | ||
346 | ||
0c4ecf42 | 347 | sub output_invlist ($$;$) { |
9d9177be KW |
348 | my $name = shift; |
349 | my $invlist = shift; # Reference to inversion list array | |
0c4ecf42 | 350 | my $charset = shift // ""; # name of character set for comment |
9d9177be | 351 | |
d1907b94 YO |
352 | print " output_invlist($name) $charset\n" if DEBUG; |
353 | ||
76d3994c | 354 | die "No inversion list for $name" unless defined $invlist |
ad85f59a | 355 | && ref $invlist eq 'ARRAY'; |
76d3994c | 356 | |
9d9177be KW |
357 | # Output the inversion list $invlist using the name $name for it. |
358 | # It is output in the exact internal form for inversion lists. | |
359 | ||
a0316a6c KW |
360 | # Is the last element of the header 0, or 1 ? |
361 | my $zero_or_one = 0; | |
ad85f59a | 362 | if (@$invlist && $invlist->[0] != 0) { |
a0316a6c | 363 | unshift @$invlist, 0; |
9d9177be KW |
364 | $zero_or_one = 1; |
365 | } | |
366 | ||
cef72199 KW |
367 | $charset = "for $charset" if $charset; |
368 | output_table_header($out_fh, "UV", "${name}_invlist", $charset); | |
9d9177be | 369 | |
cef72199 KW |
370 | my $count = @$invlist; |
371 | print $out_fh <<EOF; | |
372 | \t$count,\t/* Number of elements */ | |
373 | \t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */ | |
374 | \t$zero_or_one,\t/* 0 if the list starts at 0; | |
375 | \t\t 1 if it starts at the element beyond 0 */ | |
376 | EOF | |
9d9177be KW |
377 | |
378 | # The main body are the UVs passed in to this routine. Do the final | |
379 | # element separately | |
47d53124 KW |
380 | for my $i (0 .. @$invlist - 1) { |
381 | printf $out_fh "\t0x%X", $invlist->[$i]; | |
382 | print $out_fh "," if $i < @$invlist - 1; | |
383 | print $out_fh "\n"; | |
9d9177be KW |
384 | } |
385 | ||
cef72199 | 386 | output_table_trailer(); |
9d9177be KW |
387 | } |
388 | ||
99f21fb9 KW |
389 | sub output_invmap ($$$$$$$) { |
390 | my $name = shift; | |
391 | my $invmap = shift; # Reference to inversion map array | |
392 | my $prop_name = shift; | |
393 | my $input_format = shift; # The inversion map's format | |
394 | my $default = shift; # The property value for code points who | |
395 | # otherwise don't have a value specified. | |
396 | my $extra_enums = shift; # comma-separated list of our additions to the | |
397 | # property's standard possible values | |
398 | my $charset = shift // ""; # name of character set for comment | |
399 | ||
d1907b94 YO |
400 | print " output_invmap($name,$prop_name) $charset\n" if DEBUG; |
401 | ||
99f21fb9 KW |
402 | # Output the inversion map $invmap for property $prop_name, but use $name |
403 | # as the actual data structure's name. | |
404 | ||
405 | my $count = @$invmap; | |
406 | ||
407 | my $output_format; | |
18230d9d KW |
408 | my $invmap_declaration_type; |
409 | my $enum_declaration_type; | |
410 | my $aux_declaration_type; | |
99f21fb9 KW |
411 | my %enums; |
412 | my $name_prefix; | |
413 | ||
18230d9d | 414 | if ($input_format =~ / ^ [as] l? $ /x) { |
cf2cd619 KW |
415 | $prop_name = (prop_aliases($prop_name))[1] |
416 | // $prop_name =~ s/^_Perl_//r; # Get full name | |
02f811dd | 417 | my $short_name = (prop_aliases($prop_name))[0] // $prop_name; |
226b74db | 418 | my @input_enums; |
f79a09fc | 419 | |
226b74db | 420 | # Find all the possible input values. These become the enum names |
34623dbb KW |
421 | # that comprise the inversion map. For inputs that don't have sub |
422 | # lists, we can just get the unique values. Otherwise, we have to | |
423 | # expand the sublists first. | |
18230d9d | 424 | if ($input_format !~ / ^ a /x) { |
563f8b93 | 425 | if ($input_format ne 'sl') { |
18072598 | 426 | @input_enums = sort caselessly uniques(@$invmap); |
563f8b93 KW |
427 | } |
428 | else { | |
429 | foreach my $element (@$invmap) { | |
430 | if (ref $element) { | |
431 | push @input_enums, @$element; | |
432 | } | |
433 | else { | |
434 | push @input_enums, $element; | |
435 | } | |
34623dbb | 436 | } |
18072598 | 437 | @input_enums = sort caselessly uniques(@input_enums); |
34623dbb | 438 | } |
18230d9d | 439 | } |
6b659339 | 440 | |
226b74db | 441 | # The internal enums come last, and in the order specified. |
2027d365 KW |
442 | # |
443 | # The internal one named EDGE is also used a marker. Any ones that | |
444 | # come after it are used in the algorithms below, and so must be | |
445 | # defined, even if the release of Unicode this is being compiled for | |
446 | # doesn't use them. But since no code points are assigned to them in | |
447 | # such a release, those values will never be accessed. We collapse | |
448 | # all of them into a single placholder row and a column. The | |
449 | # algorithms below will fill in those cells with essentially garbage, | |
450 | # but they are never read, so it doesn't matter. This allows the | |
451 | # algorithm to remain the same from release to release. | |
452 | # | |
453 | # In one case, regexec.c also uses a placeholder which must be defined | |
454 | # here, and we put it in the unused row and column as its value is | |
455 | # never read. | |
456 | # | |
226b74db | 457 | my @enums = @input_enums; |
27a619f7 | 458 | my @extras; |
2027d365 KW |
459 | my @unused_enums; |
460 | my $unused_enum_value = @enums; | |
27a619f7 KW |
461 | if ($extra_enums ne "") { |
462 | @extras = split /,/, $extra_enums; | |
2027d365 | 463 | my $seen_EDGE = 0; |
226b74db KW |
464 | |
465 | # Don't add if already there. | |
466 | foreach my $this_extra (@extras) { | |
467 | next if grep { $_ eq $this_extra } @enums; | |
2027d365 KW |
468 | if ($this_extra eq 'EDGE') { |
469 | push @enums, $this_extra; | |
470 | $seen_EDGE = 1; | |
471 | } | |
472 | elsif ($seen_EDGE) { | |
473 | push @unused_enums, $this_extra; | |
474 | } | |
475 | else { | |
476 | push @enums, $this_extra; | |
477 | } | |
226b74db | 478 | } |
2027d365 | 479 | |
18072598 | 480 | @unused_enums = sort caselessly @unused_enums; |
2027d365 KW |
481 | $unused_enum_value = @enums; # All unused have the same value, |
482 | # one beyond the final used one | |
27a619f7 | 483 | } |
289ce9cc | 484 | |
2c490b6a KW |
485 | # These properties have extra tables written out for them that we want |
486 | # to make as compact and legible as possible. So we find short names | |
487 | # for their property values. For non-official ones we will need to | |
488 | # add a legend at the top of the table to say what the abbreviation | |
489 | # stands for. | |
490 | my $property_needs_table_re = qr/ ^ _Perl_ (?: GCB | LB | WB ) $ /x; | |
491 | ||
492 | my %short_enum_name; | |
493 | my %need_explanation; # For non-official abbreviations, we will need | |
494 | # to explain what the one we come up with | |
495 | # stands for | |
496 | my $type = lc $prop_name; | |
497 | if ($name =~ $property_needs_table_re) { | |
498 | my @short_names; # List of already used abbreviations, so we | |
499 | # don't duplicate | |
500 | for my $enum (@enums) { | |
501 | my $short_enum; | |
502 | my $is_official_name = 0; | |
503 | ||
504 | # Special case this wb property value to make the | |
505 | # name more clear | |
506 | if ($enum eq 'Perl_Tailored_HSpace') { | |
507 | $short_enum = 'hs'; | |
508 | } | |
509 | else { | |
510 | ||
511 | # Use the official short name, if found. | |
512 | ($short_enum) = prop_value_aliases($type, $enum); | |
513 | if ( defined $short_enum) { | |
514 | $is_official_name = 1; | |
515 | } | |
516 | else { | |
517 | # But if there is no official name, use the name that | |
518 | # came from the data (if any). Otherwise, the name | |
519 | # had to come from the extras list. There are two | |
520 | # types of values in that list. | |
521 | # | |
522 | # First are those enums that are not part of the | |
523 | # property, but are defined by the code in this file. | |
524 | # By convention these have all-caps names. We use the | |
525 | # lowercased name for these. | |
526 | # | |
527 | # Second are enums that are needed to get the | |
528 | # algorithms below to work and/or to get regexec.c to | |
529 | # compile, but don't exist in all Unicode releases. | |
530 | # These are handled outside this loop as | |
531 | # 'unused_enums' (as they are unused they all get | |
532 | # collapsed into a single column, and their names | |
533 | # don't matter) | |
534 | if (grep { $_ eq $enum } @input_enums) { | |
535 | $short_enum = $enum | |
536 | } | |
537 | else { | |
538 | $short_enum = lc $enum; | |
539 | } | |
540 | } | |
541 | ||
542 | # If our short name is too long, or we already know that | |
543 | # the name is an abbreviation, truncate to make sure it's | |
544 | # short enough, and remember that we did this so we can | |
545 | # later add a comment in the generated file | |
546 | if (length $short_enum > $max_hdr_len) { | |
547 | # First try using just the uppercase letters of the name; | |
548 | # if it is something like FooBar, FB is a better | |
549 | # abbreviation than Foo. That's not the case if it is | |
550 | # entirely lowercase. | |
551 | my $uc = $short_enum; | |
552 | $uc =~ s/[[:^upper:]]//g; | |
553 | $short_enum = $uc if length $uc > 1 | |
554 | && length $uc < length $short_enum; | |
555 | ||
556 | $short_enum = substr($short_enum, 0, $max_hdr_len); | |
557 | $is_official_name = 0; | |
558 | } | |
559 | } | |
560 | ||
561 | # If the name we are to display conflicts, try another. | |
562 | if (grep { $_ eq $short_enum } @short_names) { | |
563 | $is_official_name = 0; | |
564 | do { # The increment operator on strings doesn't work on | |
565 | # those containing an '_', so get rid of any final | |
566 | # portion. | |
567 | $short_enum =~ s/_//g; | |
568 | $short_enum++; | |
569 | } while grep { $_ eq $short_enum } @short_names; | |
570 | } | |
571 | ||
572 | push @short_names, $short_enum; | |
573 | $short_enum_name{$enum} = $short_enum; | |
574 | $need_explanation{$enum} = $short_enum unless $is_official_name; | |
575 | } | |
576 | } # End of calculating short enum names for certain properties | |
577 | ||
226b74db KW |
578 | # Assign a value to each element of the enum type we are creating. |
579 | # The default value always gets 0; the others are arbitrarily | |
2c490b6a KW |
580 | # assigned, but for the properties which have the extra table, it is |
581 | # in the order we have computed above so the rows and columns appear | |
582 | # alphabetically by heading abbreviation. | |
27a619f7 KW |
583 | my $enum_val = 0; |
584 | my $canonical_default = prop_value_aliases($prop_name, $default); | |
585 | $default = $canonical_default if defined $canonical_default; | |
586 | $enums{$default} = $enum_val++; | |
226b74db | 587 | |
2c490b6a KW |
588 | for my $enum (sort { ($name =~ $property_needs_table_re) |
589 | ? lc $short_enum_name{$a} | |
590 | cmp lc $short_enum_name{$b} | |
591 | : lc $a cmp lc $b | |
592 | } @enums) | |
593 | { | |
27a619f7 KW |
594 | $enums{$enum} = $enum_val++ unless exists $enums{$enum}; |
595 | } | |
596 | ||
2c490b6a KW |
597 | # Now calculate the data for the special tables output for these |
598 | # properties. | |
599 | if ($name =~ $property_needs_table_re) { | |
27a619f7 | 600 | |
226b74db KW |
601 | # The data includes the hashes %gcb_enums, %lb_enums, etc. |
602 | # Similarly we calculate column headings for the tables. | |
603 | # | |
27a619f7 | 604 | # We use string evals to allow the same code to work on |
226b74db | 605 | # all the tables |
27a619f7 | 606 | |
27a619f7 KW |
607 | # Skip if we've already done this code, which populated |
608 | # this hash | |
609 | if (eval "! \%${type}_enums") { | |
610 | ||
226b74db | 611 | # For each enum in the type ... |
2c490b6a | 612 | foreach my $enum (keys %enums) { |
27a619f7 | 613 | my $value = $enums{$enum}; |
2c490b6a | 614 | my $short_enum = $short_enum_name{$enum}; |
27a619f7 KW |
615 | |
616 | # Remember the mapping from the property value | |
617 | # (enum) name to its value. | |
618 | eval "\$${type}_enums{$enum} = $value"; | |
619 | die $@ if $@; | |
620 | ||
621 | # Remember the inverse mapping to the short name | |
622 | # so that we can properly label the generated | |
623 | # table's rows and columns | |
2c490b6a | 624 | eval "\$${type}_short_enums[$value] = '$short_enum'"; |
27a619f7 | 625 | die $@ if $@; |
2c490b6a KW |
626 | |
627 | # And note the abbreviations that need explanation | |
628 | if ($need_explanation{$enum}) { | |
629 | eval "\$${type}_abbreviations{$short_enum} = '$enum'"; | |
630 | die $@ if $@; | |
631 | } | |
7e54b87f | 632 | } |
2027d365 KW |
633 | |
634 | # Each unused enum has the same value. They all are collapsed | |
635 | # into one row and one column, named $unused_table_hdr. | |
636 | if (@unused_enums) { | |
637 | eval "\$${type}_short_enums['$unused_enum_value'] = '$unused_table_hdr'"; | |
638 | die $@ if $@; | |
639 | ||
640 | foreach my $enum (@unused_enums) { | |
641 | eval "\$${type}_enums{$enum} = $unused_enum_value"; | |
642 | die $@ if $@; | |
643 | } | |
644 | } | |
99f21fb9 | 645 | } |
19a5f1d5 | 646 | } |
99f21fb9 | 647 | |
cf2cd619 KW |
648 | # The short property names tend to be two lower case letters, but it |
649 | # looks better for those if they are upper. XXX | |
19a5f1d5 | 650 | $short_name = uc($short_name) if length($short_name) < 3 |
cf2cd619 | 651 | || substr($short_name, 0, 1) =~ /[[:lower:]]/; |
19a5f1d5 | 652 | $name_prefix = "${short_name}_"; |
cdc243dd | 653 | |
226b74db | 654 | # Start the enum definition for this map |
f99e0590 | 655 | my @enum_definition; |
19a5f1d5 KW |
656 | my @enum_list; |
657 | foreach my $enum (keys %enums) { | |
658 | $enum_list[$enums{$enum}] = $enum; | |
99f21fb9 | 659 | } |
19a5f1d5 | 660 | foreach my $i (0 .. @enum_list - 1) { |
f99e0590 | 661 | push @enum_definition, ",\n" if $i > 0; |
34623dbb | 662 | |
19a5f1d5 | 663 | my $name = $enum_list[$i]; |
f99e0590 | 664 | push @enum_definition, "\t${name_prefix}$name = $i"; |
19a5f1d5 | 665 | } |
2027d365 KW |
666 | if (@unused_enums) { |
667 | foreach my $unused (@unused_enums) { | |
668 | push @enum_definition, | |
669 | ",\n\t${name_prefix}$unused = $unused_enum_value"; | |
670 | } | |
671 | } | |
34623dbb | 672 | |
18230d9d | 673 | # For an 'l' property, we need extra enums, because some of the |
34623dbb KW |
674 | # elements are lists. Each such distinct list is placed in its own |
675 | # auxiliary map table. Here, we go through the inversion map, and for | |
676 | # each distinct list found, create an enum value for it, numbered -1, | |
677 | # -2, .... | |
678 | my %multiples; | |
679 | my $aux_table_prefix = "AUX_TABLE_"; | |
18230d9d | 680 | if ($input_format =~ /l/) { |
34623dbb KW |
681 | foreach my $element (@$invmap) { |
682 | ||
683 | # A regular scalar is not one of the lists we're looking for | |
684 | # at this stage. | |
685 | next unless ref $element; | |
686 | ||
18230d9d KW |
687 | my $joined; |
688 | if ($input_format =~ /a/) { # These are already ordered | |
689 | $joined = join ",", @$element; | |
690 | } | |
691 | else { | |
18072598 | 692 | $joined = join ",", sort caselessly @$element; |
18230d9d | 693 | } |
34623dbb KW |
694 | my $already_found = exists $multiples{$joined}; |
695 | ||
696 | my $i; | |
697 | if ($already_found) { # Use any existing one | |
698 | $i = $multiples{$joined}; | |
699 | } | |
700 | else { # Otherwise increment to get a new table number | |
701 | $i = keys(%multiples) + 1; | |
702 | $multiples{$joined} = $i; | |
703 | } | |
704 | ||
705 | # This changes the inversion map for this entry to not be the | |
706 | # list | |
707 | $element = "use_$aux_table_prefix$i"; | |
708 | ||
709 | # And add to the enum values | |
710 | if (! $already_found) { | |
f99e0590 | 711 | push @enum_definition, ",\n\t${name_prefix}$element = -$i"; |
34623dbb KW |
712 | } |
713 | } | |
714 | } | |
715 | ||
18230d9d | 716 | $enum_declaration_type = "${name_prefix}enum"; |
f99e0590 | 717 | |
c454388e KW |
718 | # Finished with the enum definition. Inversion map stuff is used only |
719 | # by regexec or utf-8 (if it is for code points) , unless it is in the | |
720 | # enum exception list | |
721 | my $where = (exists $where_to_define_enums{$name}) | |
722 | ? $where_to_define_enums{$name} | |
723 | : ($input_format =~ /a/) | |
724 | ? 'PERL_IN_UTF8_C' | |
725 | : 'PERL_IN_REGEXEC_C'; | |
726 | ||
8ec55631 KW |
727 | if (! exists $public_enums{$name}) { |
728 | switch_pound_if($name, $where, $charset); | |
729 | } | |
730 | else { | |
731 | end_charset_pound_if; | |
732 | end_file_pound_if; | |
733 | start_charset_pound_if($charset, 1); | |
734 | } | |
c454388e KW |
735 | |
736 | # If the enum only contains one element, that is a dummy, default one | |
f99e0590 KW |
737 | if (scalar @enum_definition > 1) { |
738 | ||
739 | # Currently unneeded | |
740 | #print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", | |
741 | # ..scalar keys %enums, "\n"; | |
742 | ||
743 | if ($input_format =~ /l/) { | |
744 | print $out_fh | |
745 | "\n", | |
746 | "/* Negative enum values indicate the need to use an", | |
747 | " auxiliary table\n", | |
748 | " * consisting of the list of enums this one expands to.", | |
749 | " The absolute\n", | |
750 | " * values of the negative enums are indices into a table", | |
751 | " of the auxiliary\n", | |
752 | " * tables' addresses */"; | |
753 | } | |
754 | print $out_fh "\ntypedef enum {\n"; | |
755 | print $out_fh join "", @enum_definition; | |
756 | print $out_fh "\n"; | |
18230d9d | 757 | print $out_fh "} $enum_declaration_type;\n"; |
f99e0590 | 758 | } |
19a5f1d5 | 759 | |
8ec55631 | 760 | switch_pound_if($name, $where, $charset); |
d74e7480 | 761 | |
40d2776f KW |
762 | # The inversion lists here have to be UV because inversion lists are |
763 | # capable of storing any code point, and even though the the ones here | |
764 | # are only Unicode ones, which need just 21 bits, they are linked to | |
765 | # directly, rather than copied. The inversion map and aux tables also | |
766 | # only need be 21 bits, and so we can get away with declaring them | |
767 | # 32-bits to save a little space and memory (on some 64-bit | |
768 | # platforms), as they are copied. | |
18230d9d KW |
769 | $invmap_declaration_type = ($input_format =~ /s/) |
770 | ? $enum_declaration_type | |
40d2776f | 771 | : "I32"; |
18230d9d KW |
772 | $aux_declaration_type = ($input_format =~ /s/) |
773 | ? $enum_declaration_type | |
40d2776f | 774 | : "U32"; |
18230d9d | 775 | |
19a5f1d5 | 776 | $output_format = "${name_prefix}%s"; |
34623dbb KW |
777 | |
778 | # If there are auxiliary tables, output them. | |
779 | if (%multiples) { | |
780 | ||
781 | print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n"; | |
782 | ||
783 | # Invert keys and values | |
784 | my %inverted_mults; | |
785 | while (my ($key, $value) = each %multiples) { | |
786 | $inverted_mults{$value} = $key; | |
787 | } | |
788 | ||
789 | # Output them in sorted order | |
790 | my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults; | |
791 | ||
792 | # Keep track of how big each aux table is | |
793 | my @aux_counts; | |
794 | ||
795 | # Output each aux table. | |
796 | foreach my $table_number (@sorted_table_list) { | |
797 | my $table = $inverted_mults{$table_number}; | |
cef72199 | 798 | output_table_header($out_fh, |
cf2cd619 KW |
799 | $aux_declaration_type, |
800 | "$name_prefix$aux_table_prefix$table_number"); | |
34623dbb | 801 | |
cf2cd619 KW |
802 | # Earlier, we joined the elements of this table together with |
803 | # a comma | |
34623dbb KW |
804 | my @elements = split ",", $table; |
805 | ||
806 | $aux_counts[$table_number] = scalar @elements; | |
807 | for my $i (0 .. @elements - 1) { | |
808 | print $out_fh ",\n" if $i > 0; | |
18230d9d KW |
809 | if ($input_format =~ /a/) { |
810 | printf $out_fh "\t0x%X", $elements[$i]; | |
811 | } | |
812 | else { | |
813 | print $out_fh "\t${name_prefix}$elements[$i]"; | |
814 | } | |
34623dbb | 815 | } |
cef72199 KW |
816 | |
817 | print $out_fh "\n"; | |
818 | output_table_trailer(); | |
34623dbb KW |
819 | } |
820 | ||
821 | # Output the table that is indexed by the absolute value of the | |
822 | # aux table enum and contains pointers to the tables output just | |
823 | # above | |
cef72199 KW |
824 | output_table_header($out_fh, "$aux_declaration_type *", |
825 | "${name_prefix}${aux_table_prefix}ptrs"); | |
34623dbb KW |
826 | print $out_fh "\tNULL,\t/* Placeholder */\n"; |
827 | for my $i (1 .. @sorted_table_list) { | |
828 | print $out_fh ",\n" if $i > 1; | |
829 | print $out_fh "\t$name_prefix$aux_table_prefix$i"; | |
830 | } | |
cef72199 KW |
831 | print $out_fh "\n"; |
832 | output_table_trailer(); | |
34623dbb KW |
833 | |
834 | print $out_fh | |
835 | "\n/* Parallel table to the above, giving the number of elements" | |
836 | . " in each table\n * pointed to */\n"; | |
cef72199 KW |
837 | output_table_header($out_fh, "U8", |
838 | "${name_prefix}${aux_table_prefix}lengths"); | |
34623dbb KW |
839 | print $out_fh "\t0,\t/* Placeholder */\n"; |
840 | for my $i (1 .. @sorted_table_list) { | |
cf2cd619 KW |
841 | print $out_fh ",\n" if $i > 1; |
842 | print $out_fh | |
843 | "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */"; | |
34623dbb | 844 | } |
cef72199 KW |
845 | print $out_fh "\n"; |
846 | output_table_trailer(); | |
34623dbb | 847 | } # End of outputting the auxiliary and associated tables |
463b4a67 KW |
848 | |
849 | # The scx property used in regexec.c needs a specialized table which | |
850 | # is most convenient to output here, while the data structures set up | |
851 | # above are still extant. This table contains the code point that is | |
852 | # the zero digit of each script, indexed by script enum value. | |
853 | if (lc $short_name eq 'scx') { | |
854 | my @decimals_invlist = prop_invlist("Numeric_Type=Decimal"); | |
855 | my %script_zeros; | |
856 | ||
857 | # Find all the decimal digits. The 0 of each range is always the | |
858 | # 0th element, except in some early Unicode releases, so check for | |
859 | # that. | |
860 | for (my $i = 0; $i < @decimals_invlist; $i += 2) { | |
861 | my $code_point = $decimals_invlist[$i]; | |
a1c8344d | 862 | next if num(chr($code_point)) ne '0'; |
463b4a67 KW |
863 | |
864 | # Turn the scripts this zero is in into a list. | |
865 | my @scripts = split ",", | |
866 | charprop($code_point, "_Perl_SCX", '_perl_core_internal_ok'); | |
867 | $code_point = sprintf("0x%x", $code_point); | |
868 | ||
869 | foreach my $script (@scripts) { | |
870 | if (! exists $script_zeros{$script}) { | |
871 | $script_zeros{$script} = $code_point; | |
872 | } | |
873 | elsif (ref $script_zeros{$script}) { | |
874 | push $script_zeros{$script}->@*, $code_point; | |
875 | } | |
876 | else { # Turn into a list if this is the 2nd zero of the | |
877 | # script | |
878 | my $existing = $script_zeros{$script}; | |
879 | undef $script_zeros{$script}; | |
880 | push $script_zeros{$script}->@*, $existing, $code_point; | |
881 | } | |
882 | } | |
883 | } | |
884 | ||
885 | # @script_zeros contains the zero, sorted by the script's enum | |
886 | # value | |
887 | my @script_zeros; | |
888 | foreach my $script (keys %script_zeros) { | |
889 | my $enum_value = $enums{$script}; | |
890 | $script_zeros[$enum_value] = $script_zeros{$script}; | |
891 | } | |
892 | ||
893 | print $out_fh | |
894 | "\n/* This table, indexed by the script enum, gives the zero" | |
895 | . " code point for that\n * script; 0 if the script has multiple" | |
896 | . " digit sequences. Scripts without a\n * digit sequence use" | |
897 | . " ASCII [0-9], hence are marked '0' */\n"; | |
cef72199 | 898 | output_table_header($out_fh, "UV", "script_zeros"); |
463b4a67 KW |
899 | for my $i (0 .. @script_zeros - 1) { |
900 | my $code_point = $script_zeros[$i]; | |
901 | if (defined $code_point) { | |
902 | $code_point = " 0" if ref $code_point; | |
903 | print $out_fh "\t$code_point"; | |
904 | } | |
905 | elsif (lc $enum_list[$i] eq 'inherited') { | |
906 | print $out_fh "\t 0"; | |
907 | } | |
908 | else { # The only digits a script without its own set accepts | |
909 | # is [0-9] | |
910 | print $out_fh "\t'0'"; | |
911 | } | |
912 | print $out_fh "," if $i < @script_zeros - 1; | |
913 | print $out_fh "\t/* $enum_list[$i] */"; | |
914 | print $out_fh "\n"; | |
915 | } | |
cef72199 | 916 | output_table_trailer(); |
463b4a67 | 917 | } # End of special handling of scx |
99f21fb9 KW |
918 | } |
919 | else { | |
920 | die "'$input_format' invmap() format for '$prop_name' unimplemented"; | |
921 | } | |
922 | ||
923 | die "No inversion map for $prop_name" unless defined $invmap | |
924 | && ref $invmap eq 'ARRAY' | |
925 | && $count; | |
926 | ||
226b74db | 927 | # Now output the inversion map proper |
cef72199 KW |
928 | $charset = "for $charset" if $charset; |
929 | output_table_header($out_fh, $invmap_declaration_type, | |
930 | "${name}_invmap", | |
931 | $charset); | |
99f21fb9 KW |
932 | |
933 | # The main body are the scalars passed in to this routine. | |
934 | for my $i (0 .. $count - 1) { | |
935 | my $element = $invmap->[$i]; | |
02f811dd | 936 | my $full_element_name = prop_value_aliases($prop_name, $element); |
18230d9d KW |
937 | if ($input_format =~ /a/ && $element !~ /\D/) { |
938 | $element = ($element == 0) | |
939 | ? 0 | |
940 | : sprintf("0x%X", $element); | |
941 | } | |
942 | else { | |
02f811dd KW |
943 | $element = $full_element_name if defined $full_element_name; |
944 | $element = $name_prefix . $element; | |
18230d9d | 945 | } |
99f21fb9 KW |
946 | print $out_fh "\t$element"; |
947 | print $out_fh "," if $i < $count - 1; | |
948 | print $out_fh "\n"; | |
949 | } | |
cef72199 | 950 | output_table_trailer(); |
99f21fb9 KW |
951 | } |
952 | ||
5a7e5385 | 953 | sub mk_invlist_from_sorted_cp_list { |
a02047bf KW |
954 | |
955 | # Returns an inversion list constructed from the sorted input array of | |
956 | # code points | |
957 | ||
958 | my $list_ref = shift; | |
959 | ||
99f21fb9 KW |
960 | return unless @$list_ref; |
961 | ||
a02047bf KW |
962 | # Initialize to just the first element |
963 | my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); | |
964 | ||
965 | # For each succeeding element, if it extends the previous range, adjust | |
966 | # up, otherwise add it. | |
967 | for my $i (1 .. @$list_ref - 1) { | |
968 | if ($invlist[-1] == $list_ref->[$i]) { | |
969 | $invlist[-1]++; | |
970 | } | |
971 | else { | |
972 | push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; | |
973 | } | |
974 | } | |
975 | return @invlist; | |
976 | } | |
977 | ||
d1907b94 | 978 | print "Reading Case Folding rules.\n" if DEBUG; |
a02047bf KW |
979 | # Read in the Case Folding rules, and construct arrays of code points for the |
980 | # properties we need. | |
d2aadf62 | 981 | my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding"); |
a02047bf KW |
982 | die "Could not find inversion map for Case_Folding" unless defined $format; |
983 | die "Incorrect format '$format' for Case_Folding inversion map" | |
347b9066 KW |
984 | unless $format eq 'al' |
985 | || $format eq 'a'; | |
d1907b94 YO |
986 | print "Finished reading Case Folding rules.\n" if DEBUG; |
987 | ||
988 | ||
d2aadf62 KW |
989 | sub _Perl_IVCF { |
990 | ||
991 | # This creates a map of the inversion of case folding. i.e., given a | |
992 | # character, it gives all the other characters that fold to it. | |
993 | # | |
994 | # Inversion maps function kind of like a hash, with the inversion list | |
995 | # specifying the buckets (keys) and the inversion maps specifying the | |
996 | # contents of the corresponding bucket. Effectively this function just | |
997 | # swaps the keys and values of the case fold hash. But there are | |
998 | # complications. Most importantly, More than one character can each have | |
999 | # the same fold. This is solved by having a list of characters that fold | |
1000 | # to a given one. | |
1001 | ||
1002 | my %new; | |
1003 | ||
1004 | # Go through the inversion list. | |
1005 | for (my $i = 0; $i < @$cp_ref; $i++) { | |
1006 | ||
1007 | # Skip if nothing folds to this | |
1008 | next if $folds_ref->[$i] == 0; | |
1009 | ||
1010 | # This entry which is valid from here to up (but not including) the | |
1011 | # next entry is for the next $count characters, so that, for example, | |
1012 | # A-Z is represented by one entry. | |
1013 | my $cur_list = $cp_ref->[$i]; | |
1014 | my $count = $cp_ref->[$i+1] - $cur_list; | |
1015 | ||
1016 | # The fold of [$i] can be not just a single character, but a sequence | |
1017 | # of multiple ones. We deal with those here by just creating a string | |
1018 | # consisting of them. Otherwise, we use the single code point [$i] | |
1019 | # folds to. | |
1020 | my $cur_map = (ref $folds_ref->[$i]) | |
1021 | ? join "", map { chr } $folds_ref->[$i]->@* | |
1022 | : $folds_ref->[$i]; | |
1023 | ||
1024 | # Expand out this range | |
1025 | while ($count > 0) { | |
1026 | push @{$new{$cur_map}}, $cur_list; | |
1027 | ||
1028 | # A multiple-character fold is a string, and shouldn't need | |
1029 | # incrementing anyway | |
1030 | if (ref $folds_ref->[$i]) { | |
1031 | die sprintf("Case fold for %x is multiple chars; should have" | |
1032 | . " a count of 1, but instead it was $count", $count) | |
1033 | unless $count == 1; | |
1034 | } | |
1035 | else { | |
1036 | $cur_map++; | |
1037 | $cur_list++; | |
1038 | } | |
1039 | $count--; | |
1040 | } | |
1041 | } | |
1042 | ||
1043 | # Now go through and make some adjustments. We add synthetic entries for | |
59142b8b KW |
1044 | # three cases. |
1045 | # 1) If the fold of a Latin1-range character is above that range, some | |
1046 | # coding in regexec.c can be saved by creating a reverse map here. The | |
1047 | # impetus for this is that U+B5 (MICRO SIGN) folds to the Greek small | |
1048 | # mu (U+3BC). That fold isn't done at regex pattern compilation time | |
1049 | # if it means that the pattern would have to be translated into UTF-8, | |
1050 | # whose operation is slower. At run time, having this reverse | |
1051 | # translation eliminates some special cases in the code. | |
1052 | # 2) Two or more code points can fold to the same multiple character, | |
d2aadf62 KW |
1053 | # sequence, as U+FB05 and U+FB06 both fold to 'st'. This code is only |
1054 | # for single character folds, but FB05 and FB06 are single characters | |
1055 | # that are equivalent folded, so we add entries so that they are | |
1056 | # considered to fold to each other | |
59142b8b | 1057 | # 3) If two or more above-Latin1 code points fold to the same Latin1 range |
d2aadf62 KW |
1058 | # one, we also add entries so that they are considered to fold to each |
1059 | # other. This is so that under /aa or /l matching, where folding to | |
1060 | # their Latin1 range code point is illegal, they still can fold to each | |
1061 | # other. This situation happens in Unicode 3.0.1, but probably no | |
1062 | # other version. | |
1063 | foreach my $fold (keys %new) { | |
db95f459 | 1064 | my $folds_to_string = $fold =~ /\D/; |
d2aadf62 KW |
1065 | |
1066 | # If the bucket contains only one element, convert from an array to a | |
1067 | # scalar | |
1068 | if (scalar $new{$fold}->@* == 1) { | |
1069 | $new{$fold} = $new{$fold}[0]; | |
59142b8b KW |
1070 | |
1071 | # Handle case 1) above: if there were a Latin1 range code point | |
1072 | # whose fold is above that range, this creates an extra entry that | |
1073 | # maps the other direction, and would save some special case code. | |
1074 | # (The one current case of this is handled in the else clause | |
1075 | # below.) | |
1076 | $new{$new{$fold}} = $fold if $new{$fold} < 256 && $fold > 255; | |
d2aadf62 KW |
1077 | } |
1078 | else { | |
1079 | ||
59142b8b KW |
1080 | # Handle case 1) when there are multiple things that fold to an |
1081 | # above-Latin1 code point, at least one of which is in Latin1. | |
1082 | if (! $folds_to_string && $fold > 255) { | |
1083 | foreach my $cp ($new{$fold}->@*) { | |
1084 | if ($cp < 256) { | |
1085 | my @new_entry = grep { $_ != $cp } $new{$fold}->@*; | |
1086 | push @new_entry, $fold; | |
1087 | $new{$cp}->@* = @new_entry; | |
1088 | } | |
1089 | } | |
1090 | } | |
1091 | ||
d2aadf62 KW |
1092 | # Otherwise, sort numerically. This places the highest code point |
1093 | # in the list at the tail end. This is because Unicode keeps the | |
1094 | # lowercase code points as higher ordinals than the uppercase, at | |
1095 | # least for the ones that matter so far. These are synthetic | |
1096 | # entries, and we want to predictably have the lowercase (which is | |
1097 | # more likely to be what gets folded to) in the same corresponding | |
1098 | # position, so that other code can rely on that. If some new | |
1099 | # version of Unicode came along that violated this, we might have | |
1100 | # to change so that the sort is based on upper vs lower instead. | |
1101 | # (The lower-comes-after isn't true of native EBCDIC, but here we | |
1102 | # are dealing strictly with Unicode values). | |
1103 | @{$new{$fold}} = sort { $a <=> $b } $new{$fold}->@* | |
1104 | unless $folds_to_string; | |
1105 | # We will be working with a copy of this sorted entry. | |
1106 | my @source_list = $new{$fold}->@*; | |
1107 | if (! $folds_to_string) { | |
1108 | ||
1109 | # This handles situation 2) listed above, which only arises if | |
1110 | # what is being folded-to (the fold) is in the Latin1 range. | |
1111 | if ($fold > 255 ) { | |
1112 | undef @source_list; | |
1113 | } | |
1114 | else { | |
1115 | # And it only arises if there are two or more folders that | |
1116 | # fold to it above Latin1. We look at just those. | |
1117 | @source_list = grep { $_ > 255 } @source_list; | |
1118 | undef @source_list if @source_list == 1; | |
1119 | } | |
1120 | } | |
1121 | ||
1122 | # Here, we've found the items we want to set up synthetic folds | |
1123 | # for. Add entries so that each folds to each other. | |
1124 | foreach my $cp (@source_list) { | |
1125 | my @rest = grep { $cp != $_ } @source_list; | |
1126 | if (@rest == 1) { | |
1127 | $new{$cp} = $rest[0]; | |
1128 | } | |
1129 | else { | |
1130 | push @{$new{$cp}}, @rest; | |
1131 | } | |
1132 | } | |
1133 | } | |
1134 | ||
1135 | # We don't otherwise deal with multiple-character folds | |
1136 | delete $new{$fold} if $folds_to_string; | |
1137 | } | |
1138 | ||
1139 | ||
1140 | # Now we have a hash that is the inversion of the case fold property. | |
cb2d98ed KW |
1141 | # First find the maximum number of code points that fold to the same one. |
1142 | foreach my $fold_to (keys %new) { | |
1143 | if (ref $new{$fold_to}) { | |
1144 | my $folders_count = scalar @{$new{$fold_to}}; | |
1145 | $max_fold_froms = $folders_count if $folders_count > $max_fold_froms; | |
1146 | } | |
1147 | } | |
d2aadf62 | 1148 | |
cb2d98ed | 1149 | # Then convert the hash to an inversion map. |
d2aadf62 KW |
1150 | my @sorted_folds = sort { $a <=> $b } keys %new; |
1151 | my (@invlist, @invmap); | |
1152 | ||
1153 | # We know that nothing folds to the controls (whose ordinals start at 0). | |
1154 | # And the first real entries are the lowest in the hash. | |
1155 | push @invlist, 0, $sorted_folds[0]; | |
1156 | push @invmap, 0, $new{$sorted_folds[0]}; | |
1157 | ||
1158 | # Go through the remainder of the hash keys (which are the folded code | |
1159 | # points) | |
1160 | for (my $i = 1; $i < @sorted_folds; $i++) { | |
1161 | ||
1162 | # Get the current one, and the one prior to it. | |
1163 | my $fold = $sorted_folds[$i]; | |
1164 | my $prev_fold = $sorted_folds[$i-1]; | |
1165 | ||
1166 | # If the current one is not just 1 away from the prior one, we close | |
1167 | # out the range containing the previous fold, and know that the gap | |
1168 | # doesn't have anything that folds. | |
1169 | if ($fold - 1 != $prev_fold) { | |
1170 | push @invlist, $prev_fold + 1; | |
1171 | push @invmap, 0; | |
1172 | ||
1173 | # And start a new range | |
1174 | push @invlist, $fold; | |
1175 | push @invmap, $new{$fold}; | |
1176 | } | |
1177 | elsif ($new{$fold} - 1 != $new{$prev_fold}) { | |
1178 | ||
1179 | # Here the current fold is just 1 greater than the previous, but | |
1180 | # the new map isn't correspondingly 1 greater than the previous, | |
1181 | # the old range is ended, but since there is no gap, we don't have | |
1182 | # to insert anything else. | |
1183 | push @invlist, $fold; | |
1184 | push @invmap, $new{$fold}; | |
1185 | ||
1186 | } # else { Otherwise, this new entry just extends the previous } | |
1187 | ||
1188 | die "In IVCF: $invlist[-1] <= $invlist[-2]" | |
1189 | if $invlist[-1] <= $invlist[-2]; | |
1190 | } | |
1191 | ||
1192 | # And add an entry that indicates that everything above this, to infinity, | |
1193 | # does not have a case fold. | |
1194 | push @invlist, $sorted_folds[-1] + 1; | |
1195 | push @invmap, 0; | |
1196 | ||
9a9a3246 KW |
1197 | push @invlist, 0x110000; |
1198 | push @invmap, 0; | |
1199 | ||
d2aadf62 KW |
1200 | # All Unicode versions have some places where multiple code points map to |
1201 | # the same one, so the format always has an 'l' | |
1202 | return \@invlist, \@invmap, 'al', $default; | |
1203 | } | |
1204 | ||
99f21fb9 KW |
1205 | sub prop_name_for_cmp ($) { # Sort helper |
1206 | my $name = shift; | |
1207 | ||
1208 | # Returns the input lowercased, with non-alphas removed, as well as | |
1209 | # everything starting with a comma | |
1210 | ||
1211 | $name =~ s/,.*//; | |
1212 | $name =~ s/[[:^alpha:]]//g; | |
1213 | return lc $name; | |
1214 | } | |
1215 | ||
892d8259 | 1216 | sub UpperLatin1 { |
8843f0de KW |
1217 | my @return = mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); |
1218 | return \@return; | |
892d8259 KW |
1219 | } |
1220 | ||
a2aeff50 KW |
1221 | sub _Perl_CCC_non0_non230 { |
1222 | ||
1223 | # Create an inversion list of code points with non-zero canonical | |
1224 | # combining class that also don't have 230 as the class number. This is | |
1225 | # part of a Unicode Standard rule | |
1226 | ||
1227 | my @nonzeros = prop_invlist("ccc=0"); | |
1228 | shift @nonzeros; # Invert so is "ccc != 0" | |
1229 | ||
1230 | my @return; | |
1231 | ||
1232 | # Expand into list of code points, while excluding those with ccc == 230 | |
1233 | for (my $i = 0; $i < @nonzeros; $i += 2) { | |
1234 | my $upper = ($i + 1) < @nonzeros | |
1235 | ? $nonzeros[$i+1] - 1 # In range | |
1236 | : $Unicode::UCD::MAX_CP; # To infinity. | |
1237 | for my $j ($nonzeros[$i] .. $upper) { | |
1238 | my @ccc_names = prop_value_aliases("ccc", charprop($j, "ccc")); | |
1239 | ||
1240 | # Final element in @ccc_names will be all numeric | |
1241 | push @return, $j if $ccc_names[-1] != 230; | |
1242 | } | |
1243 | } | |
1244 | ||
1245 | @return = sort { $a <=> $b } @return; | |
1246 | @return = mk_invlist_from_sorted_cp_list(\@return); | |
1247 | return \@return; | |
1248 | } | |
1249 | ||
289ce9cc KW |
1250 | sub output_table_common { |
1251 | ||
1252 | # Common subroutine to actually output the generated rules table. | |
1253 | ||
1254 | my ($property, | |
1255 | $table_value_defines_ref, | |
1256 | $table_ref, | |
1257 | $names_ref, | |
1258 | $abbreviations_ref) = @_; | |
1259 | my $size = @$table_ref; | |
1260 | ||
1261 | # Output the #define list, sorted by numeric value | |
1262 | if ($table_value_defines_ref) { | |
1263 | my $max_name_length = 0; | |
1264 | my @defines; | |
1265 | ||
1266 | # Put in order, and at the same time find the longest name | |
1267 | while (my ($enum, $value) = each %$table_value_defines_ref) { | |
1268 | $defines[$value] = $enum; | |
1269 | ||
1270 | my $length = length $enum; | |
1271 | $max_name_length = $length if $length > $max_name_length; | |
1272 | } | |
1273 | ||
1274 | print $out_fh "\n"; | |
1275 | ||
1276 | # Output, so that the values are vertically aligned in a column after | |
1277 | # the longest name | |
1278 | foreach my $i (0 .. @defines - 1) { | |
1279 | next unless defined $defines[$i]; | |
1280 | printf $out_fh "#define %-*s %2d\n", | |
1281 | $max_name_length, | |
1282 | $defines[$i], | |
1283 | $i; | |
1284 | } | |
1285 | } | |
1286 | ||
1287 | my $column_width = 2; # We currently allow 2 digits for the number | |
1288 | ||
a57a2641 KW |
1289 | # Being above a U8 is not currently handled |
1290 | my $table_type = 'U8'; | |
289ce9cc KW |
1291 | |
1292 | # If a name is longer than the width set aside for a column, its column | |
1293 | # needs to have increased spacing so that the name doesn't get truncated | |
1294 | # nor run into an adjacent column | |
1295 | my @spacers; | |
1296 | ||
2027d365 KW |
1297 | # Is there a row and column for unused values in this release? |
1298 | my $has_unused = $names_ref->[$size-1] eq $unused_table_hdr; | |
289ce9cc KW |
1299 | |
1300 | for my $i (0 .. $size - 1) { | |
1301 | no warnings 'numeric'; | |
289ce9cc KW |
1302 | $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width); |
1303 | } | |
1304 | ||
cf2cd619 KW |
1305 | output_table_header($out_fh, $table_type, "${property}_table", undef, |
1306 | $size, $size); | |
289ce9cc KW |
1307 | |
1308 | # Calculate the column heading line | |
1309 | my $header_line = "/* " | |
1310 | . (" " x $max_hdr_len) # We let the row heading meld to | |
1311 | # the '*/' for those that are at | |
1312 | # the max | |
1313 | . " " x 3; # Space for '*/ ' | |
1314 | # Now each column | |
1315 | for my $i (0 .. $size - 1) { | |
1316 | $header_line .= sprintf "%s%*s", | |
1317 | $spacers[$i], | |
1318 | $column_width + 1, # 1 for the ',' | |
1319 | $names_ref->[$i]; | |
1320 | } | |
1321 | $header_line .= " */\n"; | |
1322 | ||
1323 | # If we have annotations, output it now. | |
2027d365 | 1324 | if ($has_unused || scalar %$abbreviations_ref) { |
289ce9cc | 1325 | my $text = ""; |
18072598 | 1326 | foreach my $abbr (sort caselessly keys %$abbreviations_ref) { |
289ce9cc KW |
1327 | $text .= "; " if $text; |
1328 | $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'"; | |
1329 | } | |
2027d365 KW |
1330 | if ($has_unused) { |
1331 | $text .= "; $unused_table_hdr stands for 'unused in this Unicode" | |
0ebd57bb | 1332 | . " release (and the data in its row and column are garbage)" |
289ce9cc KW |
1333 | } |
1334 | ||
1335 | my $indent = " " x 3; | |
1336 | $text = $indent . "/* $text */"; | |
1337 | ||
1338 | # Wrap the text so that it is no wider than the table, which the | |
1339 | # header line gives. | |
1340 | my $output_width = length $header_line; | |
1341 | while (length $text > $output_width) { | |
1342 | my $cur_line = substr($text, 0, $output_width); | |
1343 | ||
1344 | # Find the first blank back from the right end to wrap at. | |
1345 | for (my $i = $output_width -1; $i > 0; $i--) { | |
1346 | if (substr($text, $i, 1) eq " ") { | |
1347 | print $out_fh substr($text, 0, $i), "\n"; | |
1348 | ||
1349 | # Set so will look at just the remaining tail (which will | |
1350 | # be indented and have a '*' after the indent | |
1351 | $text = $indent . " * " . substr($text, $i + 1); | |
1352 | last; | |
1353 | } | |
1354 | } | |
1355 | } | |
1356 | ||
1357 | # And any remaining | |
1358 | print $out_fh $text, "\n" if $text; | |
1359 | } | |
1360 | ||
1361 | # We calculated the header line earlier just to get its width so that we | |
1362 | # could make sure the annotations fit into that. | |
1363 | print $out_fh $header_line; | |
1364 | ||
1365 | # Now output the bulk of the table. | |
1366 | for my $i (0 .. $size - 1) { | |
1367 | ||
1368 | # First the row heading. | |
1369 | printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i]; | |
1370 | print $out_fh "{"; # Then the brace for this row | |
1371 | ||
1372 | # Then each column | |
1373 | for my $j (0 .. $size -1) { | |
1374 | print $out_fh $spacers[$j]; | |
1375 | printf $out_fh "%*d", $column_width, $table_ref->[$i][$j]; | |
1376 | print $out_fh "," if $j < $size - 1; | |
1377 | } | |
1378 | print $out_fh " }"; | |
1379 | print $out_fh "," if $i < $size - 1; | |
1380 | print $out_fh "\n"; | |
1381 | } | |
1382 | ||
cef72199 | 1383 | output_table_trailer(); |
289ce9cc KW |
1384 | } |
1385 | ||
973a28ed KW |
1386 | sub output_GCB_table() { |
1387 | ||
1388 | # Create and output the pair table for use in determining Grapheme Cluster | |
1389 | # Breaks, given in http://www.unicode.org/reports/tr29/. | |
b0e24409 KW |
1390 | my %gcb_actions = ( |
1391 | GCB_NOBREAK => 0, | |
1392 | GCB_BREAKABLE => 1, | |
1393 | GCB_RI_then_RI => 2, # Rules 12 and 13 | |
1394 | GCB_EX_then_EM => 3, # Rule 10 | |
c0734505 | 1395 | GCB_Maybe_Emoji_NonBreak => 4, |
b0e24409 | 1396 | ); |
973a28ed KW |
1397 | |
1398 | # The table is constructed in reverse order of the rules, to make the | |
1399 | # lower-numbered, higher priority ones override the later ones, as the | |
1400 | # algorithm stops at the earliest matching rule | |
1401 | ||
1402 | my @gcb_table; | |
1403 | my $table_size = @gcb_short_enums; | |
1404 | ||
1405 | # Otherwise, break everywhere. | |
b0e24409 | 1406 | # GB99 Any ÷ Any |
973a28ed KW |
1407 | for my $i (0 .. $table_size - 1) { |
1408 | for my $j (0 .. $table_size - 1) { | |
1409 | $gcb_table[$i][$j] = 1; | |
1410 | } | |
1411 | } | |
1412 | ||
b0e24409 KW |
1413 | # Do not break within emoji flag sequences. That is, do not break between |
1414 | # regional indicator (RI) symbols if there is an odd number of RI | |
1415 | # characters before the break point. Must be resolved in runtime code. | |
1416 | # | |
c492f156 | 1417 | # GB12 sot (RI RI)* RI × RI |
b0e24409 KW |
1418 | # GB13 [^RI] (RI RI)* RI × RI |
1419 | $gcb_table[$gcb_enums{'Regional_Indicator'}] | |
1420 | [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI}; | |
1421 | ||
c0734505 KW |
1422 | # Post 11.0: GB11 \p{Extended_Pictographic} Extend* ZWJ |
1423 | # × \p{Extended_Pictographic} | |
a9256a75 | 1424 | $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'ExtPict_XX'}] = |
c0734505 KW |
1425 | $gcb_actions{GCB_Maybe_Emoji_NonBreak}; |
1426 | ||
1427 | # This and the rule GB10 obsolete starting with Unicode 11.0, can be left | |
1428 | # in as there are no code points that match, so the code won't ever get | |
1429 | # executed. | |
b0e24409 | 1430 | # Do not break within emoji modifier sequences or emoji zwj sequences. |
c0734505 | 1431 | # Pre 11.0: GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ ) |
b0e24409 KW |
1432 | $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0; |
1433 | $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0; | |
1434 | ||
1435 | # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier | |
1436 | $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}] | |
1437 | = $gcb_actions{GCB_EX_then_EM}; | |
1438 | $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0; | |
1439 | $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0; | |
1440 | ||
1441 | # Do not break before extending characters or ZWJ. | |
973a28ed | 1442 | # Do not break before SpacingMarks, or after Prepend characters. |
973a28ed | 1443 | # GB9b Prepend × |
b0e24409 KW |
1444 | # GB9a × SpacingMark |
1445 | # GB9 × ( Extend | ZWJ ) | |
973a28ed | 1446 | for my $i (0 .. @gcb_table - 1) { |
289ce9cc | 1447 | $gcb_table[$gcb_enums{'Prepend'}][$i] = 0; |
b0e24409 KW |
1448 | $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0; |
1449 | $gcb_table[$i][$gcb_enums{'Extend'}] = 0; | |
1450 | $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0; | |
973a28ed KW |
1451 | } |
1452 | ||
973a28ed KW |
1453 | # Do not break Hangul syllable sequences. |
1454 | # GB8 ( LVT | T) × T | |
1455 | $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0; | |
1456 | $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0; | |
1457 | ||
1458 | # GB7 ( LV | V ) × ( V | T ) | |
1459 | $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0; | |
1460 | $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0; | |
1461 | $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0; | |
1462 | $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0; | |
1463 | ||
1464 | # GB6 L × ( L | V | LV | LVT ) | |
1465 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0; | |
1466 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0; | |
1467 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0; | |
1468 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0; | |
1469 | ||
289ce9cc KW |
1470 | # Do not break between a CR and LF. Otherwise, break before and after |
1471 | # controls. | |
973a28ed KW |
1472 | # GB5 ÷ ( Control | CR | LF ) |
1473 | # GB4 ( Control | CR | LF ) ÷ | |
1474 | for my $i (0 .. @gcb_table - 1) { | |
289ce9cc | 1475 | $gcb_table[$i][$gcb_enums{'Control'}] = 1; |
973a28ed KW |
1476 | $gcb_table[$i][$gcb_enums{'CR'}] = 1; |
1477 | $gcb_table[$i][$gcb_enums{'LF'}] = 1; | |
289ce9cc | 1478 | $gcb_table[$gcb_enums{'Control'}][$i] = 1; |
973a28ed KW |
1479 | $gcb_table[$gcb_enums{'CR'}][$i] = 1; |
1480 | $gcb_table[$gcb_enums{'LF'}][$i] = 1; | |
1481 | } | |
1482 | ||
1483 | # GB3 CR × LF | |
1484 | $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0; | |
1485 | ||
b0e24409 | 1486 | # Break at the start and end of text, unless the text is empty |
973a28ed KW |
1487 | # GB1 sot ÷ |
1488 | # GB2 ÷ eot | |
1489 | for my $i (0 .. @gcb_table - 1) { | |
289ce9cc KW |
1490 | $gcb_table[$i][$gcb_enums{'EDGE'}] = 1; |
1491 | $gcb_table[$gcb_enums{'EDGE'}][$i] = 1; | |
973a28ed | 1492 | } |
289ce9cc | 1493 | $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0; |
973a28ed | 1494 | |
b0e24409 | 1495 | output_table_common('GCB', \%gcb_actions, |
289ce9cc | 1496 | \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations); |
973a28ed KW |
1497 | } |
1498 | ||
6b659339 KW |
1499 | sub output_LB_table() { |
1500 | ||
1501 | # Create and output the enums, #defines, and pair table for use in | |
1502 | # determining Line Breaks. This uses the default line break algorithm, | |
1503 | # given in http://www.unicode.org/reports/tr14/, but tailored by example 7 | |
1504 | # in that page, as the Unicode-furnished tests assume that tailoring. | |
1505 | ||
6b659339 KW |
1506 | # The result is really just true or false. But we follow along with tr14, |
1507 | # creating a rule which is false for something like X SP* X. That gets | |
1508 | # encoding 2. The rest of the actions are synthetic ones that indicate | |
1509 | # some context handling is required. These each are added to the | |
1510 | # underlying 0, 1, or 2, instead of replacing them, so that the underlying | |
1511 | # value can be retrieved. Actually only rules from 7 through 18 (which | |
1512 | # are the ones where space matter) are possible to have 2 added to them. | |
1513 | # The others below add just 0 or 1. It might be possible for one | |
1514 | # synthetic rule to be added to another, yielding a larger value. This | |
1515 | # doesn't happen in the Unicode 8.0 rule set, and as you can see from the | |
1516 | # names of the middle grouping below, it is impossible for that to occur | |
1517 | # for them because they all start with mutually exclusive classes. That | |
1518 | # the final rule can't be added to any of the others isn't obvious from | |
1519 | # its name, so it is assigned a power of 2 higher than the others can get | |
1520 | # to so any addition would preserve all data. (And the code will reach an | |
1521 | # assert(0) on debugging builds should this happen.) | |
1522 | my %lb_actions = ( | |
1523 | LB_NOBREAK => 0, | |
1524 | LB_BREAKABLE => 1, | |
1525 | LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2, | |
1526 | ||
b0e24409 | 1527 | LB_CM_ZWJ_foo => 3, # Rule 9 |
6b659339 KW |
1528 | LB_SP_foo => 6, # Rule 18 |
1529 | LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25 | |
1530 | LB_SY_or_IS_then_various => 11, # Rule 25 | |
1531 | LB_HY_or_BA_then_foo => 13, # Rule 21 | |
b0e24409 | 1532 | LB_RI_then_RI => 15, # Rule 30a |
6b659339 | 1533 | |
b0e24409 | 1534 | LB_various_then_PO_or_PR => (1<<5), # Rule 25 |
6b659339 KW |
1535 | ); |
1536 | ||
6b659339 KW |
1537 | # Construct the LB pair table. This is based on the rules in |
1538 | # http://www.unicode.org/reports/tr14/, but modified as those rules are | |
1539 | # designed for someone taking a string of text and sequentially going | |
1540 | # through it to find the break opportunities, whereas, Perl requires | |
1541 | # determining if a given random spot is a break opportunity, without | |
1542 | # knowing all the entire string before it. | |
1543 | # | |
1544 | # The table is constructed in reverse order of the rules, to make the | |
1545 | # lower-numbered, higher priority ones override the later ones, as the | |
1546 | # algorithm stops at the earliest matching rule | |
1547 | ||
1548 | my @lb_table; | |
1549 | my $table_size = @lb_short_enums; | |
1550 | ||
1551 | # LB31. Break everywhere else | |
1552 | for my $i (0 .. $table_size - 1) { | |
1553 | for my $j (0 .. $table_size - 1) { | |
1554 | $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'}; | |
1555 | } | |
1556 | } | |
1557 | ||
2f1eff3d UC |
1558 | # LB30b Do not break between an emoji base (or potential emoji) and an |
1559 | # emoji modifier. | |
1560 | ||
b0e24409 | 1561 | # EB × EM |
2f1eff3d | 1562 | # [\p{Extended_Pictographic}&\p{Cn}] × EM |
b0e24409 KW |
1563 | $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}] |
1564 | = $lb_actions{'LB_NOBREAK'}; | |
2f1eff3d UC |
1565 | $lb_table[$lb_enums{'Unassigned_Extended_Pictographic_Ideographic'}] |
1566 | [$lb_enums{'E_Modifier'}] = $lb_actions{'LB_NOBREAK'}; | |
b0e24409 KW |
1567 | |
1568 | # LB30a Break between two regional indicator symbols if and only if there | |
1569 | # are an even number of regional indicators preceding the position of the | |
1570 | # break. | |
1571 | # sot (RI RI)* RI × RI | |
1572 | # [^RI] (RI RI)* RI × RI | |
289ce9cc | 1573 | $lb_table[$lb_enums{'Regional_Indicator'}] |
b0e24409 | 1574 | [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'}; |
6b659339 KW |
1575 | |
1576 | # LB30 Do not break between letters, numbers, or ordinary symbols and | |
b6dbf1d3 UC |
1577 | # non-East-Asian opening punctuation nor non-East-Asian closing |
1578 | # parentheses. | |
1579 | ||
1580 | # (AL | HL | NU) × [OP-[\p{ea=F}\p{ea=W}\p{ea=H}]] | |
5c7c2de1 KW |
1581 | # (what we call CP and OP here have already been modified by mktables to |
1582 | # exclude the ea items | |
289ce9cc KW |
1583 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}] |
1584 | = $lb_actions{'LB_NOBREAK'}; | |
1585 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}] | |
1586 | = $lb_actions{'LB_NOBREAK'}; | |
1587 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}] | |
1588 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1589 | |
b6dbf1d3 | 1590 | # [CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] × (AL | HL | NU) |
289ce9cc KW |
1591 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}] |
1592 | = $lb_actions{'LB_NOBREAK'}; | |
1593 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}] | |
1594 | = $lb_actions{'LB_NOBREAK'}; | |
1595 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}] | |
1596 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1597 | |
1598 | # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). | |
1599 | # IS × (AL | HL) | |
289ce9cc KW |
1600 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}] |
1601 | = $lb_actions{'LB_NOBREAK'}; | |
1602 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1603 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1604 | |
1605 | # LB28 Do not break between alphabetics (“at”). | |
1606 | # (AL | HL) × (AL | HL) | |
289ce9cc KW |
1607 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}] |
1608 | = $lb_actions{'LB_NOBREAK'}; | |
1609 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}] | |
1610 | = $lb_actions{'LB_NOBREAK'}; | |
1611 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}] | |
1612 | = $lb_actions{'LB_NOBREAK'}; | |
1613 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}] | |
1614 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1615 | |
1616 | # LB27 Treat a Korean Syllable Block the same as ID. | |
6b659339 | 1617 | # (JL | JV | JT | H2 | H3) × PO |
289ce9cc KW |
1618 | $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}] |
1619 | = $lb_actions{'LB_NOBREAK'}; | |
1620 | $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}] | |
1621 | = $lb_actions{'LB_NOBREAK'}; | |
1622 | $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}] | |
1623 | = $lb_actions{'LB_NOBREAK'}; | |
1624 | $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}] | |
1625 | = $lb_actions{'LB_NOBREAK'}; | |
1626 | $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}] | |
1627 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1628 | |
1629 | # PR × (JL | JV | JT | H2 | H3) | |
289ce9cc KW |
1630 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}] |
1631 | = $lb_actions{'LB_NOBREAK'}; | |
1632 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}] | |
1633 | = $lb_actions{'LB_NOBREAK'}; | |
1634 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}] | |
1635 | = $lb_actions{'LB_NOBREAK'}; | |
1636 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}] | |
1637 | = $lb_actions{'LB_NOBREAK'}; | |
1638 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}] | |
1639 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1640 | |
1641 | # LB26 Do not break a Korean syllable. | |
1642 | # JL × (JL | JV | H2 | H3) | |
1643 | $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'}; | |
1644 | $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
1645 | $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'}; | |
1646 | $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'}; | |
1647 | ||
1648 | # (JV | H2) × (JV | JT) | |
1649 | $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
1650 | $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
1651 | $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
1652 | $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
1653 | ||
1654 | # (JT | H3) × JT | |
1655 | $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
1656 | $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
1657 | ||
1658 | # LB25 Do not break between the following pairs of classes relevant to | |
1659 | # numbers, as tailored by example 7 in | |
1660 | # http://www.unicode.org/reports/tr14/#Examples | |
1661 | # We follow that tailoring because Unicode's test cases expect it | |
1662 | # (PR | PO) × ( OP | HY )? NU | |
289ce9cc KW |
1663 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}] |
1664 | = $lb_actions{'LB_NOBREAK'}; | |
1665 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}] | |
1666 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1667 | |
1668 | # Given that (OP | HY )? is optional, we have to test for it in code. | |
1669 | # We add in the action (instead of overriding) for this, so that in | |
1670 | # the code we can recover the underlying break value. | |
289ce9cc | 1671 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}] |
6b659339 | 1672 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
b6dbf1d3 UC |
1673 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'East_Asian_OP'}] |
1674 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; | |
289ce9cc | 1675 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}] |
6b659339 | 1676 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
289ce9cc | 1677 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}] |
6b659339 | 1678 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
289ce9cc | 1679 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}] |
6b659339 KW |
1680 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
1681 | ||
1682 | # ( OP | HY ) × NU | |
289ce9cc KW |
1683 | $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}] |
1684 | = $lb_actions{'LB_NOBREAK'}; | |
b6dbf1d3 UC |
1685 | $lb_table[$lb_enums{'East_Asian_OP'}][$lb_enums{'Numeric'}] |
1686 | = $lb_actions{'LB_NOBREAK'}; | |
289ce9cc KW |
1687 | $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}] |
1688 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1689 | |
1690 | # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP ) | |
1691 | # which can be rewritten as: | |
1692 | # NU (SY | IS)* × (NU | SY | IS | CL | CP ) | |
289ce9cc KW |
1693 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}] |
1694 | = $lb_actions{'LB_NOBREAK'}; | |
1695 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}] | |
1696 | = $lb_actions{'LB_NOBREAK'}; | |
1697 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}] | |
1698 | = $lb_actions{'LB_NOBREAK'}; | |
1699 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}] | |
1700 | = $lb_actions{'LB_NOBREAK'}; | |
1701 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}] | |
1702 | = $lb_actions{'LB_NOBREAK'}; | |
b6dbf1d3 UC |
1703 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'East_Asian_CP'}] |
1704 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1705 | |
1706 | # Like earlier where we have to test in code, we add in the action so | |
1707 | # that we can recover the underlying values. This is done in rules | |
1708 | # below, as well. The code assumes that we haven't added 2 actions. | |
1709 | # Shoul a later Unicode release break that assumption, then tests | |
1710 | # should start failing. | |
289ce9cc | 1711 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}] |
6b659339 | 1712 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1713 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}] |
6b659339 | 1714 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1715 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1716 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1717 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1718 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1719 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}] |
6b659339 | 1720 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
b6dbf1d3 UC |
1721 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'East_Asian_CP'}] |
1722 | += $lb_actions{'LB_SY_or_IS_then_various'}; | |
289ce9cc | 1723 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}] |
6b659339 | 1724 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1725 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}] |
6b659339 | 1726 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1727 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1728 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1729 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1730 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1731 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}] |
6b659339 | 1732 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
b6dbf1d3 UC |
1733 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'East_Asian_CP'}] |
1734 | += $lb_actions{'LB_SY_or_IS_then_various'}; | |
6b659339 KW |
1735 | |
1736 | # NU (NU | SY | IS)* (CL | CP)? × (PO | PR) | |
1737 | # which can be rewritten as: | |
1738 | # NU (SY | IS)* (CL | CP)? × (PO | PR) | |
289ce9cc KW |
1739 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}] |
1740 | = $lb_actions{'LB_NOBREAK'}; | |
1741 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}] | |
1742 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1743 | |
289ce9cc | 1744 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1745 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
b6dbf1d3 UC |
1746 | $lb_table[$lb_enums{'East_Asian_CP'}][$lb_enums{'Postfix_Numeric'}] |
1747 | += $lb_actions{'LB_various_then_PO_or_PR'}; | |
289ce9cc | 1748 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1749 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1750 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1751 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1752 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 KW |
1753 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
1754 | ||
289ce9cc | 1755 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1756 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
b6dbf1d3 UC |
1757 | $lb_table[$lb_enums{'East_Asian_CP'}][$lb_enums{'Prefix_Numeric'}] |
1758 | += $lb_actions{'LB_various_then_PO_or_PR'}; | |
289ce9cc | 1759 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1760 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1761 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1762 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1763 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 KW |
1764 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
1765 | ||
b0e24409 KW |
1766 | # LB24 Do not break between numeric prefix/postfix and letters, or between |
1767 | # letters and prefix/postfix. | |
1768 | # (PR | PO) × (AL | HL) | |
289ce9cc KW |
1769 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}] |
1770 | = $lb_actions{'LB_NOBREAK'}; | |
1771 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1772 | = $lb_actions{'LB_NOBREAK'}; | |
289ce9cc KW |
1773 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}] |
1774 | = $lb_actions{'LB_NOBREAK'}; | |
1775 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1776 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1777 | |
b0e24409 KW |
1778 | # (AL | HL) × (PR | PO) |
1779 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}] | |
1780 | = $lb_actions{'LB_NOBREAK'}; | |
1781 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}] | |
1782 | = $lb_actions{'LB_NOBREAK'}; | |
1783 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}] | |
1784 | = $lb_actions{'LB_NOBREAK'}; | |
1785 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}] | |
1786 | = $lb_actions{'LB_NOBREAK'}; | |
1787 | ||
1788 | # LB23a Do not break between numeric prefixes and ideographs, or between | |
1789 | # ideographs and numeric postfixes. | |
1790 | # PR × (ID | EB | EM) | |
1791 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}] | |
1792 | = $lb_actions{'LB_NOBREAK'}; | |
de524f25 KW |
1793 | $lb_table[$lb_enums{'Prefix_Numeric'}] |
1794 | [$lb_enums{'Unassigned_Extended_Pictographic_Ideographic'}] | |
1795 | = $lb_actions{'LB_NOBREAK'}; | |
b0e24409 KW |
1796 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}] |
1797 | = $lb_actions{'LB_NOBREAK'}; | |
1798 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}] | |
1799 | = $lb_actions{'LB_NOBREAK'}; | |
1800 | ||
1801 | # (ID | EB | EM) × PO | |
289ce9cc KW |
1802 | $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}] |
1803 | = $lb_actions{'LB_NOBREAK'}; | |
de524f25 KW |
1804 | $lb_table[$lb_enums{'Unassigned_Extended_Pictographic_Ideographic'}] |
1805 | [$lb_enums{'Postfix_Numeric'}] = $lb_actions{'LB_NOBREAK'}; | |
b0e24409 KW |
1806 | $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}] |
1807 | = $lb_actions{'LB_NOBREAK'}; | |
1808 | $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}] | |
1809 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1810 | |
b0e24409 | 1811 | # LB23 Do not break between digits and letters |
6b659339 | 1812 | # (AL | HL) × NU |
289ce9cc KW |
1813 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}] |
1814 | = $lb_actions{'LB_NOBREAK'}; | |
1815 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}] | |
1816 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1817 | |
1818 | # NU × (AL | HL) | |
289ce9cc KW |
1819 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}] |
1820 | = $lb_actions{'LB_NOBREAK'}; | |
1821 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1822 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1823 | |
b6dbf1d3 UC |
1824 | # LB22 Do not break before ellipses |
1825 | for my $i (0 .. @lb_table - 1) { | |
1826 | $lb_table[$i][$lb_enums{'Inseparable'}] = $lb_actions{'LB_NOBREAK'}; | |
1827 | } | |
6b659339 KW |
1828 | |
1829 | # LB21b Don’t break between Solidus and Hebrew letters. | |
1830 | # SY × HL | |
289ce9cc KW |
1831 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}] |
1832 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1833 | |
1834 | # LB21a Don't break after Hebrew + Hyphen. | |
1835 | # HL (HY | BA) × | |
1836 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1837 | $lb_table[$lb_enums{'Hyphen'}][$i] |
1838 | += $lb_actions{'LB_HY_or_BA_then_foo'}; | |
1839 | $lb_table[$lb_enums{'Break_After'}][$i] | |
1840 | += $lb_actions{'LB_HY_or_BA_then_foo'}; | |
6b659339 KW |
1841 | } |
1842 | ||
1843 | # LB21 Do not break before hyphen-minus, other hyphens, fixed-width | |
1844 | # spaces, small kana, and other non-starters, or after acute accents. | |
1845 | # × BA | |
1846 | # × HY | |
1847 | # × NS | |
1848 | # BB × | |
1849 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1850 | $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'}; |
1851 | $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'}; | |
1852 | $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'}; | |
1853 | $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1854 | } |
1855 | ||
1856 | # LB20 Break before and after unresolved CB. | |
1857 | # ÷ CB | |
1858 | # CB ÷ | |
1859 | # Conditional breaks should be resolved external to the line breaking | |
1860 | # rules. However, the default action is to treat unresolved CB as breaking | |
1861 | # before and after. | |
1862 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1863 | $lb_table[$i][$lb_enums{'Contingent_Break'}] |
1864 | = $lb_actions{'LB_BREAKABLE'}; | |
1865 | $lb_table[$lb_enums{'Contingent_Break'}][$i] | |
1866 | = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 KW |
1867 | } |
1868 | ||
1869 | # LB19 Do not break before or after quotation marks, such as ‘ ” ’. | |
1870 | # × QU | |
1871 | # QU × | |
1872 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1873 | $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'}; |
1874 | $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1875 | } |
1876 | ||
1877 | # LB18 Break after spaces | |
1878 | # SP ÷ | |
1879 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1880 | $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'}; |
6b659339 KW |
1881 | } |
1882 | ||
1883 | # LB17 Do not break within ‘——’, even with intervening spaces. | |
1884 | # B2 SP* × B2 | |
289ce9cc | 1885 | $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}] |
6b659339 KW |
1886 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1887 | ||
1888 | # LB16 Do not break between closing punctuation and a nonstarter even with | |
1889 | # intervening spaces. | |
1890 | # (CL | CP) SP* × NS | |
289ce9cc | 1891 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}] |
6b659339 | 1892 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1893 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}] |
6b659339 | 1894 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
b6dbf1d3 UC |
1895 | $lb_table[$lb_enums{'East_Asian_CP'}][$lb_enums{'Nonstarter'}] |
1896 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; | |
6b659339 KW |
1897 | |
1898 | ||
1899 | # LB15 Do not break within ‘”[’, even with intervening spaces. | |
1900 | # QU SP* × OP | |
289ce9cc | 1901 | $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}] |
6b659339 | 1902 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
b6dbf1d3 UC |
1903 | $lb_table[$lb_enums{'Quotation'}][$lb_enums{'East_Asian_OP'}] |
1904 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; | |
6b659339 KW |
1905 | |
1906 | # LB14 Do not break after ‘[’, even after spaces. | |
1907 | # OP SP* × | |
1908 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1909 | $lb_table[$lb_enums{'Open_Punctuation'}][$i] |
6b659339 | 1910 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
b6dbf1d3 UC |
1911 | $lb_table[$lb_enums{'East_Asian_OP'}][$i] |
1912 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; | |
6b659339 KW |
1913 | } |
1914 | ||
1915 | # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as | |
1916 | # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples | |
1917 | # [^NU] × CL | |
1918 | # [^NU] × CP | |
1919 | # × EX | |
1920 | # [^NU] × IS | |
1921 | # [^NU] × SY | |
1922 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1923 | $lb_table[$i][$lb_enums{'Exclamation'}] |
6b659339 KW |
1924 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1925 | ||
289ce9cc | 1926 | next if $i == $lb_enums{'Numeric'}; |
6b659339 | 1927 | |
289ce9cc | 1928 | $lb_table[$i][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1929 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1930 | $lb_table[$i][$lb_enums{'Close_Parenthesis'}] |
6b659339 | 1931 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
b6dbf1d3 UC |
1932 | $lb_table[$i][$lb_enums{'East_Asian_CP'}] |
1933 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; | |
289ce9cc | 1934 | $lb_table[$i][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1935 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1936 | $lb_table[$i][$lb_enums{'Break_Symbols'}] |
6b659339 KW |
1937 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1938 | } | |
1939 | ||
1940 | # LB12a Do not break before NBSP and related characters, except after | |
1941 | # spaces and hyphens. | |
1942 | # [^SP BA HY] × GL | |
1943 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1944 | next if $i == $lb_enums{'Space'} |
1945 | || $i == $lb_enums{'Break_After'} | |
1946 | || $i == $lb_enums{'Hyphen'}; | |
6b659339 KW |
1947 | |
1948 | # We don't break, but if a property above has said don't break even | |
1949 | # with space between, don't override that (also in the next few rules) | |
289ce9cc | 1950 | next if $lb_table[$i][$lb_enums{'Glue'}] |
6b659339 | 1951 | == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1952 | $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1953 | } |
1954 | ||
1955 | # LB12 Do not break after NBSP and related characters. | |
1956 | # GL × | |
1957 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1958 | next if $lb_table[$lb_enums{'Glue'}][$i] |
6b659339 | 1959 | == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1960 | $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1961 | } |
1962 | ||
1963 | # LB11 Do not break before or after Word joiner and related characters. | |
1964 | # × WJ | |
1965 | # WJ × | |
1966 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1967 | if ($lb_table[$i][$lb_enums{'Word_Joiner'}] |
6b659339 KW |
1968 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
1969 | { | |
289ce9cc | 1970 | $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'}; |
6b659339 | 1971 | } |
289ce9cc | 1972 | if ($lb_table[$lb_enums{'Word_Joiner'}][$i] |
6b659339 KW |
1973 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
1974 | { | |
289ce9cc | 1975 | $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1976 | } |
1977 | } | |
1978 | ||
1979 | # Special case this here to avoid having to do a special case in the code, | |
1980 | # by making this the same as other things with a SP in front of them that | |
1981 | # don't break, we avoid an extra test | |
289ce9cc | 1982 | $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}] |
6b659339 KW |
1983 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1984 | ||
1985 | # LB9 and LB10 are done in the same loop | |
1986 | # | |
1987 | # LB9 Do not break a combining character sequence; treat it as if it has | |
1988 | # the line breaking class of the base character in all of the | |
b0e24409 KW |
1989 | # higher-numbered rules. Treat ZWJ as if it were CM |
1990 | # Treat X (CM|ZWJ)* as if it were X. | |
6b659339 KW |
1991 | # where X is any line break class except BK, CR, LF, NL, SP, or ZW. |
1992 | ||
b0e24409 KW |
1993 | # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the |
1994 | # case where a CM or ZWJ is the first character on the line or follows SP, | |
1995 | # BK, CR, LF, NL, or ZW. | |
6b659339 KW |
1996 | for my $i (0 .. @lb_table - 1) { |
1997 | ||
b0e24409 KW |
1998 | # When the CM or ZWJ is the first in the pair, we don't know without |
1999 | # looking behind whether the CM or ZWJ is going to attach to an | |
2000 | # earlier character, or not. So have to figure this out at runtime in | |
2001 | # the code | |
2002 | $lb_table[$lb_enums{'Combining_Mark'}][$i] | |
2003 | = $lb_actions{'LB_CM_ZWJ_foo'}; | |
2004 | $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'}; | |
289ce9cc KW |
2005 | |
2006 | if ( $i == $lb_enums{'Mandatory_Break'} | |
2007 | || $i == $lb_enums{'EDGE'} | |
2008 | || $i == $lb_enums{'Carriage_Return'} | |
2009 | || $i == $lb_enums{'Line_Feed'} | |
2010 | || $i == $lb_enums{'Next_Line'} | |
2011 | || $i == $lb_enums{'Space'} | |
2012 | || $i == $lb_enums{'ZWSpace'}) | |
6b659339 KW |
2013 | { |
2014 | # For these classes, a following CM doesn't combine, and should do | |
289ce9cc KW |
2015 | # whatever 'Alphabetic' would do. |
2016 | $lb_table[$i][$lb_enums{'Combining_Mark'}] | |
2017 | = $lb_table[$i][$lb_enums{'Alphabetic'}]; | |
b0e24409 KW |
2018 | $lb_table[$i][$lb_enums{'ZWJ'}] |
2019 | = $lb_table[$i][$lb_enums{'Alphabetic'}]; | |
6b659339 KW |
2020 | } |
2021 | else { | |
b0e24409 KW |
2022 | # For these classes, the CM or ZWJ combines, so doesn't break, |
2023 | # inheriting the type of nobreak from the master character. | |
289ce9cc | 2024 | if ($lb_table[$i][$lb_enums{'Combining_Mark'}] |
6b659339 KW |
2025 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
2026 | { | |
289ce9cc KW |
2027 | $lb_table[$i][$lb_enums{'Combining_Mark'}] |
2028 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 2029 | } |
b0e24409 KW |
2030 | if ($lb_table[$i][$lb_enums{'ZWJ'}] |
2031 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) | |
2032 | { | |
2033 | $lb_table[$i][$lb_enums{'ZWJ'}] | |
2034 | = $lb_actions{'LB_NOBREAK'}; | |
2035 | } | |
6b659339 KW |
2036 | } |
2037 | } | |
2038 | ||
8a6698d7 UC |
2039 | # LB8a Do not break after a zero width joiner |
2040 | # ZWJ × | |
2041 | for my $i (0 .. @lb_table - 1) { | |
2042 | $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
2043 | } | |
b0e24409 | 2044 | |
6b659339 KW |
2045 | # LB8 Break before any character following a zero-width space, even if one |
2046 | # or more spaces intervene. | |
2047 | # ZW SP* ÷ | |
2048 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 2049 | $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'}; |
6b659339 KW |
2050 | } |
2051 | ||
2052 | # Because of LB8-10, we need to look at context for "SP x", and this must | |
2053 | # be done in the code. So override the existing rules for that, by adding | |
2054 | # a constant to get new rules that tell the code it needs to look at | |
2055 | # context. By adding this action instead of replacing the existing one, | |
2056 | # we can get back to the original rule if necessary. | |
2057 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 2058 | $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'}; |
6b659339 KW |
2059 | } |
2060 | ||
2061 | # LB7 Do not break before spaces or zero width space. | |
2062 | # × SP | |
2063 | # × ZW | |
2064 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
2065 | $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'}; |
2066 | $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
2067 | } |
2068 | ||
2069 | # LB6 Do not break before hard line breaks. | |
2070 | # × ( BK | CR | LF | NL ) | |
2071 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
2072 | $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'}; |
2073 | $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'}; | |
2074 | $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'}; | |
2075 | $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
2076 | } |
2077 | ||
2078 | # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. | |
2079 | # CR × LF | |
2080 | # CR ! | |
2081 | # LF ! | |
2082 | # NL ! | |
2083 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
2084 | $lb_table[$lb_enums{'Carriage_Return'}][$i] |
2085 | = $lb_actions{'LB_BREAKABLE'}; | |
2086 | $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'}; | |
2087 | $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 | 2088 | } |
289ce9cc KW |
2089 | $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}] |
2090 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
2091 | |
2092 | # LB4 Always break after hard line breaks. | |
2093 | # BK ! | |
2094 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
2095 | $lb_table[$lb_enums{'Mandatory_Break'}][$i] |
2096 | = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 KW |
2097 | } |
2098 | ||
6b659339 KW |
2099 | # LB3 Always break at the end of text. |
2100 | # ! eot | |
b0e24409 KW |
2101 | # LB2 Never break at the start of text. |
2102 | # sot × | |
6b659339 | 2103 | for my $i (0 .. @lb_table - 1) { |
289ce9cc KW |
2104 | $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'}; |
2105 | $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
2106 | } |
2107 | ||
2108 | # LB1 Assign a line breaking class to each code point of the input. | |
2109 | # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes | |
2110 | # depending on criteria outside the scope of this algorithm. | |
2111 | # | |
2112 | # In the absence of such criteria all characters with a specific | |
2113 | # combination of original class and General_Category property value are | |
2114 | # resolved as follows: | |
2115 | # Original Resolved General_Category | |
2116 | # AI, SG, XX AL Any | |
2117 | # SA CM Only Mn or Mc | |
2118 | # SA AL Any except Mn and Mc | |
2119 | # CJ NS Any | |
2120 | # | |
2121 | # This is done in mktables, so we never see any of the remapped-from | |
2122 | # classes. | |
2123 | ||
289ce9cc KW |
2124 | output_table_common('LB', \%lb_actions, |
2125 | \@lb_table, \@lb_short_enums, \%lb_abbreviations); | |
6b659339 KW |
2126 | } |
2127 | ||
7e54b87f KW |
2128 | sub output_WB_table() { |
2129 | ||
2130 | # Create and output the enums, #defines, and pair table for use in | |
2131 | # determining Word Breaks, given in http://www.unicode.org/reports/tr29/. | |
2132 | ||
2133 | # This uses the same mechanism in the other bounds tables generated by | |
2134 | # this file. The actions that could override a 0 or 1 are added to those | |
2135 | # numbers; the actions that clearly don't depend on the underlying rule | |
2136 | # simply overwrite | |
2137 | my %wb_actions = ( | |
2138 | WB_NOBREAK => 0, | |
2139 | WB_BREAKABLE => 1, | |
2140 | WB_hs_then_hs => 2, | |
b0e24409 | 2141 | WB_Ex_or_FO_or_ZWJ_then_foo => 3, |
7e54b87f KW |
2142 | WB_DQ_then_HL => 4, |
2143 | WB_HL_then_DQ => 6, | |
2144 | WB_LE_or_HL_then_MB_or_ML_or_SQ => 8, | |
2145 | WB_MB_or_ML_or_SQ_then_LE_or_HL => 10, | |
2146 | WB_MB_or_MN_or_SQ_then_NU => 12, | |
2147 | WB_NU_then_MB_or_MN_or_SQ => 14, | |
b0e24409 | 2148 | WB_RI_then_RI => 16, |
7e54b87f KW |
2149 | ); |
2150 | ||
7e54b87f KW |
2151 | # Construct the WB pair table. |
2152 | # The table is constructed in reverse order of the rules, to make the | |
2153 | # lower-numbered, higher priority ones override the later ones, as the | |
2154 | # algorithm stops at the earliest matching rule | |
2155 | ||
2156 | my @wb_table; | |
2027d365 | 2157 | my $table_size = @wb_short_enums; |
7e54b87f KW |
2158 | |
2159 | # Otherwise, break everywhere (including around ideographs). | |
b0e24409 | 2160 | # WB99 Any ÷ Any |
7e54b87f KW |
2161 | for my $i (0 .. $table_size - 1) { |
2162 | for my $j (0 .. $table_size - 1) { | |
2163 | $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'}; | |
2164 | } | |
2165 | } | |
2166 | ||
b0e24409 KW |
2167 | # Do not break within emoji flag sequences. That is, do not break between |
2168 | # regional indicator (RI) symbols if there is an odd number of RI | |
2169 | # characters before the break point. | |
2170 | # WB16 [^RI] (RI RI)* RI × RI | |
c492f156 | 2171 | # WB15 sot (RI RI)* RI × RI |
289ce9cc | 2172 | $wb_table[$wb_enums{'Regional_Indicator'}] |
b0e24409 KW |
2173 | [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'}; |
2174 | ||
2175 | # Do not break within emoji modifier sequences. | |
2176 | # WB14 ( E_Base | EBG ) × E_Modifier | |
2177 | $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}] | |
2178 | = $wb_actions{'WB_NOBREAK'}; | |
2179 | $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}] | |
2180 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2181 | |
2182 | # Do not break from extenders. | |
2183 | # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) | |
289ce9cc KW |
2184 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}] |
2185 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2186 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2187 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2188 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}] |
2189 | = $wb_actions{'WB_NOBREAK'}; | |
2190 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}] | |
2191 | = $wb_actions{'WB_NOBREAK'}; | |
2192 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}] | |
2193 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2194 | |
2195 | # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) | |
d21ae9f6 | 2196 | # × ExtendNumLet |
289ce9cc KW |
2197 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}] |
2198 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2199 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'ExtendNumLet'}] |
c0734505 | 2200 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2201 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}] |
2202 | = $wb_actions{'WB_NOBREAK'}; | |
2203 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}] | |
2204 | = $wb_actions{'WB_NOBREAK'}; | |
2205 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}] | |
2206 | = $wb_actions{'WB_NOBREAK'}; | |
2207 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}] | |
2208 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2209 | |
2210 | # Do not break between Katakana. | |
2211 | # WB13 Katakana × Katakana | |
289ce9cc KW |
2212 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] |
2213 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2214 | |
2215 | # Do not break within sequences, such as “3.2” or “3,456.789”. | |
2216 | # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric | |
289ce9cc | 2217 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 2218 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 2219 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}] |
7e54b87f | 2220 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 2221 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
2222 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
2223 | ||
2224 | # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric | |
289ce9cc | 2225 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}] |
7e54b87f | 2226 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 2227 | $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}] |
7e54b87f | 2228 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 2229 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}] |
7e54b87f KW |
2230 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
2231 | ||
2232 | # Do not break within sequences of digits, or digits adjacent to letters | |
2233 | # (“3a”, or “A3”). | |
2234 | # WB10 Numeric × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
2235 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}] |
2236 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2237 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2238 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2239 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}] |
2240 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2241 | |
2242 | # WB9 (ALetter | Hebrew_Letter) × Numeric | |
289ce9cc KW |
2243 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}] |
2244 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2245 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'Numeric'}] |
c0734505 | 2246 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2247 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}] |
2248 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2249 | |
2250 | # WB8 Numeric × Numeric | |
289ce9cc KW |
2251 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] |
2252 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2253 | |
2254 | # Do not break letters across certain punctuation. | |
2255 | # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | |
289ce9cc KW |
2256 | $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}] |
2257 | += $wb_actions{'WB_DQ_then_HL'}; | |
7e54b87f KW |
2258 | |
2259 | # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | |
289ce9cc KW |
2260 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}] |
2261 | += $wb_actions{'WB_HL_then_DQ'}; | |
7e54b87f KW |
2262 | |
2263 | # WB7a Hebrew_Letter × Single_Quote | |
289ce9cc KW |
2264 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
2265 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2266 | |
2267 | # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) | |
2268 | # × (ALetter | Hebrew_Letter) | |
289ce9cc | 2269 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}] |
7e54b87f | 2270 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
a9256a75 | 2271 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2272 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 2273 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 2274 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 2275 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}] |
7e54b87f | 2276 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
a9256a75 | 2277 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2278 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 2279 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 2280 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 2281 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}] |
7e54b87f | 2282 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
a9256a75 | 2283 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2284 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 2285 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f KW |
2286 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
2287 | ||
2288 | # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | |
2289 | # | Single_Quote) (ALetter | Hebrew_Letter) | |
289ce9cc | 2290 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 2291 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
a9256a75 | 2292 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'MidNumLet'}] |
c0734505 | 2293 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 2294 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 2295 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 2296 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 2297 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
a9256a75 | 2298 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'MidLetter'}] |
c0734505 | 2299 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 2300 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 2301 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 2302 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}] |
7e54b87f | 2303 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
a9256a75 | 2304 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'Single_Quote'}] |
c0734505 | 2305 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 2306 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
2307 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
2308 | ||
2309 | # Do not break between most letters. | |
2310 | # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
2311 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}] |
2312 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2313 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'ALetter'}] |
c0734505 | 2314 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2315 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}] |
2316 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2317 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'Hebrew_Letter'}] |
c0734505 | 2318 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2319 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}] |
2320 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2321 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2322 | = $wb_actions{'WB_NOBREAK'}; |
289ce9cc KW |
2323 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}] |
2324 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2325 | $wb_table[$wb_enums{'ExtPict_LE'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2326 | = $wb_actions{'WB_NOBREAK'}; |
7e54b87f | 2327 | |
b0e24409 KW |
2328 | # Ignore Format and Extend characters, except after sot, CR, LF, and |
2329 | # Newline. This also has the effect of: Any × (Format | Extend | ZWJ) | |
2330 | # WB4 X (Extend | Format | ZWJ)* → X | |
7e54b87f | 2331 | for my $i (0 .. @wb_table - 1) { |
289ce9cc | 2332 | $wb_table[$wb_enums{'Extend'}][$i] |
b0e24409 | 2333 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; |
289ce9cc | 2334 | $wb_table[$wb_enums{'Format'}][$i] |
b0e24409 KW |
2335 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; |
2336 | $wb_table[$wb_enums{'ZWJ'}][$i] | |
2337 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; | |
2338 | } | |
2339 | for my $i (0 .. @wb_table - 1) { | |
2340 | $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; | |
2341 | $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; | |
2342 | $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2343 | } |
2344 | ||
2345 | # Implied is that these attach to the character before them, except for | |
2346 | # the characters that mark the end of a region of text. The rules below | |
2347 | # override the ones set up here, for all the characters that need | |
2348 | # overriding. | |
2349 | for my $i (0 .. @wb_table - 1) { | |
289ce9cc KW |
2350 | $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; |
2351 | $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
2352 | } |
2353 | ||
c0734505 KW |
2354 | # Keep horizontal whitespace together |
2355 | # Use perl's tailoring instead | |
2356 | # WB3d WSegSpace × WSegSpace | |
2357 | #$wb_table[$wb_enums{'WSegSpace'}][$wb_enums{'WSegSpace'}] | |
2358 | # = $wb_actions{'WB_NOBREAK'}; | |
2359 | ||
b0e24409 KW |
2360 | # Do not break within emoji zwj sequences. |
2361 | # WB3c ZWJ × ( Glue_After_Zwj | EBG ) | |
2362 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}] | |
2363 | = $wb_actions{'WB_NOBREAK'}; | |
2364 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}] | |
2365 | = $wb_actions{'WB_NOBREAK'}; | |
a9256a75 | 2366 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'ExtPict_XX'}] |
c0734505 | 2367 | = $wb_actions{'WB_NOBREAK'}; |
a9256a75 | 2368 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'ExtPict_LE'}] |
c0734505 | 2369 | = $wb_actions{'WB_NOBREAK'}; |
b0e24409 | 2370 | |
d21ae9f6 | 2371 | # Break before and after newlines |
7e54b87f KW |
2372 | # WB3b ÷ (Newline | CR | LF) |
2373 | # WB3a (Newline | CR | LF) ÷ | |
2374 | # et. al. | |
289ce9cc | 2375 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
7e54b87f KW |
2376 | for my $j (0 .. @wb_table - 1) { |
2377 | $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'}; | |
2378 | $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'}; | |
2379 | } | |
2380 | } | |
2381 | ||
2382 | # But do not break within white space. | |
2383 | # WB3 CR × LF | |
2384 | # et.al. | |
289ce9cc KW |
2385 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
2386 | for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { | |
7e54b87f KW |
2387 | $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'}; |
2388 | } | |
2389 | } | |
2390 | ||
b0e24409 | 2391 | # And do not break horizontal space followed by Extend or Format or ZWJ |
289ce9cc KW |
2392 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}] |
2393 | = $wb_actions{'WB_NOBREAK'}; | |
2394 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}] | |
2395 | = $wb_actions{'WB_NOBREAK'}; | |
b0e24409 KW |
2396 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}] |
2397 | = $wb_actions{'WB_NOBREAK'}; | |
289ce9cc KW |
2398 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}] |
2399 | [$wb_enums{'Perl_Tailored_HSpace'}] | |
2400 | = $wb_actions{'WB_hs_then_hs'}; | |
7e54b87f | 2401 | |
b0e24409 KW |
2402 | # Break at the start and end of text, unless the text is empty |
2403 | # WB2 Any ÷ eot | |
2404 | # WB1 sot ÷ Any | |
7e54b87f | 2405 | for my $i (0 .. @wb_table - 1) { |
289ce9cc KW |
2406 | $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'}; |
2407 | $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'}; | |
7e54b87f | 2408 | } |
289ce9cc | 2409 | $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0; |
7e54b87f | 2410 | |
289ce9cc KW |
2411 | output_table_common('WB', \%wb_actions, |
2412 | \@wb_table, \@wb_short_enums, \%wb_abbreviations); | |
7e54b87f KW |
2413 | } |
2414 | ||
4eea95a6 KW |
2415 | sub sanitize_name ($) { |
2416 | # Change the non-word characters in the input string to standardized word | |
2417 | # equivalents | |
2418 | # | |
2419 | my $sanitized = shift; | |
2420 | $sanitized =~ s/=/__/; | |
2421 | $sanitized =~ s/&/_AMP_/; | |
2422 | $sanitized =~ s/\./_DOT_/; | |
2423 | $sanitized =~ s/-/_MINUS_/; | |
2424 | $sanitized =~ s!/!_SLASH_!; | |
2425 | ||
2426 | return $sanitized; | |
2427 | } | |
2428 | ||
d6945b2c YO |
2429 | sub token_name |
2430 | { | |
2431 | my $name = sanitize_name(shift); | |
2432 | warn "$name contains non-word" if $name =~ /\W/; | |
2433 | ||
2434 | return "$table_name_prefix\U$name" | |
2435 | } | |
2436 | ||
cef72199 | 2437 | switch_pound_if ('ALL', 'PERL_IN_REGCOMP_C'); |
4eea95a6 | 2438 | |
9d9177be KW |
2439 | output_invlist("Latin1", [ 0, 256 ]); |
2440 | output_invlist("AboveLatin1", [ 256 ]); | |
2441 | ||
c11f14f3 KW |
2442 | if ($num_anyof_code_points == 256) { # Same as Latin1 |
2443 | print $out_fh | |
2444 | "\nstatic const UV * const InBitmap_invlist = Latin1_invlist;\n"; | |
2445 | } | |
2446 | else { | |
2447 | output_invlist("InBitmap", [ 0, $num_anyof_code_points ]); | |
2448 | } | |
2449 | ||
bffc0129 | 2450 | end_file_pound_if; |
43b443dd | 2451 | |
3f427fd9 KW |
2452 | # We construct lists for all the POSIX and backslash sequence character |
2453 | # classes in two forms: | |
2454 | # 1) ones which match only in the ASCII range | |
2455 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
2456 | # | |
2457 | # These get compiled in, and hence affect the memory footprint of every Perl | |
2458 | # program, even those not using Unicode. To minimize the size, currently | |
2459 | # the Latin1 version is generated for the beyond ASCII range except for those | |
2460 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
2461 | # UVs long plus 4 UVs (currently) for the header. | |
2462 | # | |
2463 | # To save even more memory, the ASCII versions could be derived from the | |
2464 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
2465 | # instructions to do so), but these are all small anyway, so their total is | |
2466 | # about 100 UVs. | |
2467 | # | |
2468 | # In the list of properties below that get generated, the L1 prefix is a fake | |
2469 | # property that means just the Latin1 range of the full property (whose name | |
2470 | # has an X prefix instead of L1). | |
a02047bf KW |
2471 | # |
2472 | # An initial & means to use the subroutine from this file instead of an | |
2473 | # official inversion list. | |
d1907b94 YO |
2474 | # |
2475 | print "Computing unicode properties\n" if DEBUG; | |
3f427fd9 | 2476 | |
53146480 KW |
2477 | # Below is the list of property names to generate. '&' means to use the |
2478 | # subroutine to generate the inversion list instead of the generic code | |
2479 | # below. Some properties have a comma-separated list after the name, | |
2480 | # These are extra enums to add to those found in the Unicode tables. | |
2481 | no warnings 'qw'; | |
2482 | # Ignore non-alpha in sort | |
4eea95a6 KW |
2483 | my @props; |
2484 | push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
4eea95a6 | 2485 | &UpperLatin1 |
a9256a75 | 2486 | _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,ExtPict_XX |
de524f25 | 2487 | _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,East_Asian_CP,East_Asian_OP,Unassigned_Extended_Pictographic_Ideographic |
2027d365 | 2488 | _Perl_SB,EDGE,SContinue,CR,Extend,LF |
a9256a75 | 2489 | _Perl_WB,Perl_Tailored_HSpace,EDGE,UNKNOWN,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,ExtPict_XX,ExtPict_LE |
4eea95a6 KW |
2490 | _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID |
2491 | Lowercase_Mapping | |
2492 | Titlecase_Mapping | |
2493 | Uppercase_Mapping | |
2494 | Simple_Case_Folding | |
2495 | Case_Folding | |
2496 | &_Perl_IVCF | |
a2aeff50 | 2497 | &_Perl_CCC_non0_non230 |
4eea95a6 KW |
2498 | ); |
2499 | # NOTE that the convention is that extra enum values come | |
2500 | # after the property name, separated by commas, with the enums | |
cf2cd619 KW |
2501 | # that aren't ever defined by Unicode (with some exceptions) |
2502 | # containing at least 4 all-uppercase characters. | |
2503 | ||
2504 | # Some of the enums are current official property values that | |
2505 | # are needed for the rules in constructing certain tables in | |
2506 | # this file, and perhaps in regexec.c as well. They are here | |
2507 | # so that things don't crash when compiled on earlier Unicode | |
2508 | # releases where they don't exist. Thus the rules that use | |
2509 | # them still get compiled, but no code point actually uses | |
2510 | # them, hence they won't get exercized on such Unicode | |
2511 | # versions, but the code will still compile and run, though | |
2512 | # may not give the precise results that those versions would | |
2513 | # expect, but reasonable results nonetheless. | |
2514 | # | |
2515 | # Other enums are due to the fact that Unicode has in more | |
2516 | # recent versions added criteria to the rules in these extra | |
2517 | # tables that are based on factors outside the property | |
2518 | # values. And those have to be accounted for, essentially by | |
2519 | # here splitting certain enum equivalence classes based on | |
2520 | # those extra rules. | |
2521 | # | |
2522 | # EDGE is supposed to be a boundary between some types of | |
2523 | # enums, but khw thinks that isn't valid any more. | |
4eea95a6 KW |
2524 | |
2525 | my @bin_props; | |
1aefa327 | 2526 | my @perl_prop_synonyms; |
4eea95a6 | 2527 | my %enums; |
2d74dcf2 KW |
2528 | my @deprecated_messages = ""; # Element [0] is a placeholder |
2529 | my %deprecated_tags; | |
4eea95a6 | 2530 | |
27097618 KW |
2531 | my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x; |
2532 | ||
2533 | # Create another hash that maps floating point x.yyEzz representation to what | |
2534 | # %stricter_to_file_of does for the equivalent rational. A typical entry in | |
2535 | # the latter hash is | |
2536 | # | |
2537 | # 'nv=1/2' => 'Nv/1_2', | |
2538 | # | |
2539 | # From that, this loop creates an entry | |
2540 | # | |
2541 | # 'nv=5.00e-01' => 'Nv/1_2', | |
2542 | # | |
2543 | # %stricter_to_file_of contains far more than just the rationals. Instead we | |
048bdb72 | 2544 | # use %Unicode::UCD::nv_floating_to_rational which should have an entry for each |
27097618 KW |
2545 | # nv in the former hash. |
2546 | my %floating_to_file_of; | |
048bdb72 KW |
2547 | foreach my $key (keys %Unicode::UCD::nv_floating_to_rational) { |
2548 | my $value = $Unicode::UCD::nv_floating_to_rational{$key}; | |
2549 | $floating_to_file_of{$key} = $Unicode::UCD::stricter_to_file_of{"nv=$value"}; | |
27097618 KW |
2550 | } |
2551 | ||
2cd613ec KW |
2552 | # Properties that are specified with a prop=value syntax |
2553 | my @equals_properties; | |
2554 | ||
4eea95a6 KW |
2555 | # Collect all the binary properties from data in lib/unicore |
2556 | # Sort so that complements come after the main table, and the shortest | |
8091afe3 | 2557 | # names first, finally alphabetically. Also, sort together the tables we want |
f81c4731 KW |
2558 | # to be kept together, and prefer those with 'posix' in their names, which is |
2559 | # what the C code is expecting their names to be. | |
4eea95a6 | 2560 | foreach my $property (sort |
2d74dcf2 | 2561 | { exists $keep_together{lc $b} <=> exists $keep_together{lc $a} |
f81c4731 KW |
2562 | or $b =~ /posix/i <=> $a =~ /posix/i |
2563 | or $b =~ /perl/i <=> $a =~ /perl/i | |
27097618 | 2564 | or $a =~ $float_e_format <=> $b =~ $float_e_format |
2d74dcf2 | 2565 | or $a =~ /!/ <=> $b =~ /!/ |
4eea95a6 KW |
2566 | or length $a <=> length $b |
2567 | or $a cmp $b | |
048bdb72 KW |
2568 | } keys %Unicode::UCD::loose_to_file_of, |
2569 | keys %Unicode::UCD::stricter_to_file_of, | |
27097618 | 2570 | keys %floating_to_file_of |
53146480 | 2571 | ) { |
0f5e3c71 | 2572 | |
4eea95a6 KW |
2573 | # These two hashes map properties to values that can be considered to |
2574 | # be checksums. If two properties have the same checksum, they have | |
2575 | # identical entries. Otherwise they differ in some way. | |
048bdb72 KW |
2576 | my $tag = $Unicode::UCD::loose_to_file_of{$property}; |
2577 | $tag = $Unicode::UCD::stricter_to_file_of{$property} unless defined $tag; | |
27097618 | 2578 | $tag = $floating_to_file_of{$property} unless defined $tag; |
4eea95a6 KW |
2579 | |
2580 | # The tag may contain an '!' meaning it is identical to the one formed | |
394d2d3f KW |
2581 | # by removing the !, except that it is inverted. |
2582 | my $inverted = $tag =~ s/!//; | |
4eea95a6 | 2583 | |
27097618 KW |
2584 | # This hash is lacking the property name |
2585 | $property = "nv=$property" if $property =~ $float_e_format; | |
2586 | ||
4eea95a6 KW |
2587 | # The list of 'prop=value' entries that this single entry expands to |
2588 | my @this_entries; | |
2589 | ||
2590 | # Split 'property=value' on the equals sign, with $lhs being the whole | |
2591 | # thing if there is no '=' | |
2592 | my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x; | |
2593 | ||
62e88327 | 2594 | # $lhs then becomes the property name. |
2cd613ec KW |
2595 | my $prop_value = $rhs =~ s/ ^ = //rx; |
2596 | ||
2597 | push @equals_properties, $lhs if $prop_value ne ""; | |
62e88327 KW |
2598 | |
2599 | # See if there are any synonyms for this property. | |
394d2d3f KW |
2600 | if (exists $prop_name_aliases{$lhs}) { |
2601 | ||
2602 | # If so, do the combinatorics so that a new entry is added for | |
2603 | # each legal property combined with the property value (which is | |
2604 | # $rhs) | |
2605 | foreach my $alias (@{$prop_name_aliases{$lhs}}) { | |
2606 | ||
2607 | # But, there are some ambiguities, like 'script' is a synonym | |
2608 | # for 'sc', and 'sc' can stand alone, meaning something | |
2609 | # entirely different than 'script'. 'script' cannot stand | |
2610 | # alone. Don't add if the potential new lhs is in the hash of | |
2611 | # stand-alone properties. | |
2612 | no warnings 'once'; | |
2613 | next if $rhs eq "" && grep { $alias eq $_ } | |
048bdb72 | 2614 | keys %Unicode::UCD::loose_property_to_file_of; |
394d2d3f KW |
2615 | |
2616 | my $new_entry = $alias . $rhs; | |
e498c235 | 2617 | push @this_entries, $new_entry; |
394d2d3f KW |
2618 | } |
2619 | } | |
2620 | ||
2621 | # Above, we added the synonyms for the base entry we're now | |
2622 | # processing. But we haven't dealt with it yet. If we already have a | |
2623 | # property with the identical characteristics, this becomes just a | |
2624 | # synonym for it. | |
62e88327 | 2625 | |
394d2d3f KW |
2626 | if (exists $enums{$tag}) { |
2627 | push @this_entries, $property; | |
2628 | } | |
2629 | else { # Otherwise, create a new entry. | |
2630 | ||
4eea95a6 KW |
2631 | # Add to the list of properties to generate inversion lists for. |
2632 | push @bin_props, uc $property; | |
2633 | ||
394d2d3f | 2634 | # Create a rule for the parser |
f4b10e8e KW |
2635 | if (! exists $keywords{$property}) { |
2636 | $keywords{$property} = token_name($property); | |
2637 | } | |
394d2d3f | 2638 | |
4eea95a6 KW |
2639 | # And create an enum for it. |
2640 | $enums{$tag} = $table_name_prefix . uc sanitize_name($property); | |
394d2d3f | 2641 | |
1aefa327 KW |
2642 | $perl_tags{$tag} = 1 if exists $keep_together{lc $property}; |
2643 | ||
394d2d3f KW |
2644 | # Some properties are deprecated. This hash tells us so, and the |
2645 | # warning message to raise if they are used. | |
048bdb72 | 2646 | if (exists $Unicode::UCD::why_deprecated{$tag}) { |
394d2d3f | 2647 | $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages; |
048bdb72 | 2648 | push @deprecated_messages, $Unicode::UCD::why_deprecated{$tag}; |
394d2d3f KW |
2649 | } |
2650 | ||
2651 | # Our sort above should have made sure that we see the | |
2652 | # non-inverted version first, but this makes sure. | |
2653 | warn "$property is inverted!!!" if $inverted; | |
2654 | } | |
2655 | ||
2656 | # Everything else is #defined to be the base enum, inversion is | |
2657 | # indicated by negating the value. | |
2658 | my $defined_to = ""; | |
2659 | $defined_to .= "-" if $inverted; | |
2660 | $defined_to .= $enums{$tag}; | |
2661 | ||
2662 | # Go through the entries that evaluate to this. | |
e498c235 | 2663 | @this_entries = uniques @this_entries; |
394d2d3f KW |
2664 | foreach my $define (@this_entries) { |
2665 | ||
2666 | # There is a rule for the parser for each. | |
f4b10e8e | 2667 | $keywords{$define} = $defined_to; |
1aefa327 KW |
2668 | |
2669 | # And a #define for all simple names equivalent to a perl property, | |
2670 | # except those that begin with 'is' or 'in'; | |
2671 | if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) { | |
7e9b4fe4 | 2672 | push @perl_prop_synonyms, "#define " |
e5360b12 KW |
2673 | . $table_name_prefix |
2674 | . uc(sanitize_name($define)) | |
2675 | . " $defined_to"; | |
1aefa327 | 2676 | } |
4eea95a6 KW |
2677 | } |
2678 | } | |
2679 | ||
cf2cd619 | 2680 | @bin_props = sort { exists $keep_together{lc $b} <=> exists $keep_together{lc $a} |
2d74dcf2 | 2681 | or $a cmp $b |
4eea95a6 | 2682 | } @bin_props; |
1aefa327 | 2683 | @perl_prop_synonyms = sort(uniques(@perl_prop_synonyms)); |
4eea95a6 KW |
2684 | push @props, @bin_props; |
2685 | ||
2686 | foreach my $prop (@props) { | |
2687 | ||
2688 | # For the Latin1 properties, we change to use the eXtended version of the | |
2689 | # base property, then go through the result and get rid of everything not | |
2690 | # in Latin1 (above 255). Actually, we retain the element for the range | |
2691 | # that crosses the 255/256 boundary if it is one that matches the | |
2692 | # property. For example, in the Word property, there is a range of code | |
2693 | # points that start at U+00F8 and goes through U+02C1. Instead of | |
2694 | # artificially cutting that off at 256 because 256 is the first code point | |
2695 | # above Latin1, we let the range go to its natural ending. That gives us | |
2696 | # extra information with no added space taken. But if the range that | |
2697 | # crosses the boundary is one that doesn't match the property, we don't | |
2698 | # start a new range above 255, as that could be construed as going to | |
2699 | # infinity. For example, the Upper property doesn't include the character | |
2700 | # at 255, but does include the one at 256. We don't include the 256 one. | |
2701 | my $prop_name = $prop; | |
2702 | my $is_local_sub = $prop_name =~ s/^&//; | |
2703 | my $extra_enums = ""; | |
2704 | $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
2705 | my $lookup_prop = $prop_name; | |
2706 | $prop_name = sanitize_name($prop_name); | |
cf2cd619 KW |
2707 | $prop_name = $table_name_prefix . $prop_name |
2708 | if grep { lc $lookup_prop eq lc $_ } @bin_props; | |
4eea95a6 KW |
2709 | my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ |
2710 | or $lookup_prop =~ s/^L1//); | |
2711 | my $nonl1_only = 0; | |
2712 | $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
2713 | ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; | |
2714 | ||
4761f74a KW |
2715 | for my $charset (get_supported_code_pages()) { |
2716 | @a2n = @{get_a2n($charset)}; | |
2717 | ||
0f5e3c71 | 2718 | my @invlist; |
99f21fb9 | 2719 | my @invmap; |
5c0563e7 | 2720 | my $map_format = 0;; |
99f21fb9 | 2721 | my $map_default; |
5c0563e7 KW |
2722 | my $maps_to_code_point = 0; |
2723 | my $to_adjust = 0; | |
59fc10af | 2724 | my $same_in_all_code_pages; |
0f5e3c71 | 2725 | if ($is_local_sub) { |
8843f0de | 2726 | my @return = eval $lookup_prop; |
289ce9cc | 2727 | die $@ if $@; |
8843f0de KW |
2728 | my $invlist_ref = shift @return; |
2729 | @invlist = @$invlist_ref; | |
d2aadf62 KW |
2730 | if (@return) { # If has other values returned , must be an |
2731 | # inversion map | |
2732 | my $invmap_ref = shift @return; | |
2733 | @invmap = @$invmap_ref; | |
2734 | $map_format = shift @return; | |
2735 | $map_default = shift @return; | |
2736 | } | |
0f5e3c71 KW |
2737 | } |
2738 | else { | |
2739 | @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
99f21fb9 | 2740 | if (! @invlist) { |
99f21fb9 | 2741 | |
ad85f59a KW |
2742 | # If couldn't find a non-empty inversion list, see if it is |
2743 | # instead an inversion map | |
2744 | my ($list_ref, $map_ref, $format, $default) | |
99f21fb9 | 2745 | = prop_invmap($lookup_prop, '_perl_core_internal_ok'); |
ad85f59a KW |
2746 | if (! $list_ref) { |
2747 | # An empty return here could mean an unknown property, or | |
2748 | # merely that the original inversion list is empty. Call | |
2749 | # in scalar context to differentiate | |
2750 | my $count = prop_invlist($lookup_prop, | |
2751 | '_perl_core_internal_ok'); | |
d99e65da KW |
2752 | if (defined $count) { |
2753 | # Short-circuit an empty inversion list. | |
2754 | output_invlist($prop_name, \@invlist, $charset); | |
59fc10af | 2755 | last; |
d99e65da | 2756 | } |
ad85f59a | 2757 | die "Could not find inversion list for '$lookup_prop'" |
ad85f59a KW |
2758 | } |
2759 | else { | |
18b852b3 KW |
2760 | @invlist = @$list_ref; |
2761 | @invmap = @$map_ref; | |
2762 | $map_format = $format; | |
2763 | $map_default = $default; | |
ad85f59a | 2764 | } |
99f21fb9 | 2765 | } |
0f5e3c71 | 2766 | } |
ad85f59a | 2767 | |
5c0563e7 KW |
2768 | if ($map_format) { |
2769 | $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x; | |
2770 | $to_adjust = $map_format =~ /a/; | |
2771 | } | |
2772 | ||
99f21fb9 KW |
2773 | # Re-order the Unicode code points to native ones for this platform. |
2774 | # This is only needed for code points below 256, because native code | |
2775 | # points are only in that range. For inversion maps of properties | |
2776 | # where the mappings are adjusted (format =~ /a/), this reordering | |
2777 | # could mess up the adjustment pattern that was in the input, so that | |
2778 | # has to be dealt with. | |
2779 | # | |
2780 | # And inversion maps that map to code points need to eventually have | |
2781 | # all those code points remapped to native, and it's better to do that | |
2782 | # here, going through the whole list not just those below 256. This | |
2783 | # is because some inversion maps have adjustments (format =~ /a/) | |
2784 | # which may be affected by the reordering. This code needs to be done | |
2785 | # both for when we are translating the inversion lists for < 256, and | |
2786 | # for the inversion maps for everything. By doing both in this loop, | |
2787 | # we can share that code. | |
2788 | # | |
2789 | # So, we go through everything for an inversion map to code points; | |
2790 | # otherwise, we can skip any remapping at all if we are going to | |
2791 | # output only the above-Latin1 values, or if the range spans the whole | |
2792 | # of 0..256, as the remap will also include all of 0..256 (256 not | |
2793 | # 255 because a re-ordering could cause 256 to need to be in the same | |
2794 | # range as 255.) | |
2b3e8a91 | 2795 | if ( (@invmap && $maps_to_code_point) |
e4e80abb KW |
2796 | || ( @invlist |
2797 | && $invlist[0] < 256 | |
2b3e8a91 | 2798 | && ( $invlist[0] != 0 |
e4e80abb | 2799 | || (scalar @invlist != 1 && $invlist[1] < 256)))) |
ceb1de32 | 2800 | { |
59fc10af | 2801 | $same_in_all_code_pages = 0; |
99f21fb9 | 2802 | if (! @invmap) { # Straight inversion list |
563f8b93 KW |
2803 | # Look at all the ranges that start before 257. |
2804 | my @latin1_list; | |
2805 | while (@invlist) { | |
2806 | last if $invlist[0] > 256; | |
2807 | my $upper = @invlist > 1 | |
2808 | ? $invlist[1] - 1 # In range | |
2809 | ||
2810 | # To infinity. You may want to stop much much | |
2811 | # earlier; going this high may expose perl | |
2812 | # deficiencies with very large numbers. | |
7d2c6c24 | 2813 | : 256; |
563f8b93 KW |
2814 | for my $j ($invlist[0] .. $upper) { |
2815 | push @latin1_list, a2n($j); | |
2816 | } | |
fb4554ea | 2817 | |
563f8b93 KW |
2818 | shift @invlist; # Shift off the range that's in the list |
2819 | shift @invlist; # Shift off the range not in the list | |
2820 | } | |
fb4554ea | 2821 | |
563f8b93 KW |
2822 | # Here @invlist contains all the ranges in the original that |
2823 | # start at code points above 256, and @latin1_list contains | |
2824 | # all the native code points for ranges that start with a | |
2825 | # Unicode code point below 257. We sort the latter and | |
2826 | # convert it to inversion list format. Then simply prepend it | |
2827 | # to the list of the higher code points. | |
2828 | @latin1_list = sort { $a <=> $b } @latin1_list; | |
2829 | @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); | |
2830 | unshift @invlist, @latin1_list; | |
99f21fb9 KW |
2831 | } |
2832 | else { # Is an inversion map | |
2833 | ||
2834 | # This is a similar procedure as plain inversion list, but has | |
2835 | # multiple buckets. A plain inversion list just has two | |
2836 | # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
2837 | # pretty much can ignore the 2nd bucket, as it is completely | |
2838 | # defined by the 1st. But here, what we do is create buckets | |
2839 | # which contain the code points that map to each, translated | |
2840 | # to native and turned into an inversion list. Thus each | |
2841 | # bucket is an inversion list of native code points that map | |
2842 | # to it or don't map to it. We use these to create an | |
2843 | # inversion map for the whole property. | |
2844 | ||
2845 | # As mentioned earlier, we use this procedure to not just | |
2846 | # remap the inversion list to native values, but also the maps | |
2847 | # of code points to native ones. In the latter case we have | |
2848 | # to look at the whole of the inversion map (or at least to | |
2849 | # above Unicode; as the maps of code points above that should | |
2850 | # all be to the default). | |
c125794e KW |
2851 | my $upper_limit = (! $maps_to_code_point) |
2852 | ? 256 | |
2853 | : (Unicode::UCD::UnicodeVersion() eq '1.1.5') | |
2854 | ? 0xFFFF | |
2855 | : 0x10FFFF; | |
99f21fb9 KW |
2856 | |
2857 | my %mapped_lists; # A hash whose keys are the buckets. | |
2858 | while (@invlist) { | |
2859 | last if $invlist[0] > $upper_limit; | |
2860 | ||
2861 | # This shouldn't actually happen, as prop_invmap() returns | |
2862 | # an extra element at the end that is beyond $upper_limit | |
cf2cd619 KW |
2863 | die "inversion map (for $prop_name) that extends to" |
2864 | . " infinity is unimplemented" unless @invlist > 1; | |
99f21fb9 KW |
2865 | |
2866 | my $bucket; | |
2867 | ||
2868 | # A hash key can't be a ref (we are only expecting arrays | |
2869 | # of scalars here), so convert any such to a string that | |
2870 | # will be converted back later (using a vertical tab as | |
b148e8b1 | 2871 | # the separator). |
99f21fb9 | 2872 | if (ref $invmap[0]) { |
b148e8b1 | 2873 | $bucket = join "\cK", map { a2n($_) } @{$invmap[0]}; |
99f21fb9 | 2874 | } |
98a1b8f7 KW |
2875 | elsif ( $maps_to_code_point |
2876 | && $invmap[0] =~ $integer_or_float_re) | |
2877 | { | |
99f21fb9 KW |
2878 | |
2879 | # Do convert to native for maps to single code points. | |
2880 | # There are some properties that have a few outlier | |
2881 | # maps that aren't code points, so the above test | |
f4d6df29 KW |
2882 | # skips those. 0 is never remapped. |
2883 | $bucket = $invmap[0] == 0 ? 0 : a2n($invmap[0]); | |
99f21fb9 KW |
2884 | } else { |
2885 | $bucket = $invmap[0]; | |
2886 | } | |
2887 | ||
2888 | # We now have the bucket that all code points in the range | |
2889 | # map to, though possibly they need to be adjusted. Go | |
2890 | # through the range and put each translated code point in | |
2891 | # it into its bucket. | |
2892 | my $base_map = $invmap[0]; | |
2893 | for my $j ($invlist[0] .. $invlist[1] - 1) { | |
2894 | if ($to_adjust | |
2895 | # The 1st code point doesn't need adjusting | |
2896 | && $j > $invlist[0] | |
2897 | ||
2898 | # Skip any non-numeric maps: these are outliers | |
2899 | # that aren't code points. | |
98a1b8f7 | 2900 | && $base_map =~ $integer_or_float_re |
99f21fb9 KW |
2901 | |
2902 | # 'ne' because the default can be a string | |
2903 | && $base_map ne $map_default) | |
2904 | { | |
2905 | # We adjust, by incrementing each the bucket and | |
2906 | # the map. For code point maps, translate to | |
2907 | # native | |
2908 | $base_map++; | |
2909 | $bucket = ($maps_to_code_point) | |
2910 | ? a2n($base_map) | |
2911 | : $base_map; | |
2912 | } | |
2913 | ||
2914 | # Add the native code point to the bucket for the | |
2915 | # current map | |
2916 | push @{$mapped_lists{$bucket}}, a2n($j); | |
2917 | } # End of loop through all code points in the range | |
2918 | ||
2919 | # Get ready for the next range | |
2920 | shift @invlist; | |
2921 | shift @invmap; | |
2922 | } # End of loop through all ranges in the map. | |
2923 | ||
2924 | # Here, @invlist and @invmap retain all the ranges from the | |
2925 | # originals that start with code points above $upper_limit. | |
2926 | # Each bucket in %mapped_lists contains all the code points | |
2927 | # that map to that bucket. If the bucket is for a map to a | |
5174a821 KW |
2928 | # single code point, the bucket has been converted to native. |
2929 | # If something else (including multiple code points), no | |
2930 | # conversion is done. | |
99f21fb9 KW |
2931 | # |
2932 | # Now we recreate the inversion map into %xlated, but this | |
2933 | # time for the native character set. | |
2934 | my %xlated; | |
2935 | foreach my $bucket (keys %mapped_lists) { | |
2936 | ||
2937 | # Sort and convert this bucket to an inversion list. The | |
2938 | # result will be that ranges that start with even-numbered | |
2939 | # indexes will be for code points that map to this bucket; | |
2940 | # odd ones map to some other bucket, and are discarded | |
2941 | # below. | |
2942 | @{$mapped_lists{$bucket}} | |
2943 | = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
2944 | @{$mapped_lists{$bucket}} | |
cf2cd619 KW |
2945 | = mk_invlist_from_sorted_cp_list( |
2946 | \@{$mapped_lists{$bucket}}); | |
99f21fb9 KW |
2947 | |
2948 | # Add each even-numbered range in the bucket to %xlated; | |
2949 | # so that the keys of %xlated become the range start code | |
2950 | # points, and the values are their corresponding maps. | |
2951 | while (@{$mapped_lists{$bucket}}) { | |
2952 | my $range_start = $mapped_lists{$bucket}->[0]; | |
2953 | if ($bucket =~ /\cK/) { | |
2954 | @{$xlated{$range_start}} = split /\cK/, $bucket; | |
2955 | } | |
2956 | else { | |
e113b1b3 KW |
2957 | # If adjusting, and there is more than one thing |
2958 | # that maps to the same thing, they must be split | |
2959 | # so that later the adjusting doesn't think the | |
2960 | # subsequent items can go away because of the | |
2961 | # adjusting. | |
cf2cd619 KW |
2962 | my $range_end = ( $to_adjust |
2963 | && $bucket != $map_default) | |
2964 | ? $mapped_lists{$bucket}->[1] - 1 | |
2965 | : $range_start; | |
e113b1b3 KW |
2966 | for my $i ($range_start .. $range_end) { |
2967 | $xlated{$i} = $bucket; | |
2968 | } | |
99f21fb9 KW |
2969 | } |
2970 | shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
2971 | shift @{$mapped_lists{$bucket}}; # Get ready for next | |
2972 | # iteration | |
2973 | } | |
2974 | } # End of loop through all the buckets. | |
2975 | ||
2976 | # Here %xlated's keys are the range starts of all the code | |
2977 | # points in the inversion map. Construct an inversion list | |
2978 | # from them. | |
2979 | my @new_invlist = sort { $a <=> $b } keys %xlated; | |
2980 | ||
2981 | # If the list is adjusted, we want to munge this list so that | |
2982 | # we only have one entry for where consecutive code points map | |
2983 | # to consecutive values. We just skip the subsequent entries | |
2984 | # where this is the case. | |
2985 | if ($to_adjust) { | |
2986 | my @temp; | |
2987 | for my $i (0 .. @new_invlist - 1) { | |
2988 | next if $i > 0 | |
2989 | && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
98a1b8f7 KW |
2990 | && $xlated{$new_invlist[$i-1]} |
2991 | =~ $integer_or_float_re | |
2992 | && $xlated{$new_invlist[$i]} | |
2993 | =~ $integer_or_float_re | |
62e88327 KW |
2994 | && $xlated{$new_invlist[$i-1]} + 1 |
2995 | == $xlated{$new_invlist[$i]}; | |
99f21fb9 KW |
2996 | push @temp, $new_invlist[$i]; |
2997 | } | |
2998 | @new_invlist = @temp; | |
2999 | } | |
3000 | ||
3001 | # The inversion map comes from %xlated's values. We can | |
3002 | # unshift each onto the front of the untouched portion, in | |
3003 | # reverse order of the portion we did process. | |
3004 | foreach my $start (reverse @new_invlist) { | |
3005 | unshift @invmap, $xlated{$start}; | |
3006 | } | |
3007 | ||
cf2cd619 KW |
3008 | # Finally prepend the inversion list we have just constructed |
3009 | # to the one that contains anything we didn't process. | |
99f21fb9 KW |
3010 | unshift @invlist, @new_invlist; |
3011 | } | |
3012 | } | |
e4e80abb KW |
3013 | elsif (@invmap) { # inversion maps can't cope with this variable |
3014 | # being true, even if it could be true | |
3015 | $same_in_all_code_pages = 0; | |
3016 | } | |
59fc10af KW |
3017 | else { |
3018 | $same_in_all_code_pages = 1; | |
3019 | } | |
99f21fb9 KW |
3020 | |
3021 | # prop_invmap() returns an extra final entry, which we can now | |
3022 | # discard. | |
3023 | if (@invmap) { | |
3024 | pop @invlist; | |
3025 | pop @invmap; | |
ceb1de32 | 3026 | } |
0f5e3c71 KW |
3027 | |
3028 | if ($l1_only) { | |
99f21fb9 | 3029 | die "Unimplemented to do a Latin-1 only inversion map" if @invmap; |
0f5e3c71 KW |
3030 | for my $i (0 .. @invlist - 1 - 1) { |
3031 | if ($invlist[$i] > 255) { | |
3032 | ||
3033 | # In an inversion list, even-numbered elements give the code | |
3034 | # points that begin ranges that match the property; | |
3035 | # odd-numbered give ones that begin ranges that don't match. | |
3036 | # If $i is odd, we are at the first code point above 255 that | |
3037 | # doesn't match, which means the range it is ending does | |
cf2cd619 KW |
3038 | # match, and crosses the 255/256 boundary. We want to |
3039 | # include this ending point, so increment $i, so the | |
3040 | # splice below includes it. Conversely, if $i is even, it | |
3041 | # is the first code point above 255 that matches, which | |
3042 | # means there was no matching range that crossed the | |
3043 | # boundary, and we don't want to include this code point, | |
3044 | # so splice before it. | |
0f5e3c71 KW |
3045 | $i++ if $i % 2 != 0; |
3046 | ||
3047 | # Remove everything past this. | |
3048 | splice @invlist, $i; | |
99f21fb9 | 3049 | splice @invmap, $i if @invmap; |
0f5e3c71 KW |
3050 | last; |
3051 | } | |
0c4ecf42 KW |
3052 | } |
3053 | } | |
0f5e3c71 KW |
3054 | elsif ($nonl1_only) { |
3055 | my $found_nonl1 = 0; | |
3056 | for my $i (0 .. @invlist - 1 - 1) { | |
3057 | next if $invlist[$i] < 256; | |
3058 | ||
3059 | # Here, we have the first element in the array that indicates an | |
3060 | # element above Latin1. Get rid of all previous ones. | |
3061 | splice @invlist, 0, $i; | |
99f21fb9 | 3062 | splice @invmap, 0, $i if @invmap; |
0f5e3c71 KW |
3063 | |
3064 | # If this one's index is not divisible by 2, it means that this | |
3065 | # element is inverting away from being in the list, which means | |
99f21fb9 KW |
3066 | # all code points from 256 to this one are in this list (or |
3067 | # map to the default for inversion maps) | |
3068 | if ($i % 2 != 0) { | |
3069 | unshift @invlist, 256; | |
3070 | unshift @invmap, $map_default if @invmap; | |
3071 | } | |
0f5e3c71 | 3072 | $found_nonl1 = 1; |
3f427fd9 KW |
3073 | last; |
3074 | } | |
0f0b3751 KW |
3075 | if (! $found_nonl1) { |
3076 | warn "No non-Latin1 code points in $prop_name"; | |
3077 | output_invlist($prop_name, []); | |
3078 | last; | |
3079 | } | |
3f427fd9 | 3080 | } |
3f427fd9 | 3081 | |
cef72199 | 3082 | switch_pound_if ($prop_name, 'PERL_IN_REGCOMP_C'); |
59fc10af | 3083 | start_charset_pound_if($charset, 1) unless $same_in_all_code_pages; |
4761f74a | 3084 | |
59fc10af KW |
3085 | output_invlist($prop_name, \@invlist, ($same_in_all_code_pages) |
3086 | ? $applies_to_all_charsets_text | |
3087 | : $charset); | |
4761f74a KW |
3088 | |
3089 | if (@invmap) { | |
3090 | output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, | |
3091 | $map_default, $extra_enums, $charset); | |
3092 | } | |
59fc10af KW |
3093 | |
3094 | last if $same_in_all_code_pages; | |
4761f74a | 3095 | end_charset_pound_if; |
0f5e3c71 | 3096 | } |
9d9177be KW |
3097 | } |
3098 | ||
d1907b94 YO |
3099 | print "Finished computing unicode properties\n" if DEBUG; |
3100 | ||
4ef8bdf9 | 3101 | print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t"; |
394d2d3f KW |
3102 | print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages; |
3103 | print $out_fh "\n};\n"; | |
3104 | ||
7a15fa9e KW |
3105 | switch_pound_if ('binary_invlist_enum', 'PERL_IN_REGCOMP_C'); |
3106 | ||
394d2d3f KW |
3107 | my @enums = sort values %enums; |
3108 | ||
3109 | # Save a copy of these before modification | |
3110 | my @invlist_names = map { "${_}_invlist" } @enums; | |
3111 | ||
3112 | # Post-process the enums for deprecated properties. | |
3113 | if (scalar keys %deprecated_tags) { | |
3114 | my $seen_deprecated = 0; | |
3115 | foreach my $enum (@enums) { | |
3116 | if (grep { $_ eq $enum } keys %deprecated_tags) { | |
3117 | ||
3118 | # Change the enum name for this deprecated property to a | |
3119 | # munged one to act as a placeholder in the typedef. Then | |
3120 | # make the real name be a #define whose value is such that | |
3121 | # its modulus with the number of enums yields the index into | |
3122 | # the table occupied by the placeholder. And so that dividing | |
3123 | # the #define value by the table length gives an index into | |
3124 | # the table of deprecation messages for the corresponding | |
3125 | # warning. | |
3126 | my $revised_enum = "${enum}_perl_aux"; | |
3127 | if (! $seen_deprecated) { | |
3128 | $seen_deprecated = 1; | |
3129 | print $out_fh "\n"; | |
3130 | } | |
3131 | print $out_fh "#define $enum ($revised_enum + (MAX_UNI_KEYWORD_INDEX * $deprecated_tags{$enum}))\n"; | |
3132 | $enum = $revised_enum; | |
3133 | } | |
3134 | } | |
3135 | } | |
3136 | ||
cf2cd619 KW |
3137 | print $out_fh "\ntypedef enum {\n\tPERL_BIN_PLACEHOLDER = 0,", |
3138 | " /* So no real value is zero */\n\t"; | |
394d2d3f KW |
3139 | print $out_fh join ",\n\t", @enums; |
3140 | print $out_fh "\n"; | |
3141 | print $out_fh "} binary_invlist_enum;\n"; | |
3142 | print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n"; | |
394d2d3f | 3143 | |
7a15fa9e KW |
3144 | switch_pound_if ('binary_property_tables', 'PERL_IN_REGCOMP_C'); |
3145 | ||
cef72199 | 3146 | output_table_header($out_fh, "UV *", "uni_prop_ptrs"); |
a3d5e31b | 3147 | print $out_fh "\tNULL,\t/* Placeholder */\n"; |
cef72199 | 3148 | print $out_fh "\t"; |
394d2d3f KW |
3149 | print $out_fh join ",\n\t", @invlist_names; |
3150 | print $out_fh "\n"; | |
cef72199 KW |
3151 | |
3152 | output_table_trailer(); | |
3153 | ||
7a15fa9e KW |
3154 | switch_pound_if ('synonym defines', 'PERL_IN_REGCOMP_C'); |
3155 | ||
cef72199 KW |
3156 | print $out_fh join "\n", "\n", |
3157 | #'# ifdef DOINIT', | |
3158 | #"\n", | |
e5360b12 | 3159 | "/* Synonyms for perl properties */", |
cef72199 KW |
3160 | @perl_prop_synonyms, |
3161 | #"\n", | |
3162 | #"# endif /* DOINIT */", | |
3163 | "\n"; | |
394d2d3f | 3164 | |
2cd613ec KW |
3165 | switch_pound_if ('Valid property_values', 'PERL_IN_REGCOMP_C'); |
3166 | ||
3167 | # Each entry is a pointer to a table of property values for some property. | |
3168 | # (Other properties may share this table. The next two data structures allow | |
3169 | # this sharing to be implemented.) | |
3170 | my @values_tables = "NULL /* Placeholder so zero index is an error */"; | |
3171 | ||
3172 | # Keys are all the values of a property, strung together. The value of each | |
3173 | # key is its index in @values_tables. This is because many properties have | |
3174 | # the same values, and this allows the data to appear just once. | |
3175 | my %joined_values; | |
3176 | ||
3177 | # #defines for indices into @values_tables, so can have synonyms resolved by | |
3178 | # the C compiler. | |
3179 | my @values_indices; | |
3180 | ||
d1907b94 | 3181 | print "Computing short unicode properties\n" if DEBUG; |
2cd613ec KW |
3182 | # Go through each property which is specifiable by \p{prop=value}, and create |
3183 | # a hash with the keys being the canonicalized short property names, and the | |
3184 | # values for each property being all possible values that it can take on. | |
3185 | # Both the full value and its short, canonicalized into lc, sans punctuation | |
3186 | # version are included. | |
3187 | my %all_values; | |
3188 | for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } | |
3189 | uniques @equals_properties) | |
3190 | { | |
3191 | # Get and canonicalize the short name for this property. | |
3192 | my ($short_name) = prop_aliases($property); | |
3193 | $short_name = lc $short_name; | |
3194 | $short_name =~ s/[ _-]//g; | |
3195 | ||
3196 | # Now look at each value this property can take on | |
3197 | foreach my $value (prop_values($short_name)) { | |
3198 | ||
3199 | # And for each value, look at each synonym for it | |
3200 | foreach my $alias (prop_value_aliases($short_name, $value)) { | |
3201 | ||
3202 | # Add each synonym | |
3203 | push @{$all_values{$short_name}}, $alias; | |
3204 | ||
3205 | # As well as its canonicalized name. khw made the decision to not | |
3206 | # support the grandfathered L_ Gc property value | |
3207 | $alias = lc $alias; | |
3208 | $alias =~ s/[ _-]//g unless $alias =~ $numeric_re; | |
3209 | push @{$all_values{$short_name}}, $alias; | |
3210 | } | |
3211 | } | |
3212 | } | |
d1907b94 | 3213 | print "Finished computing short unicode properties\n" if DEBUG; |
2cd613ec KW |
3214 | |
3215 | # Also include the old style block names, using the recipe given in | |
3216 | # Unicode::UCD | |
3217 | foreach my $block (prop_values('block')) { | |
3218 | push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]); | |
3219 | } | |
3220 | ||
d1907b94 | 3221 | print "Creating property tables\n" if DEBUG; |
2cd613ec KW |
3222 | # Now create output tables for each property in @equals_properties (the keys |
3223 | # in %all_values) each containing that property's possible values as computed | |
3224 | # just above. | |
3225 | PROPERTY: | |
3226 | for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) | |
3227 | or $a cmp $b } keys %all_values) | |
3228 | { | |
3229 | @{$all_values{$property}} = uniques(@{$all_values{$property}}); | |
3230 | ||
3231 | # String together the values for this property, sorted. This string forms | |
3232 | # a list definition, with each value as an entry in it, indented on a new | |
3233 | # line. The sorting is used to find properties that take on the exact | |
3234 | # same values to share this string. | |
3235 | my $joined = "\t\""; | |
3236 | $joined .= join "\",\n\t\"", | |
3237 | sort { ($a =~ $numeric_re && $b =~ $numeric_re) | |
3238 | ? eval $a <=> eval $b | |
3239 | : prop_name_for_cmp($a) cmp prop_name_for_cmp($b) | |
3240 | or $a cmp $b | |
3241 | } @{$all_values{$property}}; | |
3242 | # And add a trailing marker | |
3243 | $joined .= "\",\n\tNULL\n"; | |
3244 | ||
3245 | my $table_name = $table_name_prefix . $property . "_values"; | |
3246 | my $index_name = "${table_name}_index"; | |
3247 | ||
3248 | # Add a rule for the parser that is just an empty value. It will need to | |
3249 | # know to look up empty things in the prop_value_ptrs[] table. | |
3250 | ||
3251 | $keywords{"$property="} = $index_name; | |
3252 | if (exists $prop_name_aliases{$property}) { | |
3253 | foreach my $alias (@{$prop_name_aliases{$property}}) { | |
3254 | $keywords{"$alias="} = $index_name; | |
3255 | } | |
3256 | } | |
3257 | ||
3258 | # Also create rules for the synonyms of this property to point to the same | |
3259 | # thing | |
3260 | ||
3261 | # If this property's values are the same as one we've already computed, | |
3262 | # use that instead of creating a duplicate. But we add a #define to point | |
3263 | # to the proper one. | |
3264 | if (exists $joined_values{$joined}) { | |
3265 | push @values_indices, "#define $index_name $joined_values{$joined}\n"; | |
3266 | next PROPERTY; | |
3267 | } | |
3268 | ||
3269 | # And this property, now known to have unique values from any other seen | |
3270 | # so far is about to be pushed onto @values_tables. Its index is the | |
3271 | # current count. | |
3272 | push @values_indices, "#define $index_name " | |
3273 | . scalar @values_tables . "\n"; | |
3274 | $joined_values{$joined} = $index_name; | |
3275 | push @values_tables, $table_name; | |
3276 | ||
3277 | # Create the table for this set of values. | |
3278 | output_table_header($out_fh, "char *", $table_name); | |
3279 | print $out_fh $joined; | |
3280 | output_table_trailer(); | |
3281 | } # End of loop through the properties, and their values | |
3282 | ||
3283 | # We have completely determined the table of the unique property values | |
3284 | output_table_header($out_fh, "char * const *", | |
3285 | "${table_name_prefix}prop_value_ptrs"); | |
3286 | print $out_fh join ",\n", @values_tables; | |
3287 | print $out_fh "\n"; | |
3288 | output_table_trailer(); | |
3289 | ||
3290 | # And the #defines for the indices in it | |
3291 | print $out_fh "\n\n", join "", @values_indices; | |
3292 | ||
973a28ed KW |
3293 | switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C'); |
3294 | ||
3295 | output_GCB_table(); | |
6b659339 | 3296 | output_LB_table(); |
7e54b87f | 3297 | output_WB_table(); |
6b659339 | 3298 | |
973a28ed KW |
3299 | end_file_pound_if; |
3300 | ||
d1907b94 YO |
3301 | print "Computing fold data\n" if DEBUG; |
3302 | ||
cb2d98ed KW |
3303 | print $out_fh <<"EOF"; |
3304 | ||
3305 | /* More than one code point may have the same code point as their fold. This | |
3306 | * gives the maximum number in the current Unicode release. (The folded-to | |
3307 | * code point is not included in this count.) For example, both 'S' and | |
3308 | * \\x{17F} fold to 's', so the number for that fold is 2. Another way to | |
3309 | * look at it is the maximum length of all the IVCF_AUX_TABLE's */ | |
3310 | #define MAX_FOLD_FROMS $max_fold_froms | |
3311 | EOF | |
3312 | ||
2308ab83 | 3313 | my $sources_list = "lib/unicore/mktables.lst"; |
74e28a4a TC |
3314 | my @sources = qw(regen/mk_invlists.pl |
3315 | lib/unicore/mktables | |
3316 | lib/Unicode/UCD.pm | |
3317 | regen/charset_translations.pl | |
f7b69ff8 | 3318 | regen/mk_PL_charclass.pl |
74e28a4a | 3319 | ); |
9a3da3ad FC |
3320 | { |
3321 | # Depend on mktables’ own sources. It’s a shorter list of files than | |
de164984 KW |
3322 | # those that Unicode::UCD uses. Some may not actually have an effect on |
3323 | # the output of this program, but easier to just include all of them, and | |
3324 | # no real harm in doing so, as it is rare for one such to change without | |
3325 | # the others doing so as well. | |
1ae6ead9 | 3326 | if (! open my $mktables_list, '<', $sources_list) { |
2308ab83 KW |
3327 | |
3328 | # This should force a rebuild once $sources_list exists | |
3329 | push @sources, $sources_list; | |
3330 | } | |
3331 | else { | |
3332 | while(<$mktables_list>) { | |
3333 | last if /===/; | |
3334 | chomp; | |
3335 | push @sources, "lib/unicore/$_" if /^[^#]/; | |
3336 | } | |
9a3da3ad FC |
3337 | } |
3338 | } | |
6b659339 KW |
3339 | |
3340 | read_only_bottom_close_and_rename($out_fh, \@sources); | |
394d2d3f | 3341 | |
21c34e97 KW |
3342 | my %name_to_index; |
3343 | for my $i (0 .. @enums - 1) { | |
3344 | my $loose_name = $enums[$i] =~ s/^$table_name_prefix//r; | |
3345 | $loose_name = lc $loose_name; | |
3346 | $loose_name =~ s/__/=/; | |
3347 | $loose_name =~ s/_dot_/./; | |
3348 | $loose_name =~ s/_slash_/\//g; | |
3349 | $name_to_index{$loose_name} = $i + 1; | |
3350 | } | |
3351 | # unsanitize, exclude &, maybe add these before sanitize | |
3352 | for my $i (0 .. @perl_prop_synonyms - 1) { | |
3353 | my $loose_name_pair = $perl_prop_synonyms[$i] =~ s/#\s*define\s*//r; | |
3354 | $loose_name_pair =~ s/\b$table_name_prefix//g; | |
3355 | $loose_name_pair = lc $loose_name_pair; | |
3356 | $loose_name_pair =~ s/__/=/g; | |
3357 | $loose_name_pair =~ s/_dot_/./g; | |
3358 | $loose_name_pair =~ s/_slash_/\//g; | |
3359 | my ($synonym, $primary) = split / +/, $loose_name_pair; | |
3360 | $name_to_index{$synonym} = $name_to_index{$primary}; | |
3361 | } | |
3362 | ||
3363 | my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>', | |
9824c081 | 3364 | {style => '*', by => 'regen/mk_invlists.pl', |
21c34e97 KW |
3365 | from => "Unicode::UCD"}); |
3366 | { | |
048bdb72 | 3367 | print $uni_pl "\%Unicode::UCD::uni_prop_ptrs_indices = (\n"; |
21c34e97 | 3368 | for my $name (sort keys %name_to_index) { |
21c34e97 KW |
3369 | print $uni_pl " '$name' => $name_to_index{$name},\n"; |
3370 | } | |
3371 | print $uni_pl ");\n\n1;\n"; | |
3372 | } | |
3373 | ||
3374 | read_only_bottom_close_and_rename($uni_pl, \@sources); | |
3375 | ||
d1907b94 YO |
3376 | print "Computing minimal perfect hash for unicode properties.\n" if DEBUG; |
3377 | ||
22c7467e YO |
3378 | if (my $file= $ENV{DUMP_KEYWORDS_FILE}) { |
3379 | require Data::Dumper; | |
3380 | ||
3381 | open my $ofh, ">", $file | |
3382 | or die "Failed to open DUMP_KEYWORDS_FILE '$file' for write: $!"; | |
3383 | print $ofh Data::Dumper->new([\%keywords],['*keywords']) | |
3384 | ->Sortkeys(1)->Useqq(1)->Dump(); | |
3385 | close $ofh; | |
3386 | print "Wrote keywords to '$file'.\n"; | |
3387 | } | |
394d2d3f | 3388 | |
afde5508 | 3389 | my $keywords_fh = open_new('uni_keywords.h', '>', |
9824c081 | 3390 | {style => '*', by => 'regen/mk_invlists.pl', |
afde5508 | 3391 | from => "mph.pl"}); |
394d2d3f | 3392 | |
3ef83dc3 KW |
3393 | print $keywords_fh "\n#if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD)\n\n"; |
3394 | ||
44a605b0 YO |
3395 | my $mph= MinimalPerfectHash->new( |
3396 | source_hash => \%keywords, | |
3397 | match_name => "match_uniprop", | |
eda35008 YO |
3398 | simple_split => $ENV{SIMPLE_SPLIT} // 0, |
3399 | randomize_squeeze => $ENV{RANDOMIZE_SQUEEZE} // 1, | |
3400 | max_same_in_squeeze => $ENV{MAX_SAME} // 5, | |
3401 | srand_seed => (lc($ENV{SRAND_SEED}//"") eq "auto") | |
3402 | ? undef | |
3403 | : $ENV{SRAND_SEED} // 1785235451, # I let perl pick a number | |
44a605b0 YO |
3404 | ); |
3405 | $mph->make_mph_with_split_keys(); | |
3406 | print $keywords_fh $mph->make_algo(); | |
3407 | ||
3ef83dc3 | 3408 | print $keywords_fh "\n#endif /* #if defined(PERL_CORE) || defined(PERL_EXT_RE_BUILD) */\n"; |
afde5508 KW |
3409 | |
3410 | push @sources, 'regen/mph.pl'; | |
394d2d3f | 3411 | read_only_bottom_close_and_rename($keywords_fh, \@sources); |