Commit | Line | Data |
---|---|---|
9d9177be KW |
1 | #!perl -w |
2 | use 5.015; | |
3 | use strict; | |
4 | use warnings; | |
99f21fb9 KW |
5 | use Unicode::UCD qw(prop_aliases |
6 | prop_values | |
7 | prop_value_aliases | |
8 | prop_invlist | |
9 | prop_invmap search_invlist | |
10 | ); | |
3d7c117d MB |
11 | require './regen/regen_lib.pl'; |
12 | require './regen/charset_translations.pl'; | |
9d9177be KW |
13 | |
14 | # This program outputs charclass_invlists.h, which contains various inversion | |
15 | # lists in the form of C arrays that are to be used as-is for inversion lists. | |
16 | # Thus, the lists it contains are essentially pre-compiled, and need only a | |
17 | # light-weight fast wrapper to make them usable at run-time. | |
18 | ||
19 | # As such, this code knows about the internal structure of these lists, and | |
20 | # any change made to that has to be done here as well. A random number stored | |
21 | # in the headers is used to minimize the possibility of things getting | |
22 | # out-of-sync, or the wrong data structure being passed. Currently that | |
23 | # random number is: | |
99f21fb9 KW |
24 | |
25 | # charclass_invlists.h now also has a partial implementation of inversion | |
26 | # maps; enough to generate tables for the line break properties, such as GCB | |
27 | ||
0a07b44b | 28 | my $VERSION_DATA_STRUCTURE_TYPE = 148565664; |
9d9177be | 29 | |
99f21fb9 KW |
30 | # integer or float |
31 | my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax; | |
32 | ||
33 | # Matches valid C language enum names: begins with ASCII alphabetic, then any | |
34 | # ASCII \w | |
35 | my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax; | |
36 | ||
9d9177be KW |
37 | my $out_fh = open_new('charclass_invlists.h', '>', |
38 | {style => '*', by => $0, | |
39 | from => "Unicode::UCD"}); | |
40 | ||
bffc0129 | 41 | my $in_file_pound_if = 0; |
43b443dd | 42 | |
289ce9cc KW |
43 | my $max_hdr_len = 3; # In headings, how wide a name is allowed? |
44 | ||
9d9177be KW |
45 | print $out_fh "/* See the generating file for comments */\n\n"; |
46 | ||
bffc0129 KW |
47 | # The symbols generated by this program are all currently defined only in a |
48 | # single dot c each. The code knows where most of them go, but this hash | |
49 | # gives overrides for the exceptions to the typical place | |
50 | my %exceptions_to_where_to_define = | |
51 | ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C', | |
52 | AboveLatin1 => 'PERL_IN_REGCOMP_C', | |
53 | Latin1 => 'PERL_IN_REGCOMP_C', | |
54 | UpperLatin1 => 'PERL_IN_REGCOMP_C', | |
55 | _Perl_Any_Folds => 'PERL_IN_REGCOMP_C', | |
56 | _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C', | |
57 | _Perl_IDCont => 'PERL_IN_UTF8_C', | |
58 | _Perl_IDStart => 'PERL_IN_UTF8_C', | |
59 | ); | |
015bb97c | 60 | |
973a28ed KW |
61 | my %gcb_enums; |
62 | my @gcb_short_enums; | |
289ce9cc | 63 | my %gcb_abbreviations; |
6b659339 KW |
64 | my %lb_enums; |
65 | my @lb_short_enums; | |
289ce9cc | 66 | my %lb_abbreviations; |
7e54b87f KW |
67 | my %wb_enums; |
68 | my @wb_short_enums; | |
289ce9cc | 69 | my %wb_abbreviations; |
6b659339 | 70 | |
99f21fb9 KW |
71 | my @a2n; |
72 | ||
73 | sub uniques { | |
74 | # Returns non-duplicated input values. From "Perl Best Practices: | |
75 | # Encapsulated Cleverness". p. 455 in first edition. | |
76 | ||
77 | my %seen; | |
78 | return grep { ! $seen{$_}++ } @_; | |
79 | } | |
80 | ||
81 | sub a2n($) { | |
82 | my $cp = shift; | |
83 | ||
84 | # Returns the input Unicode code point translated to native. | |
85 | ||
86 | return $cp if $cp !~ $numeric_re || $cp > 255; | |
87 | return $a2n[$cp]; | |
88 | } | |
89 | ||
bffc0129 KW |
90 | sub end_file_pound_if { |
91 | if ($in_file_pound_if) { | |
92 | print $out_fh "\n#endif\t/* $in_file_pound_if */\n"; | |
93 | $in_file_pound_if = 0; | |
94 | } | |
95 | } | |
96 | ||
97 | sub switch_pound_if ($$) { | |
98 | my $name = shift; | |
99 | my $new_pound_if = shift; | |
100 | ||
101 | # Switch to new #if given by the 2nd argument. If there is an override | |
102 | # for this, it instead switches to that. The 1st argument is the | |
103 | # static's name, used to look up the overrides | |
104 | ||
105 | if (exists $exceptions_to_where_to_define{$name}) { | |
106 | $new_pound_if = $exceptions_to_where_to_define{$name}; | |
107 | } | |
108 | ||
109 | # Exit current #if if the new one is different from the old | |
110 | if ($in_file_pound_if | |
111 | && $in_file_pound_if !~ /$new_pound_if/) | |
112 | { | |
113 | end_file_pound_if; | |
114 | } | |
115 | ||
116 | # Enter new #if, if not already in it. | |
117 | if (! $in_file_pound_if) { | |
118 | $in_file_pound_if = "defined($new_pound_if)"; | |
119 | print $out_fh "\n#if $in_file_pound_if\n"; | |
43b443dd KW |
120 | } |
121 | } | |
122 | ||
0c4ecf42 | 123 | sub output_invlist ($$;$) { |
9d9177be KW |
124 | my $name = shift; |
125 | my $invlist = shift; # Reference to inversion list array | |
0c4ecf42 | 126 | my $charset = shift // ""; # name of character set for comment |
9d9177be | 127 | |
76d3994c | 128 | die "No inversion list for $name" unless defined $invlist |
ad85f59a | 129 | && ref $invlist eq 'ARRAY'; |
76d3994c | 130 | |
9d9177be KW |
131 | # Output the inversion list $invlist using the name $name for it. |
132 | # It is output in the exact internal form for inversion lists. | |
133 | ||
a0316a6c KW |
134 | # Is the last element of the header 0, or 1 ? |
135 | my $zero_or_one = 0; | |
ad85f59a | 136 | if (@$invlist && $invlist->[0] != 0) { |
a0316a6c | 137 | unshift @$invlist, 0; |
9d9177be KW |
138 | $zero_or_one = 1; |
139 | } | |
0a07b44b | 140 | my $count = @$invlist; |
9d9177be | 141 | |
bffc0129 | 142 | switch_pound_if ($name, 'PERL_IN_PERL_C'); |
43b443dd | 143 | |
0c4ecf42 KW |
144 | print $out_fh "\nstatic const UV ${name}_invlist[] = {"; |
145 | print $out_fh " /* for $charset */" if $charset; | |
146 | print $out_fh "\n"; | |
9d9177be | 147 | |
a0316a6c | 148 | print $out_fh "\t$count,\t/* Number of elements */\n"; |
9d9177be KW |
149 | print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n"; |
150 | print $out_fh "\t", $zero_or_one, | |
a0316a6c KW |
151 | ",\t/* 0 if the list starts at 0;", |
152 | "\n\t\t 1 if it starts at the element beyond 0 */\n"; | |
9d9177be KW |
153 | |
154 | # The main body are the UVs passed in to this routine. Do the final | |
155 | # element separately | |
47d53124 KW |
156 | for my $i (0 .. @$invlist - 1) { |
157 | printf $out_fh "\t0x%X", $invlist->[$i]; | |
158 | print $out_fh "," if $i < @$invlist - 1; | |
159 | print $out_fh "\n"; | |
9d9177be KW |
160 | } |
161 | ||
9d9177be KW |
162 | print $out_fh "};\n"; |
163 | } | |
164 | ||
99f21fb9 KW |
165 | sub output_invmap ($$$$$$$) { |
166 | my $name = shift; | |
167 | my $invmap = shift; # Reference to inversion map array | |
168 | my $prop_name = shift; | |
169 | my $input_format = shift; # The inversion map's format | |
170 | my $default = shift; # The property value for code points who | |
171 | # otherwise don't have a value specified. | |
172 | my $extra_enums = shift; # comma-separated list of our additions to the | |
173 | # property's standard possible values | |
174 | my $charset = shift // ""; # name of character set for comment | |
175 | ||
176 | # Output the inversion map $invmap for property $prop_name, but use $name | |
177 | # as the actual data structure's name. | |
178 | ||
179 | my $count = @$invmap; | |
180 | ||
181 | my $output_format; | |
182 | my $declaration_type; | |
183 | my %enums; | |
184 | my $name_prefix; | |
185 | ||
34623dbb | 186 | if ($input_format =~ / ^ s l? $ /x) { |
02f811dd KW |
187 | $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name |
188 | my $short_name = (prop_aliases($prop_name))[0] // $prop_name; | |
226b74db | 189 | my @input_enums; |
f79a09fc | 190 | |
226b74db | 191 | # Find all the possible input values. These become the enum names |
34623dbb KW |
192 | # that comprise the inversion map. For inputs that don't have sub |
193 | # lists, we can just get the unique values. Otherwise, we have to | |
194 | # expand the sublists first. | |
195 | if ($input_format ne 'sl') { | |
226b74db | 196 | @input_enums = sort(uniques(@$invmap)); |
34623dbb KW |
197 | } |
198 | else { | |
199 | foreach my $element (@$invmap) { | |
200 | if (ref $element) { | |
201 | push @input_enums, @$element; | |
202 | } | |
203 | else { | |
204 | push @input_enums, $element; | |
205 | } | |
206 | } | |
207 | @input_enums = sort(uniques(@input_enums)); | |
208 | } | |
6b659339 | 209 | |
226b74db KW |
210 | # The internal enums come last, and in the order specified. |
211 | my @enums = @input_enums; | |
27a619f7 KW |
212 | my @extras; |
213 | if ($extra_enums ne "") { | |
214 | @extras = split /,/, $extra_enums; | |
226b74db KW |
215 | |
216 | # Don't add if already there. | |
217 | foreach my $this_extra (@extras) { | |
218 | next if grep { $_ eq $this_extra } @enums; | |
219 | ||
220 | push @enums, $this_extra; | |
221 | } | |
27a619f7 | 222 | } |
289ce9cc | 223 | |
226b74db KW |
224 | # Assign a value to each element of the enum type we are creating. |
225 | # The default value always gets 0; the others are arbitrarily | |
226 | # assigned. | |
27a619f7 KW |
227 | my $enum_val = 0; |
228 | my $canonical_default = prop_value_aliases($prop_name, $default); | |
229 | $default = $canonical_default if defined $canonical_default; | |
230 | $enums{$default} = $enum_val++; | |
226b74db | 231 | |
27a619f7 KW |
232 | for my $enum (@enums) { |
233 | $enums{$enum} = $enum_val++ unless exists $enums{$enum}; | |
234 | } | |
235 | ||
226b74db | 236 | # Calculate the data for the special tables output for these properties. |
27a619f7 KW |
237 | if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) { |
238 | ||
226b74db KW |
239 | # The data includes the hashes %gcb_enums, %lb_enums, etc. |
240 | # Similarly we calculate column headings for the tables. | |
241 | # | |
27a619f7 | 242 | # We use string evals to allow the same code to work on |
226b74db | 243 | # all the tables |
27a619f7 KW |
244 | my $type = lc $prop_name; |
245 | ||
27a619f7 KW |
246 | my $placeholder = "a"; |
247 | ||
248 | # Skip if we've already done this code, which populated | |
249 | # this hash | |
250 | if (eval "! \%${type}_enums") { | |
251 | ||
226b74db | 252 | # For each enum in the type ... |
27a619f7 KW |
253 | foreach my $enum (sort keys %enums) { |
254 | my $value = $enums{$enum}; | |
255 | my $short; | |
256 | my $abbreviated_from; | |
257 | ||
258 | # Special case this wb property value to make the | |
259 | # name more clear | |
260 | if ($enum eq 'Perl_Tailored_HSpace') { | |
261 | $short = 'hs'; | |
262 | $abbreviated_from = $enum; | |
263 | } | |
27a619f7 | 264 | else { |
226b74db KW |
265 | |
266 | # Use the official short name, if found. | |
27a619f7 KW |
267 | ($short) = prop_value_aliases($type, $enum); |
268 | ||
226b74db KW |
269 | if (! defined $short) { |
270 | ||
271 | # But if there is no official name, use the name | |
272 | # that came from the data (if any). Otherwise, | |
273 | # the name had to come from the extras list. | |
274 | # There are two types of values in that list. | |
275 | # | |
276 | # First are those enums that are not part of the | |
277 | # property, but are defined by this code. By | |
278 | # convention these have all-caps names of at least | |
279 | # 4 characters. We use the lowercased name for | |
280 | # thse. | |
281 | # | |
282 | # Second are enums that are needed to get | |
283 | # regexec.c to compile, but don't exist in all | |
284 | # Unicode releases. To get here, we must be | |
285 | # compiling an earlier Unicode release that | |
286 | # doesn't have that enum, so just use a unique | |
287 | # anonymous name for it. | |
288 | if (grep { $_ eq $enum } @input_enums) { | |
289 | $short = $enum | |
290 | } | |
291 | elsif ($enum !~ / ^ [A-Z]{4,} $ /x) { | |
292 | $short = $placeholder++; | |
293 | } | |
294 | else { | |
295 | $short = lc $enum; | |
296 | } | |
297 | } | |
27a619f7 KW |
298 | } |
299 | ||
300 | # If our short name is too long, or we already | |
301 | # know that the name is an abbreviation, truncate | |
302 | # to make sure it's short enough, and remember | |
226b74db KW |
303 | # that we did this so we can later add a comment in the |
304 | # generated file | |
27a619f7 KW |
305 | if ( $abbreviated_from |
306 | || length $short > $max_hdr_len) | |
307 | { | |
308 | $short = substr($short, 0, $max_hdr_len); | |
309 | $abbreviated_from = $enum | |
310 | unless $abbreviated_from; | |
311 | # If the name we are to display conflicts, try | |
312 | # another. | |
313 | while (eval "exists | |
314 | \$${type}_abbreviations{$short}") | |
315 | { | |
289ce9cc | 316 | die $@ if $@; |
256fceb3 KW |
317 | |
318 | # The increment operator on strings doesn't work | |
319 | # on those containing an '_', so just use the | |
320 | # final portion. | |
321 | my @short = split '_', $short; | |
322 | $short[-1]++; | |
323 | $short = join "_", @short; | |
289ce9cc | 324 | } |
19a5f1d5 | 325 | |
27a619f7 | 326 | eval "\$${type}_abbreviations{$short} = '$enum'"; |
19a5f1d5 | 327 | die $@ if $@; |
7e54b87f | 328 | } |
27a619f7 KW |
329 | |
330 | # Remember the mapping from the property value | |
331 | # (enum) name to its value. | |
332 | eval "\$${type}_enums{$enum} = $value"; | |
333 | die $@ if $@; | |
334 | ||
335 | # Remember the inverse mapping to the short name | |
336 | # so that we can properly label the generated | |
337 | # table's rows and columns | |
338 | eval "\$${type}_short_enums[$value] = '$short'"; | |
339 | die $@ if $@; | |
7e54b87f | 340 | } |
99f21fb9 | 341 | } |
19a5f1d5 | 342 | } |
99f21fb9 | 343 | |
19a5f1d5 KW |
344 | # Inversion map stuff is currently used only by regexec |
345 | switch_pound_if($name, 'PERL_IN_REGEXEC_C'); | |
346 | ||
347 | # The short names tend to be two lower case letters, but it looks | |
348 | # better for those if they are upper. XXX | |
349 | $short_name = uc($short_name) if length($short_name) < 3 | |
226b74db | 350 | || substr($short_name, 0, 1) =~ /[[:lower:]]/; |
19a5f1d5 KW |
351 | $name_prefix = "${short_name}_"; |
352 | my $enum_count = keys %enums; | |
353 | print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n"; | |
354 | ||
34623dbb KW |
355 | if ($input_format eq 'sl') { |
356 | print $out_fh | |
357 | "\n/* Negative enum values indicate the need to use an auxiliary" | |
358 | . " table\n * consisting of the list of enums this one expands to." | |
359 | . " The absolute\n * values of the negative enums are indices into" | |
360 | . " a table of the auxiliary\n * tables' addresses */"; | |
361 | } | |
362 | ||
226b74db | 363 | # Start the enum definition for this map |
19a5f1d5 KW |
364 | print $out_fh "\ntypedef enum {\n"; |
365 | my @enum_list; | |
366 | foreach my $enum (keys %enums) { | |
367 | $enum_list[$enums{$enum}] = $enum; | |
99f21fb9 | 368 | } |
19a5f1d5 | 369 | foreach my $i (0 .. @enum_list - 1) { |
34623dbb KW |
370 | print $out_fh ",\n" if $i > 0; |
371 | ||
19a5f1d5 KW |
372 | my $name = $enum_list[$i]; |
373 | print $out_fh "\t${name_prefix}$name = $i"; | |
19a5f1d5 | 374 | } |
34623dbb KW |
375 | |
376 | # For an 'sl' property, we need extra enums, because some of the | |
377 | # elements are lists. Each such distinct list is placed in its own | |
378 | # auxiliary map table. Here, we go through the inversion map, and for | |
379 | # each distinct list found, create an enum value for it, numbered -1, | |
380 | # -2, .... | |
381 | my %multiples; | |
382 | my $aux_table_prefix = "AUX_TABLE_"; | |
383 | if ($input_format eq 'sl') { | |
384 | foreach my $element (@$invmap) { | |
385 | ||
386 | # A regular scalar is not one of the lists we're looking for | |
387 | # at this stage. | |
388 | next unless ref $element; | |
389 | ||
390 | my $joined = join ",", sort @$element; | |
391 | my $already_found = exists $multiples{$joined}; | |
392 | ||
393 | my $i; | |
394 | if ($already_found) { # Use any existing one | |
395 | $i = $multiples{$joined}; | |
396 | } | |
397 | else { # Otherwise increment to get a new table number | |
398 | $i = keys(%multiples) + 1; | |
399 | $multiples{$joined} = $i; | |
400 | } | |
401 | ||
402 | # This changes the inversion map for this entry to not be the | |
403 | # list | |
404 | $element = "use_$aux_table_prefix$i"; | |
405 | ||
406 | # And add to the enum values | |
407 | if (! $already_found) { | |
408 | print $out_fh ",\n\t${name_prefix}$element = -$i"; | |
409 | } | |
410 | } | |
411 | } | |
412 | ||
413 | print $out_fh "\n"; | |
19a5f1d5 KW |
414 | $declaration_type = "${name_prefix}enum"; |
415 | print $out_fh "} $declaration_type;\n"; | |
226b74db | 416 | # Finished with the enum defintion. |
19a5f1d5 KW |
417 | |
418 | $output_format = "${name_prefix}%s"; | |
34623dbb KW |
419 | |
420 | # If there are auxiliary tables, output them. | |
421 | if (%multiples) { | |
422 | ||
423 | print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n"; | |
424 | ||
425 | # Invert keys and values | |
426 | my %inverted_mults; | |
427 | while (my ($key, $value) = each %multiples) { | |
428 | $inverted_mults{$value} = $key; | |
429 | } | |
430 | ||
431 | # Output them in sorted order | |
432 | my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults; | |
433 | ||
434 | # Keep track of how big each aux table is | |
435 | my @aux_counts; | |
436 | ||
437 | # Output each aux table. | |
438 | foreach my $table_number (@sorted_table_list) { | |
439 | my $table = $inverted_mults{$table_number}; | |
440 | print $out_fh "\nstatic const $declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n"; | |
441 | ||
442 | # Earlier, we joined the elements of this table together with a comma | |
443 | my @elements = split ",", $table; | |
444 | ||
445 | $aux_counts[$table_number] = scalar @elements; | |
446 | for my $i (0 .. @elements - 1) { | |
447 | print $out_fh ",\n" if $i > 0; | |
448 | print $out_fh "\t${name_prefix}$elements[$i]"; | |
449 | } | |
450 | print $out_fh "\n};\n"; | |
451 | } | |
452 | ||
453 | # Output the table that is indexed by the absolute value of the | |
454 | # aux table enum and contains pointers to the tables output just | |
455 | # above | |
456 | print $out_fh "\nstatic const $declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n"; | |
457 | print $out_fh "\tNULL,\t/* Placeholder */\n"; | |
458 | for my $i (1 .. @sorted_table_list) { | |
459 | print $out_fh ",\n" if $i > 1; | |
460 | print $out_fh "\t$name_prefix$aux_table_prefix$i"; | |
461 | } | |
462 | print $out_fh "\n};\n"; | |
463 | ||
464 | print $out_fh | |
465 | "\n/* Parallel table to the above, giving the number of elements" | |
466 | . " in each table\n * pointed to */\n"; | |
467 | print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n"; | |
468 | print $out_fh "\t0,\t/* Placeholder */\n"; | |
469 | for my $i (1 .. @sorted_table_list) { | |
470 | print $out_fh ",\n" if $i > 1; | |
471 | print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */"; | |
472 | } | |
473 | print $out_fh "\n};\n"; | |
474 | } # End of outputting the auxiliary and associated tables | |
99f21fb9 KW |
475 | } |
476 | else { | |
477 | die "'$input_format' invmap() format for '$prop_name' unimplemented"; | |
478 | } | |
479 | ||
480 | die "No inversion map for $prop_name" unless defined $invmap | |
481 | && ref $invmap eq 'ARRAY' | |
482 | && $count; | |
483 | ||
226b74db | 484 | # Now output the inversion map proper |
99f21fb9 KW |
485 | print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {"; |
486 | print $out_fh " /* for $charset */" if $charset; | |
487 | print $out_fh "\n"; | |
488 | ||
489 | # The main body are the scalars passed in to this routine. | |
490 | for my $i (0 .. $count - 1) { | |
491 | my $element = $invmap->[$i]; | |
02f811dd KW |
492 | my $full_element_name = prop_value_aliases($prop_name, $element); |
493 | $element = $full_element_name if defined $full_element_name; | |
494 | $element = $name_prefix . $element; | |
99f21fb9 KW |
495 | print $out_fh "\t$element"; |
496 | print $out_fh "," if $i < $count - 1; | |
497 | print $out_fh "\n"; | |
498 | } | |
499 | print $out_fh "};\n"; | |
99f21fb9 KW |
500 | } |
501 | ||
5a7e5385 | 502 | sub mk_invlist_from_sorted_cp_list { |
a02047bf KW |
503 | |
504 | # Returns an inversion list constructed from the sorted input array of | |
505 | # code points | |
506 | ||
507 | my $list_ref = shift; | |
508 | ||
99f21fb9 KW |
509 | return unless @$list_ref; |
510 | ||
a02047bf KW |
511 | # Initialize to just the first element |
512 | my @invlist = ( $list_ref->[0], $list_ref->[0] + 1); | |
513 | ||
514 | # For each succeeding element, if it extends the previous range, adjust | |
515 | # up, otherwise add it. | |
516 | for my $i (1 .. @$list_ref - 1) { | |
517 | if ($invlist[-1] == $list_ref->[$i]) { | |
518 | $invlist[-1]++; | |
519 | } | |
520 | else { | |
521 | push @invlist, $list_ref->[$i], $list_ref->[$i] + 1; | |
522 | } | |
523 | } | |
524 | return @invlist; | |
525 | } | |
526 | ||
527 | # Read in the Case Folding rules, and construct arrays of code points for the | |
528 | # properties we need. | |
529 | my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); | |
530 | die "Could not find inversion map for Case_Folding" unless defined $format; | |
531 | die "Incorrect format '$format' for Case_Folding inversion map" | |
347b9066 KW |
532 | unless $format eq 'al' |
533 | || $format eq 'a'; | |
a02047bf KW |
534 | my @has_multi_char_fold; |
535 | my @is_non_final_fold; | |
536 | ||
537 | for my $i (0 .. @$folds_ref - 1) { | |
538 | next unless ref $folds_ref->[$i]; # Skip single-char folds | |
539 | push @has_multi_char_fold, $cp_ref->[$i]; | |
540 | ||
b6a6e956 | 541 | # Add to the non-finals list each code point that is in a non-final |
a02047bf KW |
542 | # position |
543 | for my $j (0 .. @{$folds_ref->[$i]} - 2) { | |
544 | push @is_non_final_fold, $folds_ref->[$i][$j] | |
545 | unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold; | |
546 | } | |
547 | } | |
548 | ||
a02047bf KW |
549 | sub _Perl_Non_Final_Folds { |
550 | @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; | |
5a7e5385 | 551 | return mk_invlist_from_sorted_cp_list(\@is_non_final_fold); |
a02047bf KW |
552 | } |
553 | ||
99f21fb9 KW |
554 | sub prop_name_for_cmp ($) { # Sort helper |
555 | my $name = shift; | |
556 | ||
557 | # Returns the input lowercased, with non-alphas removed, as well as | |
558 | # everything starting with a comma | |
559 | ||
560 | $name =~ s/,.*//; | |
561 | $name =~ s/[[:^alpha:]]//g; | |
562 | return lc $name; | |
563 | } | |
564 | ||
892d8259 | 565 | sub UpperLatin1 { |
5a7e5385 | 566 | return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); |
892d8259 KW |
567 | } |
568 | ||
289ce9cc KW |
569 | sub output_table_common { |
570 | ||
571 | # Common subroutine to actually output the generated rules table. | |
572 | ||
573 | my ($property, | |
574 | $table_value_defines_ref, | |
575 | $table_ref, | |
576 | $names_ref, | |
577 | $abbreviations_ref) = @_; | |
578 | my $size = @$table_ref; | |
579 | ||
580 | # Output the #define list, sorted by numeric value | |
581 | if ($table_value_defines_ref) { | |
582 | my $max_name_length = 0; | |
583 | my @defines; | |
584 | ||
585 | # Put in order, and at the same time find the longest name | |
586 | while (my ($enum, $value) = each %$table_value_defines_ref) { | |
587 | $defines[$value] = $enum; | |
588 | ||
589 | my $length = length $enum; | |
590 | $max_name_length = $length if $length > $max_name_length; | |
591 | } | |
592 | ||
593 | print $out_fh "\n"; | |
594 | ||
595 | # Output, so that the values are vertically aligned in a column after | |
596 | # the longest name | |
597 | foreach my $i (0 .. @defines - 1) { | |
598 | next unless defined $defines[$i]; | |
599 | printf $out_fh "#define %-*s %2d\n", | |
600 | $max_name_length, | |
601 | $defines[$i], | |
602 | $i; | |
603 | } | |
604 | } | |
605 | ||
606 | my $column_width = 2; # We currently allow 2 digits for the number | |
607 | ||
608 | # If the maximum value in the table is 1, it can be a bool. (Being above | |
609 | # a U8 is not currently handled | |
610 | my $max_element = 0; | |
611 | for my $i (0 .. $size - 1) { | |
612 | for my $j (0 .. $size - 1) { | |
613 | next if $max_element >= $table_ref->[$i][$j]; | |
614 | $max_element = $table_ref->[$i][$j]; | |
615 | } | |
616 | } | |
617 | die "Need wider table column width given '$max_element" | |
618 | if length $max_element > $column_width; | |
619 | ||
620 | my $table_type = ($max_element == 1) | |
621 | ? 'bool' | |
622 | : 'U8'; | |
623 | ||
624 | # If a name is longer than the width set aside for a column, its column | |
625 | # needs to have increased spacing so that the name doesn't get truncated | |
626 | # nor run into an adjacent column | |
627 | my @spacers; | |
628 | ||
629 | # If we are being compiled on a Unicode version earlier than that which | |
630 | # this file was designed for, it may be that some of the property values | |
631 | # aren't in the current release, and so would be undefined if we didn't | |
632 | # define them ourselves. Earlier code has done this, making them | |
633 | # lowercase characters of length one. We look to see if any exist, so | |
634 | # that we can add an annotation to the output table | |
635 | my $has_placeholder = 0; | |
636 | ||
637 | for my $i (0 .. $size - 1) { | |
638 | no warnings 'numeric'; | |
639 | $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax; | |
640 | $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width); | |
641 | } | |
642 | ||
643 | print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n"; | |
644 | ||
645 | # Calculate the column heading line | |
646 | my $header_line = "/* " | |
647 | . (" " x $max_hdr_len) # We let the row heading meld to | |
648 | # the '*/' for those that are at | |
649 | # the max | |
650 | . " " x 3; # Space for '*/ ' | |
651 | # Now each column | |
652 | for my $i (0 .. $size - 1) { | |
653 | $header_line .= sprintf "%s%*s", | |
654 | $spacers[$i], | |
655 | $column_width + 1, # 1 for the ',' | |
656 | $names_ref->[$i]; | |
657 | } | |
658 | $header_line .= " */\n"; | |
659 | ||
660 | # If we have annotations, output it now. | |
661 | if ($has_placeholder || scalar %$abbreviations_ref) { | |
662 | my $text = ""; | |
663 | foreach my $abbr (sort keys %$abbreviations_ref) { | |
664 | $text .= "; " if $text; | |
665 | $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'"; | |
666 | } | |
667 | if ($has_placeholder) { | |
668 | $text .= "; other " if $text; | |
669 | $text .= "lowercase names are placeholders for" | |
670 | . " property values not defined until a later Unicode" | |
671 | . " release, so are irrelevant in this one, as they are" | |
672 | . " not assigned to any code points"; | |
673 | } | |
674 | ||
675 | my $indent = " " x 3; | |
676 | $text = $indent . "/* $text */"; | |
677 | ||
678 | # Wrap the text so that it is no wider than the table, which the | |
679 | # header line gives. | |
680 | my $output_width = length $header_line; | |
681 | while (length $text > $output_width) { | |
682 | my $cur_line = substr($text, 0, $output_width); | |
683 | ||
684 | # Find the first blank back from the right end to wrap at. | |
685 | for (my $i = $output_width -1; $i > 0; $i--) { | |
686 | if (substr($text, $i, 1) eq " ") { | |
687 | print $out_fh substr($text, 0, $i), "\n"; | |
688 | ||
689 | # Set so will look at just the remaining tail (which will | |
690 | # be indented and have a '*' after the indent | |
691 | $text = $indent . " * " . substr($text, $i + 1); | |
692 | last; | |
693 | } | |
694 | } | |
695 | } | |
696 | ||
697 | # And any remaining | |
698 | print $out_fh $text, "\n" if $text; | |
699 | } | |
700 | ||
701 | # We calculated the header line earlier just to get its width so that we | |
702 | # could make sure the annotations fit into that. | |
703 | print $out_fh $header_line; | |
704 | ||
705 | # Now output the bulk of the table. | |
706 | for my $i (0 .. $size - 1) { | |
707 | ||
708 | # First the row heading. | |
709 | printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i]; | |
710 | print $out_fh "{"; # Then the brace for this row | |
711 | ||
712 | # Then each column | |
713 | for my $j (0 .. $size -1) { | |
714 | print $out_fh $spacers[$j]; | |
715 | printf $out_fh "%*d", $column_width, $table_ref->[$i][$j]; | |
716 | print $out_fh "," if $j < $size - 1; | |
717 | } | |
718 | print $out_fh " }"; | |
719 | print $out_fh "," if $i < $size - 1; | |
720 | print $out_fh "\n"; | |
721 | } | |
722 | ||
723 | print $out_fh "};\n"; | |
724 | } | |
725 | ||
973a28ed KW |
726 | sub output_GCB_table() { |
727 | ||
728 | # Create and output the pair table for use in determining Grapheme Cluster | |
729 | # Breaks, given in http://www.unicode.org/reports/tr29/. | |
b0e24409 KW |
730 | my %gcb_actions = ( |
731 | GCB_NOBREAK => 0, | |
732 | GCB_BREAKABLE => 1, | |
733 | GCB_RI_then_RI => 2, # Rules 12 and 13 | |
734 | GCB_EX_then_EM => 3, # Rule 10 | |
735 | ); | |
973a28ed KW |
736 | |
737 | # The table is constructed in reverse order of the rules, to make the | |
738 | # lower-numbered, higher priority ones override the later ones, as the | |
739 | # algorithm stops at the earliest matching rule | |
740 | ||
741 | my @gcb_table; | |
742 | my $table_size = @gcb_short_enums; | |
743 | ||
744 | # Otherwise, break everywhere. | |
b0e24409 | 745 | # GB99 Any ÷ Any |
973a28ed KW |
746 | for my $i (0 .. $table_size - 1) { |
747 | for my $j (0 .. $table_size - 1) { | |
748 | $gcb_table[$i][$j] = 1; | |
749 | } | |
750 | } | |
751 | ||
b0e24409 KW |
752 | # Do not break within emoji flag sequences. That is, do not break between |
753 | # regional indicator (RI) symbols if there is an odd number of RI | |
754 | # characters before the break point. Must be resolved in runtime code. | |
755 | # | |
c492f156 | 756 | # GB12 sot (RI RI)* RI × RI |
b0e24409 KW |
757 | # GB13 [^RI] (RI RI)* RI × RI |
758 | $gcb_table[$gcb_enums{'Regional_Indicator'}] | |
759 | [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI}; | |
760 | ||
761 | # Do not break within emoji modifier sequences or emoji zwj sequences. | |
762 | # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ ) | |
763 | $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0; | |
764 | $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0; | |
765 | ||
766 | # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier | |
767 | $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}] | |
768 | = $gcb_actions{GCB_EX_then_EM}; | |
769 | $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0; | |
770 | $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0; | |
771 | ||
772 | # Do not break before extending characters or ZWJ. | |
973a28ed | 773 | # Do not break before SpacingMarks, or after Prepend characters. |
973a28ed | 774 | # GB9b Prepend × |
b0e24409 KW |
775 | # GB9a × SpacingMark |
776 | # GB9 × ( Extend | ZWJ ) | |
973a28ed | 777 | for my $i (0 .. @gcb_table - 1) { |
289ce9cc | 778 | $gcb_table[$gcb_enums{'Prepend'}][$i] = 0; |
b0e24409 KW |
779 | $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0; |
780 | $gcb_table[$i][$gcb_enums{'Extend'}] = 0; | |
781 | $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0; | |
973a28ed KW |
782 | } |
783 | ||
973a28ed KW |
784 | # Do not break Hangul syllable sequences. |
785 | # GB8 ( LVT | T) × T | |
786 | $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0; | |
787 | $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0; | |
788 | ||
789 | # GB7 ( LV | V ) × ( V | T ) | |
790 | $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0; | |
791 | $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0; | |
792 | $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0; | |
793 | $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0; | |
794 | ||
795 | # GB6 L × ( L | V | LV | LVT ) | |
796 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0; | |
797 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0; | |
798 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0; | |
799 | $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0; | |
800 | ||
289ce9cc KW |
801 | # Do not break between a CR and LF. Otherwise, break before and after |
802 | # controls. | |
973a28ed KW |
803 | # GB5 ÷ ( Control | CR | LF ) |
804 | # GB4 ( Control | CR | LF ) ÷ | |
805 | for my $i (0 .. @gcb_table - 1) { | |
289ce9cc | 806 | $gcb_table[$i][$gcb_enums{'Control'}] = 1; |
973a28ed KW |
807 | $gcb_table[$i][$gcb_enums{'CR'}] = 1; |
808 | $gcb_table[$i][$gcb_enums{'LF'}] = 1; | |
289ce9cc | 809 | $gcb_table[$gcb_enums{'Control'}][$i] = 1; |
973a28ed KW |
810 | $gcb_table[$gcb_enums{'CR'}][$i] = 1; |
811 | $gcb_table[$gcb_enums{'LF'}][$i] = 1; | |
812 | } | |
813 | ||
814 | # GB3 CR × LF | |
815 | $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0; | |
816 | ||
b0e24409 | 817 | # Break at the start and end of text, unless the text is empty |
973a28ed KW |
818 | # GB1 sot ÷ |
819 | # GB2 ÷ eot | |
820 | for my $i (0 .. @gcb_table - 1) { | |
289ce9cc KW |
821 | $gcb_table[$i][$gcb_enums{'EDGE'}] = 1; |
822 | $gcb_table[$gcb_enums{'EDGE'}][$i] = 1; | |
973a28ed | 823 | } |
289ce9cc | 824 | $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0; |
973a28ed | 825 | |
b0e24409 | 826 | output_table_common('GCB', \%gcb_actions, |
289ce9cc | 827 | \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations); |
973a28ed KW |
828 | } |
829 | ||
6b659339 KW |
830 | sub output_LB_table() { |
831 | ||
832 | # Create and output the enums, #defines, and pair table for use in | |
833 | # determining Line Breaks. This uses the default line break algorithm, | |
834 | # given in http://www.unicode.org/reports/tr14/, but tailored by example 7 | |
835 | # in that page, as the Unicode-furnished tests assume that tailoring. | |
836 | ||
6b659339 KW |
837 | # The result is really just true or false. But we follow along with tr14, |
838 | # creating a rule which is false for something like X SP* X. That gets | |
839 | # encoding 2. The rest of the actions are synthetic ones that indicate | |
840 | # some context handling is required. These each are added to the | |
841 | # underlying 0, 1, or 2, instead of replacing them, so that the underlying | |
842 | # value can be retrieved. Actually only rules from 7 through 18 (which | |
843 | # are the ones where space matter) are possible to have 2 added to them. | |
844 | # The others below add just 0 or 1. It might be possible for one | |
845 | # synthetic rule to be added to another, yielding a larger value. This | |
846 | # doesn't happen in the Unicode 8.0 rule set, and as you can see from the | |
847 | # names of the middle grouping below, it is impossible for that to occur | |
848 | # for them because they all start with mutually exclusive classes. That | |
849 | # the final rule can't be added to any of the others isn't obvious from | |
850 | # its name, so it is assigned a power of 2 higher than the others can get | |
851 | # to so any addition would preserve all data. (And the code will reach an | |
852 | # assert(0) on debugging builds should this happen.) | |
853 | my %lb_actions = ( | |
854 | LB_NOBREAK => 0, | |
855 | LB_BREAKABLE => 1, | |
856 | LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2, | |
857 | ||
b0e24409 | 858 | LB_CM_ZWJ_foo => 3, # Rule 9 |
6b659339 KW |
859 | LB_SP_foo => 6, # Rule 18 |
860 | LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25 | |
861 | LB_SY_or_IS_then_various => 11, # Rule 25 | |
862 | LB_HY_or_BA_then_foo => 13, # Rule 21 | |
b0e24409 | 863 | LB_RI_then_RI => 15, # Rule 30a |
6b659339 | 864 | |
b0e24409 | 865 | LB_various_then_PO_or_PR => (1<<5), # Rule 25 |
6b659339 KW |
866 | ); |
867 | ||
6b659339 KW |
868 | # Construct the LB pair table. This is based on the rules in |
869 | # http://www.unicode.org/reports/tr14/, but modified as those rules are | |
870 | # designed for someone taking a string of text and sequentially going | |
871 | # through it to find the break opportunities, whereas, Perl requires | |
872 | # determining if a given random spot is a break opportunity, without | |
873 | # knowing all the entire string before it. | |
874 | # | |
875 | # The table is constructed in reverse order of the rules, to make the | |
876 | # lower-numbered, higher priority ones override the later ones, as the | |
877 | # algorithm stops at the earliest matching rule | |
878 | ||
879 | my @lb_table; | |
880 | my $table_size = @lb_short_enums; | |
881 | ||
882 | # LB31. Break everywhere else | |
883 | for my $i (0 .. $table_size - 1) { | |
884 | for my $j (0 .. $table_size - 1) { | |
885 | $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'}; | |
886 | } | |
887 | } | |
888 | ||
b0e24409 KW |
889 | # LB30b Do not break between an emoji base and an emoji modifier. |
890 | # EB × EM | |
891 | $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}] | |
892 | = $lb_actions{'LB_NOBREAK'}; | |
893 | ||
894 | # LB30a Break between two regional indicator symbols if and only if there | |
895 | # are an even number of regional indicators preceding the position of the | |
896 | # break. | |
897 | # sot (RI RI)* RI × RI | |
898 | # [^RI] (RI RI)* RI × RI | |
289ce9cc | 899 | $lb_table[$lb_enums{'Regional_Indicator'}] |
b0e24409 | 900 | [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'}; |
6b659339 KW |
901 | |
902 | # LB30 Do not break between letters, numbers, or ordinary symbols and | |
903 | # opening or closing parentheses. | |
904 | # (AL | HL | NU) × OP | |
289ce9cc KW |
905 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}] |
906 | = $lb_actions{'LB_NOBREAK'}; | |
907 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}] | |
908 | = $lb_actions{'LB_NOBREAK'}; | |
909 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}] | |
910 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
911 | |
912 | # CP × (AL | HL | NU) | |
289ce9cc KW |
913 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}] |
914 | = $lb_actions{'LB_NOBREAK'}; | |
915 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}] | |
916 | = $lb_actions{'LB_NOBREAK'}; | |
917 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}] | |
918 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
919 | |
920 | # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). | |
921 | # IS × (AL | HL) | |
289ce9cc KW |
922 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}] |
923 | = $lb_actions{'LB_NOBREAK'}; | |
924 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
925 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
926 | |
927 | # LB28 Do not break between alphabetics (“at”). | |
928 | # (AL | HL) × (AL | HL) | |
289ce9cc KW |
929 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}] |
930 | = $lb_actions{'LB_NOBREAK'}; | |
931 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}] | |
932 | = $lb_actions{'LB_NOBREAK'}; | |
933 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}] | |
934 | = $lb_actions{'LB_NOBREAK'}; | |
935 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}] | |
936 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
937 | |
938 | # LB27 Treat a Korean Syllable Block the same as ID. | |
939 | # (JL | JV | JT | H2 | H3) × IN | |
289ce9cc KW |
940 | $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}] |
941 | = $lb_actions{'LB_NOBREAK'}; | |
942 | $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}] | |
943 | = $lb_actions{'LB_NOBREAK'}; | |
944 | $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}] | |
945 | = $lb_actions{'LB_NOBREAK'}; | |
946 | $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}] | |
947 | = $lb_actions{'LB_NOBREAK'}; | |
948 | $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}] | |
949 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
950 | |
951 | # (JL | JV | JT | H2 | H3) × PO | |
289ce9cc KW |
952 | $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}] |
953 | = $lb_actions{'LB_NOBREAK'}; | |
954 | $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}] | |
955 | = $lb_actions{'LB_NOBREAK'}; | |
956 | $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}] | |
957 | = $lb_actions{'LB_NOBREAK'}; | |
958 | $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}] | |
959 | = $lb_actions{'LB_NOBREAK'}; | |
960 | $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}] | |
961 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
962 | |
963 | # PR × (JL | JV | JT | H2 | H3) | |
289ce9cc KW |
964 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}] |
965 | = $lb_actions{'LB_NOBREAK'}; | |
966 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}] | |
967 | = $lb_actions{'LB_NOBREAK'}; | |
968 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}] | |
969 | = $lb_actions{'LB_NOBREAK'}; | |
970 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}] | |
971 | = $lb_actions{'LB_NOBREAK'}; | |
972 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}] | |
973 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
974 | |
975 | # LB26 Do not break a Korean syllable. | |
976 | # JL × (JL | JV | H2 | H3) | |
977 | $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'}; | |
978 | $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
979 | $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'}; | |
980 | $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'}; | |
981 | ||
982 | # (JV | H2) × (JV | JT) | |
983 | $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
984 | $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; | |
985 | $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
986 | $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
987 | ||
988 | # (JT | H3) × JT | |
989 | $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
990 | $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; | |
991 | ||
992 | # LB25 Do not break between the following pairs of classes relevant to | |
993 | # numbers, as tailored by example 7 in | |
994 | # http://www.unicode.org/reports/tr14/#Examples | |
995 | # We follow that tailoring because Unicode's test cases expect it | |
996 | # (PR | PO) × ( OP | HY )? NU | |
289ce9cc KW |
997 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}] |
998 | = $lb_actions{'LB_NOBREAK'}; | |
999 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}] | |
1000 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1001 | |
1002 | # Given that (OP | HY )? is optional, we have to test for it in code. | |
1003 | # We add in the action (instead of overriding) for this, so that in | |
1004 | # the code we can recover the underlying break value. | |
289ce9cc | 1005 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}] |
6b659339 | 1006 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
289ce9cc | 1007 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}] |
6b659339 | 1008 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
289ce9cc | 1009 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}] |
6b659339 | 1010 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
289ce9cc | 1011 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}] |
6b659339 KW |
1012 | += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; |
1013 | ||
1014 | # ( OP | HY ) × NU | |
289ce9cc KW |
1015 | $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}] |
1016 | = $lb_actions{'LB_NOBREAK'}; | |
1017 | $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}] | |
1018 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1019 | |
1020 | # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP ) | |
1021 | # which can be rewritten as: | |
1022 | # NU (SY | IS)* × (NU | SY | IS | CL | CP ) | |
289ce9cc KW |
1023 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}] |
1024 | = $lb_actions{'LB_NOBREAK'}; | |
1025 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}] | |
1026 | = $lb_actions{'LB_NOBREAK'}; | |
1027 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}] | |
1028 | = $lb_actions{'LB_NOBREAK'}; | |
1029 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}] | |
1030 | = $lb_actions{'LB_NOBREAK'}; | |
1031 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}] | |
1032 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1033 | |
1034 | # Like earlier where we have to test in code, we add in the action so | |
1035 | # that we can recover the underlying values. This is done in rules | |
1036 | # below, as well. The code assumes that we haven't added 2 actions. | |
1037 | # Shoul a later Unicode release break that assumption, then tests | |
1038 | # should start failing. | |
289ce9cc | 1039 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}] |
6b659339 | 1040 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1041 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}] |
6b659339 | 1042 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1043 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1044 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1045 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1046 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1047 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}] |
6b659339 | 1048 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1049 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}] |
6b659339 | 1050 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1051 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}] |
6b659339 | 1052 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1053 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1054 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1055 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1056 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
289ce9cc | 1057 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}] |
6b659339 KW |
1058 | += $lb_actions{'LB_SY_or_IS_then_various'}; |
1059 | ||
1060 | # NU (NU | SY | IS)* (CL | CP)? × (PO | PR) | |
1061 | # which can be rewritten as: | |
1062 | # NU (SY | IS)* (CL | CP)? × (PO | PR) | |
289ce9cc KW |
1063 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}] |
1064 | = $lb_actions{'LB_NOBREAK'}; | |
1065 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}] | |
1066 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1067 | |
289ce9cc | 1068 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1069 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1070 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1071 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1072 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 | 1073 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1074 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}] |
6b659339 KW |
1075 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
1076 | ||
289ce9cc | 1077 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1078 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1079 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1080 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1081 | $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 | 1082 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
289ce9cc | 1083 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}] |
6b659339 KW |
1084 | += $lb_actions{'LB_various_then_PO_or_PR'}; |
1085 | ||
b0e24409 KW |
1086 | # LB24 Do not break between numeric prefix/postfix and letters, or between |
1087 | # letters and prefix/postfix. | |
1088 | # (PR | PO) × (AL | HL) | |
289ce9cc KW |
1089 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}] |
1090 | = $lb_actions{'LB_NOBREAK'}; | |
1091 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1092 | = $lb_actions{'LB_NOBREAK'}; | |
289ce9cc KW |
1093 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}] |
1094 | = $lb_actions{'LB_NOBREAK'}; | |
1095 | $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1096 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1097 | |
b0e24409 KW |
1098 | # (AL | HL) × (PR | PO) |
1099 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}] | |
1100 | = $lb_actions{'LB_NOBREAK'}; | |
1101 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}] | |
1102 | = $lb_actions{'LB_NOBREAK'}; | |
1103 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}] | |
1104 | = $lb_actions{'LB_NOBREAK'}; | |
1105 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}] | |
1106 | = $lb_actions{'LB_NOBREAK'}; | |
1107 | ||
1108 | # LB23a Do not break between numeric prefixes and ideographs, or between | |
1109 | # ideographs and numeric postfixes. | |
1110 | # PR × (ID | EB | EM) | |
1111 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}] | |
1112 | = $lb_actions{'LB_NOBREAK'}; | |
1113 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}] | |
1114 | = $lb_actions{'LB_NOBREAK'}; | |
1115 | $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}] | |
1116 | = $lb_actions{'LB_NOBREAK'}; | |
1117 | ||
1118 | # (ID | EB | EM) × PO | |
289ce9cc KW |
1119 | $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}] |
1120 | = $lb_actions{'LB_NOBREAK'}; | |
b0e24409 KW |
1121 | $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}] |
1122 | = $lb_actions{'LB_NOBREAK'}; | |
1123 | $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}] | |
1124 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1125 | |
b0e24409 | 1126 | # LB23 Do not break between digits and letters |
6b659339 | 1127 | # (AL | HL) × NU |
289ce9cc KW |
1128 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}] |
1129 | = $lb_actions{'LB_NOBREAK'}; | |
1130 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}] | |
1131 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1132 | |
1133 | # NU × (AL | HL) | |
289ce9cc KW |
1134 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}] |
1135 | = $lb_actions{'LB_NOBREAK'}; | |
1136 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}] | |
1137 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1138 | |
1139 | # LB22 Do not break between two ellipses, or between letters, numbers or | |
1140 | # exclamations and ellipsis. | |
1141 | # (AL | HL) × IN | |
289ce9cc KW |
1142 | $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}] |
1143 | = $lb_actions{'LB_NOBREAK'}; | |
1144 | $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}] | |
1145 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1146 | |
289ce9cc KW |
1147 | # Exclamation × IN |
1148 | $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}] | |
1149 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1150 | |
b0e24409 | 1151 | # (ID | EB | EM) × IN |
289ce9cc KW |
1152 | $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}] |
1153 | = $lb_actions{'LB_NOBREAK'}; | |
b0e24409 KW |
1154 | $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}] |
1155 | = $lb_actions{'LB_NOBREAK'}; | |
1156 | $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}] | |
1157 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1158 | |
1159 | # IN × IN | |
289ce9cc KW |
1160 | $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}] |
1161 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1162 | |
1163 | # NU × IN | |
289ce9cc KW |
1164 | $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}] |
1165 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1166 | |
1167 | # LB21b Don’t break between Solidus and Hebrew letters. | |
1168 | # SY × HL | |
289ce9cc KW |
1169 | $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}] |
1170 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1171 | |
1172 | # LB21a Don't break after Hebrew + Hyphen. | |
1173 | # HL (HY | BA) × | |
1174 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1175 | $lb_table[$lb_enums{'Hyphen'}][$i] |
1176 | += $lb_actions{'LB_HY_or_BA_then_foo'}; | |
1177 | $lb_table[$lb_enums{'Break_After'}][$i] | |
1178 | += $lb_actions{'LB_HY_or_BA_then_foo'}; | |
6b659339 KW |
1179 | } |
1180 | ||
1181 | # LB21 Do not break before hyphen-minus, other hyphens, fixed-width | |
1182 | # spaces, small kana, and other non-starters, or after acute accents. | |
1183 | # × BA | |
1184 | # × HY | |
1185 | # × NS | |
1186 | # BB × | |
1187 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1188 | $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'}; |
1189 | $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'}; | |
1190 | $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'}; | |
1191 | $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1192 | } |
1193 | ||
1194 | # LB20 Break before and after unresolved CB. | |
1195 | # ÷ CB | |
1196 | # CB ÷ | |
1197 | # Conditional breaks should be resolved external to the line breaking | |
1198 | # rules. However, the default action is to treat unresolved CB as breaking | |
1199 | # before and after. | |
1200 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1201 | $lb_table[$i][$lb_enums{'Contingent_Break'}] |
1202 | = $lb_actions{'LB_BREAKABLE'}; | |
1203 | $lb_table[$lb_enums{'Contingent_Break'}][$i] | |
1204 | = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 KW |
1205 | } |
1206 | ||
1207 | # LB19 Do not break before or after quotation marks, such as ‘ ” ’. | |
1208 | # × QU | |
1209 | # QU × | |
1210 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1211 | $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'}; |
1212 | $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1213 | } |
1214 | ||
1215 | # LB18 Break after spaces | |
1216 | # SP ÷ | |
1217 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1218 | $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'}; |
6b659339 KW |
1219 | } |
1220 | ||
1221 | # LB17 Do not break within ‘——’, even with intervening spaces. | |
1222 | # B2 SP* × B2 | |
289ce9cc | 1223 | $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}] |
6b659339 KW |
1224 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1225 | ||
1226 | # LB16 Do not break between closing punctuation and a nonstarter even with | |
1227 | # intervening spaces. | |
1228 | # (CL | CP) SP* × NS | |
289ce9cc | 1229 | $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}] |
6b659339 | 1230 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1231 | $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}] |
6b659339 KW |
1232 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1233 | ||
1234 | ||
1235 | # LB15 Do not break within ‘”[’, even with intervening spaces. | |
1236 | # QU SP* × OP | |
289ce9cc | 1237 | $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}] |
6b659339 KW |
1238 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1239 | ||
1240 | # LB14 Do not break after ‘[’, even after spaces. | |
1241 | # OP SP* × | |
1242 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1243 | $lb_table[$lb_enums{'Open_Punctuation'}][$i] |
6b659339 KW |
1244 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1245 | } | |
1246 | ||
1247 | # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as | |
1248 | # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples | |
1249 | # [^NU] × CL | |
1250 | # [^NU] × CP | |
1251 | # × EX | |
1252 | # [^NU] × IS | |
1253 | # [^NU] × SY | |
1254 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1255 | $lb_table[$i][$lb_enums{'Exclamation'}] |
6b659339 KW |
1256 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1257 | ||
289ce9cc | 1258 | next if $i == $lb_enums{'Numeric'}; |
6b659339 | 1259 | |
289ce9cc | 1260 | $lb_table[$i][$lb_enums{'Close_Punctuation'}] |
6b659339 | 1261 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1262 | $lb_table[$i][$lb_enums{'Close_Parenthesis'}] |
6b659339 | 1263 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1264 | $lb_table[$i][$lb_enums{'Infix_Numeric'}] |
6b659339 | 1265 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1266 | $lb_table[$i][$lb_enums{'Break_Symbols'}] |
6b659339 KW |
1267 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1268 | } | |
1269 | ||
1270 | # LB12a Do not break before NBSP and related characters, except after | |
1271 | # spaces and hyphens. | |
1272 | # [^SP BA HY] × GL | |
1273 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1274 | next if $i == $lb_enums{'Space'} |
1275 | || $i == $lb_enums{'Break_After'} | |
1276 | || $i == $lb_enums{'Hyphen'}; | |
6b659339 KW |
1277 | |
1278 | # We don't break, but if a property above has said don't break even | |
1279 | # with space between, don't override that (also in the next few rules) | |
289ce9cc | 1280 | next if $lb_table[$i][$lb_enums{'Glue'}] |
6b659339 | 1281 | == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1282 | $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1283 | } |
1284 | ||
1285 | # LB12 Do not break after NBSP and related characters. | |
1286 | # GL × | |
1287 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1288 | next if $lb_table[$lb_enums{'Glue'}][$i] |
6b659339 | 1289 | == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
289ce9cc | 1290 | $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1291 | } |
1292 | ||
1293 | # LB11 Do not break before or after Word joiner and related characters. | |
1294 | # × WJ | |
1295 | # WJ × | |
1296 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1297 | if ($lb_table[$i][$lb_enums{'Word_Joiner'}] |
6b659339 KW |
1298 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
1299 | { | |
289ce9cc | 1300 | $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'}; |
6b659339 | 1301 | } |
289ce9cc | 1302 | if ($lb_table[$lb_enums{'Word_Joiner'}][$i] |
6b659339 KW |
1303 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
1304 | { | |
289ce9cc | 1305 | $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'}; |
6b659339 KW |
1306 | } |
1307 | } | |
1308 | ||
1309 | # Special case this here to avoid having to do a special case in the code, | |
1310 | # by making this the same as other things with a SP in front of them that | |
1311 | # don't break, we avoid an extra test | |
289ce9cc | 1312 | $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}] |
6b659339 KW |
1313 | = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; |
1314 | ||
1315 | # LB9 and LB10 are done in the same loop | |
1316 | # | |
1317 | # LB9 Do not break a combining character sequence; treat it as if it has | |
1318 | # the line breaking class of the base character in all of the | |
b0e24409 KW |
1319 | # higher-numbered rules. Treat ZWJ as if it were CM |
1320 | # Treat X (CM|ZWJ)* as if it were X. | |
6b659339 KW |
1321 | # where X is any line break class except BK, CR, LF, NL, SP, or ZW. |
1322 | ||
b0e24409 KW |
1323 | # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the |
1324 | # case where a CM or ZWJ is the first character on the line or follows SP, | |
1325 | # BK, CR, LF, NL, or ZW. | |
6b659339 KW |
1326 | for my $i (0 .. @lb_table - 1) { |
1327 | ||
b0e24409 KW |
1328 | # When the CM or ZWJ is the first in the pair, we don't know without |
1329 | # looking behind whether the CM or ZWJ is going to attach to an | |
1330 | # earlier character, or not. So have to figure this out at runtime in | |
1331 | # the code | |
1332 | $lb_table[$lb_enums{'Combining_Mark'}][$i] | |
1333 | = $lb_actions{'LB_CM_ZWJ_foo'}; | |
1334 | $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'}; | |
289ce9cc KW |
1335 | |
1336 | if ( $i == $lb_enums{'Mandatory_Break'} | |
1337 | || $i == $lb_enums{'EDGE'} | |
1338 | || $i == $lb_enums{'Carriage_Return'} | |
1339 | || $i == $lb_enums{'Line_Feed'} | |
1340 | || $i == $lb_enums{'Next_Line'} | |
1341 | || $i == $lb_enums{'Space'} | |
1342 | || $i == $lb_enums{'ZWSpace'}) | |
6b659339 KW |
1343 | { |
1344 | # For these classes, a following CM doesn't combine, and should do | |
289ce9cc KW |
1345 | # whatever 'Alphabetic' would do. |
1346 | $lb_table[$i][$lb_enums{'Combining_Mark'}] | |
1347 | = $lb_table[$i][$lb_enums{'Alphabetic'}]; | |
b0e24409 KW |
1348 | $lb_table[$i][$lb_enums{'ZWJ'}] |
1349 | = $lb_table[$i][$lb_enums{'Alphabetic'}]; | |
6b659339 KW |
1350 | } |
1351 | else { | |
b0e24409 KW |
1352 | # For these classes, the CM or ZWJ combines, so doesn't break, |
1353 | # inheriting the type of nobreak from the master character. | |
289ce9cc | 1354 | if ($lb_table[$i][$lb_enums{'Combining_Mark'}] |
6b659339 KW |
1355 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) |
1356 | { | |
289ce9cc KW |
1357 | $lb_table[$i][$lb_enums{'Combining_Mark'}] |
1358 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 | 1359 | } |
b0e24409 KW |
1360 | if ($lb_table[$i][$lb_enums{'ZWJ'}] |
1361 | != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) | |
1362 | { | |
1363 | $lb_table[$i][$lb_enums{'ZWJ'}] | |
1364 | = $lb_actions{'LB_NOBREAK'}; | |
1365 | } | |
6b659339 KW |
1366 | } |
1367 | } | |
1368 | ||
b0e24409 KW |
1369 | # LB8a Do not break between a zero width joiner and an ideograph, emoji |
1370 | # base or emoji modifier. This rule prevents breaks within emoji joiner | |
1371 | # sequences. | |
1372 | # ZWJ × (ID | EB | EM) | |
1373 | $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}] | |
1374 | = $lb_actions{'LB_NOBREAK'}; | |
1375 | $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}] | |
1376 | = $lb_actions{'LB_NOBREAK'}; | |
1377 | $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}] | |
1378 | = $lb_actions{'LB_NOBREAK'}; | |
1379 | ||
6b659339 KW |
1380 | # LB8 Break before any character following a zero-width space, even if one |
1381 | # or more spaces intervene. | |
1382 | # ZW SP* ÷ | |
1383 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1384 | $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'}; |
6b659339 KW |
1385 | } |
1386 | ||
1387 | # Because of LB8-10, we need to look at context for "SP x", and this must | |
1388 | # be done in the code. So override the existing rules for that, by adding | |
1389 | # a constant to get new rules that tell the code it needs to look at | |
1390 | # context. By adding this action instead of replacing the existing one, | |
1391 | # we can get back to the original rule if necessary. | |
1392 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc | 1393 | $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'}; |
6b659339 KW |
1394 | } |
1395 | ||
1396 | # LB7 Do not break before spaces or zero width space. | |
1397 | # × SP | |
1398 | # × ZW | |
1399 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1400 | $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'}; |
1401 | $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1402 | } |
1403 | ||
1404 | # LB6 Do not break before hard line breaks. | |
1405 | # × ( BK | CR | LF | NL ) | |
1406 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1407 | $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'}; |
1408 | $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'}; | |
1409 | $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'}; | |
1410 | $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1411 | } |
1412 | ||
1413 | # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. | |
1414 | # CR × LF | |
1415 | # CR ! | |
1416 | # LF ! | |
1417 | # NL ! | |
1418 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1419 | $lb_table[$lb_enums{'Carriage_Return'}][$i] |
1420 | = $lb_actions{'LB_BREAKABLE'}; | |
1421 | $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'}; | |
1422 | $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 | 1423 | } |
289ce9cc KW |
1424 | $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}] |
1425 | = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1426 | |
1427 | # LB4 Always break after hard line breaks. | |
1428 | # BK ! | |
1429 | for my $i (0 .. @lb_table - 1) { | |
289ce9cc KW |
1430 | $lb_table[$lb_enums{'Mandatory_Break'}][$i] |
1431 | = $lb_actions{'LB_BREAKABLE'}; | |
6b659339 KW |
1432 | } |
1433 | ||
6b659339 KW |
1434 | # LB3 Always break at the end of text. |
1435 | # ! eot | |
b0e24409 KW |
1436 | # LB2 Never break at the start of text. |
1437 | # sot × | |
6b659339 | 1438 | for my $i (0 .. @lb_table - 1) { |
289ce9cc KW |
1439 | $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'}; |
1440 | $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'}; | |
6b659339 KW |
1441 | } |
1442 | ||
1443 | # LB1 Assign a line breaking class to each code point of the input. | |
1444 | # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes | |
1445 | # depending on criteria outside the scope of this algorithm. | |
1446 | # | |
1447 | # In the absence of such criteria all characters with a specific | |
1448 | # combination of original class and General_Category property value are | |
1449 | # resolved as follows: | |
1450 | # Original Resolved General_Category | |
1451 | # AI, SG, XX AL Any | |
1452 | # SA CM Only Mn or Mc | |
1453 | # SA AL Any except Mn and Mc | |
1454 | # CJ NS Any | |
1455 | # | |
1456 | # This is done in mktables, so we never see any of the remapped-from | |
1457 | # classes. | |
1458 | ||
289ce9cc KW |
1459 | output_table_common('LB', \%lb_actions, |
1460 | \@lb_table, \@lb_short_enums, \%lb_abbreviations); | |
6b659339 KW |
1461 | } |
1462 | ||
7e54b87f KW |
1463 | sub output_WB_table() { |
1464 | ||
1465 | # Create and output the enums, #defines, and pair table for use in | |
1466 | # determining Word Breaks, given in http://www.unicode.org/reports/tr29/. | |
1467 | ||
1468 | # This uses the same mechanism in the other bounds tables generated by | |
1469 | # this file. The actions that could override a 0 or 1 are added to those | |
1470 | # numbers; the actions that clearly don't depend on the underlying rule | |
1471 | # simply overwrite | |
1472 | my %wb_actions = ( | |
1473 | WB_NOBREAK => 0, | |
1474 | WB_BREAKABLE => 1, | |
1475 | WB_hs_then_hs => 2, | |
b0e24409 | 1476 | WB_Ex_or_FO_or_ZWJ_then_foo => 3, |
7e54b87f KW |
1477 | WB_DQ_then_HL => 4, |
1478 | WB_HL_then_DQ => 6, | |
1479 | WB_LE_or_HL_then_MB_or_ML_or_SQ => 8, | |
1480 | WB_MB_or_ML_or_SQ_then_LE_or_HL => 10, | |
1481 | WB_MB_or_MN_or_SQ_then_NU => 12, | |
1482 | WB_NU_then_MB_or_MN_or_SQ => 14, | |
b0e24409 | 1483 | WB_RI_then_RI => 16, |
7e54b87f KW |
1484 | ); |
1485 | ||
7e54b87f KW |
1486 | # Construct the WB pair table. |
1487 | # The table is constructed in reverse order of the rules, to make the | |
1488 | # lower-numbered, higher priority ones override the later ones, as the | |
1489 | # algorithm stops at the earliest matching rule | |
1490 | ||
1491 | my @wb_table; | |
1492 | my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN | |
39c4defe | 1493 | die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i; |
7e54b87f KW |
1494 | |
1495 | # Otherwise, break everywhere (including around ideographs). | |
b0e24409 | 1496 | # WB99 Any ÷ Any |
7e54b87f KW |
1497 | for my $i (0 .. $table_size - 1) { |
1498 | for my $j (0 .. $table_size - 1) { | |
1499 | $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'}; | |
1500 | } | |
1501 | } | |
1502 | ||
b0e24409 KW |
1503 | # Do not break within emoji flag sequences. That is, do not break between |
1504 | # regional indicator (RI) symbols if there is an odd number of RI | |
1505 | # characters before the break point. | |
1506 | # WB16 [^RI] (RI RI)* RI × RI | |
c492f156 | 1507 | # WB15 sot (RI RI)* RI × RI |
289ce9cc | 1508 | $wb_table[$wb_enums{'Regional_Indicator'}] |
b0e24409 KW |
1509 | [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'}; |
1510 | ||
1511 | # Do not break within emoji modifier sequences. | |
1512 | # WB14 ( E_Base | EBG ) × E_Modifier | |
1513 | $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}] | |
1514 | = $wb_actions{'WB_NOBREAK'}; | |
1515 | $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}] | |
1516 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1517 | |
1518 | # Do not break from extenders. | |
1519 | # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) | |
289ce9cc KW |
1520 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}] |
1521 | = $wb_actions{'WB_NOBREAK'}; | |
1522 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}] | |
1523 | = $wb_actions{'WB_NOBREAK'}; | |
1524 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}] | |
1525 | = $wb_actions{'WB_NOBREAK'}; | |
1526 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}] | |
1527 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1528 | |
1529 | # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) | |
1530 | # × # ExtendNumLet | |
289ce9cc KW |
1531 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}] |
1532 | = $wb_actions{'WB_NOBREAK'}; | |
1533 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}] | |
1534 | = $wb_actions{'WB_NOBREAK'}; | |
1535 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}] | |
1536 | = $wb_actions{'WB_NOBREAK'}; | |
1537 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}] | |
1538 | = $wb_actions{'WB_NOBREAK'}; | |
1539 | $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}] | |
1540 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1541 | |
1542 | # Do not break between Katakana. | |
1543 | # WB13 Katakana × Katakana | |
289ce9cc KW |
1544 | $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] |
1545 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1546 | |
1547 | # Do not break within sequences, such as “3.2” or “3,456.789”. | |
1548 | # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric | |
289ce9cc | 1549 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1550 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 1551 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}] |
7e54b87f | 1552 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
289ce9cc | 1553 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
1554 | += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; |
1555 | ||
1556 | # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric | |
289ce9cc | 1557 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}] |
7e54b87f | 1558 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 1559 | $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}] |
7e54b87f | 1560 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
289ce9cc | 1561 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}] |
7e54b87f KW |
1562 | += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; |
1563 | ||
1564 | # Do not break within sequences of digits, or digits adjacent to letters | |
1565 | # (“3a”, or “A3”). | |
1566 | # WB10 Numeric × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
1567 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}] |
1568 | = $wb_actions{'WB_NOBREAK'}; | |
1569 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}] | |
1570 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1571 | |
1572 | # WB9 (ALetter | Hebrew_Letter) × Numeric | |
289ce9cc KW |
1573 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}] |
1574 | = $wb_actions{'WB_NOBREAK'}; | |
1575 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}] | |
1576 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1577 | |
1578 | # WB8 Numeric × Numeric | |
289ce9cc KW |
1579 | $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] |
1580 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1581 | |
1582 | # Do not break letters across certain punctuation. | |
1583 | # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter | |
289ce9cc KW |
1584 | $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}] |
1585 | += $wb_actions{'WB_DQ_then_HL'}; | |
7e54b87f KW |
1586 | |
1587 | # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter | |
289ce9cc KW |
1588 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}] |
1589 | += $wb_actions{'WB_HL_then_DQ'}; | |
7e54b87f KW |
1590 | |
1591 | # WB7a Hebrew_Letter × Single_Quote | |
289ce9cc KW |
1592 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
1593 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1594 | |
1595 | # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) | |
1596 | # × (ALetter | Hebrew_Letter) | |
289ce9cc | 1597 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}] |
7e54b87f | 1598 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1599 | $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 1600 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1601 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}] |
7e54b87f | 1602 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1603 | $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f | 1604 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1605 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}] |
7e54b87f | 1606 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
289ce9cc | 1607 | $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}] |
7e54b87f KW |
1608 | += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; |
1609 | ||
1610 | # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | |
1611 | # | Single_Quote) (ALetter | Hebrew_Letter) | |
289ce9cc | 1612 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1613 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1614 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}] |
7e54b87f | 1615 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1616 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 1617 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1618 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}] |
7e54b87f | 1619 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1620 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}] |
7e54b87f | 1621 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
289ce9cc | 1622 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] |
7e54b87f KW |
1623 | += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; |
1624 | ||
1625 | # Do not break between most letters. | |
1626 | # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) | |
289ce9cc KW |
1627 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}] |
1628 | = $wb_actions{'WB_NOBREAK'}; | |
1629 | $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}] | |
1630 | = $wb_actions{'WB_NOBREAK'}; | |
1631 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}] | |
1632 | = $wb_actions{'WB_NOBREAK'}; | |
1633 | $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}] | |
1634 | = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f | 1635 | |
b0e24409 KW |
1636 | # Ignore Format and Extend characters, except after sot, CR, LF, and |
1637 | # Newline. This also has the effect of: Any × (Format | Extend | ZWJ) | |
1638 | # WB4 X (Extend | Format | ZWJ)* → X | |
7e54b87f | 1639 | for my $i (0 .. @wb_table - 1) { |
289ce9cc | 1640 | $wb_table[$wb_enums{'Extend'}][$i] |
b0e24409 | 1641 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; |
289ce9cc | 1642 | $wb_table[$wb_enums{'Format'}][$i] |
b0e24409 KW |
1643 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; |
1644 | $wb_table[$wb_enums{'ZWJ'}][$i] | |
1645 | = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'}; | |
1646 | } | |
1647 | for my $i (0 .. @wb_table - 1) { | |
1648 | $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; | |
1649 | $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; | |
1650 | $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1651 | } |
1652 | ||
1653 | # Implied is that these attach to the character before them, except for | |
1654 | # the characters that mark the end of a region of text. The rules below | |
1655 | # override the ones set up here, for all the characters that need | |
1656 | # overriding. | |
1657 | for my $i (0 .. @wb_table - 1) { | |
289ce9cc KW |
1658 | $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; |
1659 | $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; | |
7e54b87f KW |
1660 | } |
1661 | ||
b0e24409 KW |
1662 | # Do not break within emoji zwj sequences. |
1663 | # WB3c ZWJ × ( Glue_After_Zwj | EBG ) | |
1664 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}] | |
1665 | = $wb_actions{'WB_NOBREAK'}; | |
1666 | $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}] | |
1667 | = $wb_actions{'WB_NOBREAK'}; | |
1668 | ||
7e54b87f KW |
1669 | # Break before and after white space |
1670 | # WB3b ÷ (Newline | CR | LF) | |
1671 | # WB3a (Newline | CR | LF) ÷ | |
1672 | # et. al. | |
289ce9cc | 1673 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
7e54b87f KW |
1674 | for my $j (0 .. @wb_table - 1) { |
1675 | $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'}; | |
1676 | $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'}; | |
1677 | } | |
1678 | } | |
1679 | ||
1680 | # But do not break within white space. | |
1681 | # WB3 CR × LF | |
1682 | # et.al. | |
289ce9cc KW |
1683 | for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { |
1684 | for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { | |
7e54b87f KW |
1685 | $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'}; |
1686 | } | |
1687 | } | |
1688 | ||
b0e24409 | 1689 | # And do not break horizontal space followed by Extend or Format or ZWJ |
289ce9cc KW |
1690 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}] |
1691 | = $wb_actions{'WB_NOBREAK'}; | |
1692 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}] | |
1693 | = $wb_actions{'WB_NOBREAK'}; | |
b0e24409 KW |
1694 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}] |
1695 | = $wb_actions{'WB_NOBREAK'}; | |
289ce9cc KW |
1696 | $wb_table[$wb_enums{'Perl_Tailored_HSpace'}] |
1697 | [$wb_enums{'Perl_Tailored_HSpace'}] | |
1698 | = $wb_actions{'WB_hs_then_hs'}; | |
7e54b87f | 1699 | |
b0e24409 KW |
1700 | # Break at the start and end of text, unless the text is empty |
1701 | # WB2 Any ÷ eot | |
1702 | # WB1 sot ÷ Any | |
7e54b87f | 1703 | for my $i (0 .. @wb_table - 1) { |
289ce9cc KW |
1704 | $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'}; |
1705 | $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'}; | |
7e54b87f | 1706 | } |
289ce9cc | 1707 | $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0; |
7e54b87f | 1708 | |
289ce9cc KW |
1709 | output_table_common('WB', \%wb_actions, |
1710 | \@wb_table, \@wb_short_enums, \%wb_abbreviations); | |
7e54b87f KW |
1711 | } |
1712 | ||
9d9177be KW |
1713 | output_invlist("Latin1", [ 0, 256 ]); |
1714 | output_invlist("AboveLatin1", [ 256 ]); | |
1715 | ||
bffc0129 | 1716 | end_file_pound_if; |
43b443dd | 1717 | |
3f427fd9 KW |
1718 | # We construct lists for all the POSIX and backslash sequence character |
1719 | # classes in two forms: | |
1720 | # 1) ones which match only in the ASCII range | |
1721 | # 2) ones which match either in the Latin1 range, or the entire Unicode range | |
1722 | # | |
1723 | # These get compiled in, and hence affect the memory footprint of every Perl | |
1724 | # program, even those not using Unicode. To minimize the size, currently | |
1725 | # the Latin1 version is generated for the beyond ASCII range except for those | |
1726 | # lists that are quite small for the entire range, such as for \s, which is 22 | |
1727 | # UVs long plus 4 UVs (currently) for the header. | |
1728 | # | |
1729 | # To save even more memory, the ASCII versions could be derived from the | |
1730 | # larger ones at runtime, saving some memory (minus the expense of the machine | |
1731 | # instructions to do so), but these are all small anyway, so their total is | |
1732 | # about 100 UVs. | |
1733 | # | |
1734 | # In the list of properties below that get generated, the L1 prefix is a fake | |
1735 | # property that means just the Latin1 range of the full property (whose name | |
1736 | # has an X prefix instead of L1). | |
a02047bf KW |
1737 | # |
1738 | # An initial & means to use the subroutine from this file instead of an | |
1739 | # official inversion list. | |
3f427fd9 | 1740 | |
0c4ecf42 KW |
1741 | for my $charset (get_supported_code_pages()) { |
1742 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
1743 | ||
99f21fb9 | 1744 | @a2n = @{get_a2n($charset)}; |
226b74db KW |
1745 | # Below is the list of property names to generate. '&' means to use the |
1746 | # subroutine to generate the inversion list instead of the generic code | |
1747 | # below. Some properties have a comma-separated list after the name, | |
1748 | # These are extra enums to add to those found in the Unicode tables. | |
99f21fb9 KW |
1749 | no warnings 'qw'; |
1750 | # Ignore non-alpha in sort | |
1751 | for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( | |
c0382778 | 1752 | Assigned |
1c8c3428 KW |
1753 | ASCII |
1754 | Cased | |
1755 | VertSpace | |
1756 | XPerlSpace | |
1757 | XPosixAlnum | |
1758 | XPosixAlpha | |
1759 | XPosixBlank | |
1760 | XPosixCntrl | |
1761 | XPosixDigit | |
1762 | XPosixGraph | |
1763 | XPosixLower | |
1764 | XPosixPrint | |
1765 | XPosixPunct | |
1766 | XPosixSpace | |
1767 | XPosixUpper | |
1768 | XPosixWord | |
1769 | XPosixXDigit | |
1770 | _Perl_Any_Folds | |
1771 | &NonL1_Perl_Non_Final_Folds | |
1772 | _Perl_Folds_To_Multi_Char | |
1773 | &UpperLatin1 | |
1774 | _Perl_IDStart | |
1775 | _Perl_IDCont | |
226b74db KW |
1776 | _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE |
1777 | _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE, | |
1778 | _Perl_SB,SContinue,CR,Extend,LF,EDGE | |
1779 | _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN | |
1c8c3428 | 1780 | ) |
226b74db KW |
1781 | # NOTE that the convention is that extra enum |
1782 | # values come after the property name, separated by | |
1783 | # commas, with the enums that aren't ever defined | |
1784 | # by Unicode coming last, at least 4 all-uppercase | |
1785 | # characters. The others are enum names that are | |
1786 | # needed by perl, but aren't in all Unicode | |
1787 | # releases. | |
0f5e3c71 KW |
1788 | ) { |
1789 | ||
1790 | # For the Latin1 properties, we change to use the eXtended version of the | |
1791 | # base property, then go through the result and get rid of everything not | |
1792 | # in Latin1 (above 255). Actually, we retain the element for the range | |
1793 | # that crosses the 255/256 boundary if it is one that matches the | |
1794 | # property. For example, in the Word property, there is a range of code | |
1795 | # points that start at U+00F8 and goes through U+02C1. Instead of | |
1796 | # artificially cutting that off at 256 because 256 is the first code point | |
1797 | # above Latin1, we let the range go to its natural ending. That gives us | |
1798 | # extra information with no added space taken. But if the range that | |
1799 | # crosses the boundary is one that doesn't match the property, we don't | |
1800 | # start a new range above 255, as that could be construed as going to | |
1801 | # infinity. For example, the Upper property doesn't include the character | |
1802 | # at 255, but does include the one at 256. We don't include the 256 one. | |
1803 | my $prop_name = $prop; | |
1804 | my $is_local_sub = $prop_name =~ s/^&//; | |
99f21fb9 KW |
1805 | my $extra_enums = ""; |
1806 | $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x; | |
0f5e3c71 KW |
1807 | my $lookup_prop = $prop_name; |
1808 | my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ | |
1809 | or $lookup_prop =~ s/^L1//); | |
1810 | my $nonl1_only = 0; | |
1811 | $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only; | |
99f21fb9 | 1812 | ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x; |
0f5e3c71 KW |
1813 | |
1814 | my @invlist; | |
99f21fb9 KW |
1815 | my @invmap; |
1816 | my $map_format; | |
1817 | my $map_default; | |
1818 | my $maps_to_code_point; | |
1819 | my $to_adjust; | |
0f5e3c71 KW |
1820 | if ($is_local_sub) { |
1821 | @invlist = eval $lookup_prop; | |
289ce9cc | 1822 | die $@ if $@; |
0f5e3c71 KW |
1823 | } |
1824 | else { | |
1825 | @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); | |
99f21fb9 | 1826 | if (! @invlist) { |
99f21fb9 | 1827 | |
ad85f59a KW |
1828 | # If couldn't find a non-empty inversion list, see if it is |
1829 | # instead an inversion map | |
1830 | my ($list_ref, $map_ref, $format, $default) | |
99f21fb9 | 1831 | = prop_invmap($lookup_prop, '_perl_core_internal_ok'); |
ad85f59a KW |
1832 | if (! $list_ref) { |
1833 | # An empty return here could mean an unknown property, or | |
1834 | # merely that the original inversion list is empty. Call | |
1835 | # in scalar context to differentiate | |
1836 | my $count = prop_invlist($lookup_prop, | |
1837 | '_perl_core_internal_ok'); | |
1838 | die "Could not find inversion list for '$lookup_prop'" | |
1839 | unless defined $count; | |
1840 | } | |
1841 | else { | |
18b852b3 KW |
1842 | @invlist = @$list_ref; |
1843 | @invmap = @$map_ref; | |
1844 | $map_format = $format; | |
1845 | $map_default = $default; | |
1846 | $maps_to_code_point = $map_format =~ /x/; | |
1847 | $to_adjust = $map_format =~ /a/; | |
ad85f59a | 1848 | } |
99f21fb9 | 1849 | } |
0f5e3c71 | 1850 | } |
ad85f59a KW |
1851 | |
1852 | ||
1853 | # Short-circuit an empty inversion list. | |
1854 | if (! @invlist) { | |
1855 | output_invlist($prop_name, \@invlist, $charset); | |
1856 | next; | |
1857 | } | |
ceb1de32 | 1858 | |
99f21fb9 KW |
1859 | # Re-order the Unicode code points to native ones for this platform. |
1860 | # This is only needed for code points below 256, because native code | |
1861 | # points are only in that range. For inversion maps of properties | |
1862 | # where the mappings are adjusted (format =~ /a/), this reordering | |
1863 | # could mess up the adjustment pattern that was in the input, so that | |
1864 | # has to be dealt with. | |
1865 | # | |
1866 | # And inversion maps that map to code points need to eventually have | |
1867 | # all those code points remapped to native, and it's better to do that | |
1868 | # here, going through the whole list not just those below 256. This | |
1869 | # is because some inversion maps have adjustments (format =~ /a/) | |
1870 | # which may be affected by the reordering. This code needs to be done | |
1871 | # both for when we are translating the inversion lists for < 256, and | |
1872 | # for the inversion maps for everything. By doing both in this loop, | |
1873 | # we can share that code. | |
1874 | # | |
1875 | # So, we go through everything for an inversion map to code points; | |
1876 | # otherwise, we can skip any remapping at all if we are going to | |
1877 | # output only the above-Latin1 values, or if the range spans the whole | |
1878 | # of 0..256, as the remap will also include all of 0..256 (256 not | |
1879 | # 255 because a re-ordering could cause 256 to need to be in the same | |
1880 | # range as 255.) | |
1881 | if ((@invmap && $maps_to_code_point) | |
1882 | || (! $nonl1_only || ($invlist[0] < 256 | |
1883 | && ! ($invlist[0] == 0 && $invlist[1] > 256)))) | |
ceb1de32 | 1884 | { |
fb4554ea | 1885 | |
99f21fb9 | 1886 | if (! @invmap) { # Straight inversion list |
fb4554ea KW |
1887 | # Look at all the ranges that start before 257. |
1888 | my @latin1_list; | |
1889 | while (@invlist) { | |
1890 | last if $invlist[0] > 256; | |
1891 | my $upper = @invlist > 1 | |
1892 | ? $invlist[1] - 1 # In range | |
8a6c81cf KW |
1893 | |
1894 | # To infinity. You may want to stop much much | |
1895 | # earlier; going this high may expose perl | |
1896 | # deficiencies with very large numbers. | |
1897 | : $Unicode::UCD::MAX_CP; | |
fb4554ea | 1898 | for my $j ($invlist[0] .. $upper) { |
99f21fb9 | 1899 | push @latin1_list, a2n($j); |
0f5e3c71 | 1900 | } |
fb4554ea KW |
1901 | |
1902 | shift @invlist; # Shift off the range that's in the list | |
1903 | shift @invlist; # Shift off the range not in the list | |
0c4ecf42 | 1904 | } |
fb4554ea KW |
1905 | |
1906 | # Here @invlist contains all the ranges in the original that start | |
1907 | # at code points above 256, and @latin1_list contains all the | |
1908 | # native code points for ranges that start with a Unicode code | |
1909 | # point below 257. We sort the latter and convert it to inversion | |
1910 | # list format. Then simply prepend it to the list of the higher | |
1911 | # code points. | |
1912 | @latin1_list = sort { $a <=> $b } @latin1_list; | |
5a7e5385 | 1913 | @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list); |
fb4554ea | 1914 | unshift @invlist, @latin1_list; |
99f21fb9 KW |
1915 | } |
1916 | else { # Is an inversion map | |
1917 | ||
1918 | # This is a similar procedure as plain inversion list, but has | |
1919 | # multiple buckets. A plain inversion list just has two | |
1920 | # buckets, 1) 'in' the list; and 2) 'not' in the list, and we | |
1921 | # pretty much can ignore the 2nd bucket, as it is completely | |
1922 | # defined by the 1st. But here, what we do is create buckets | |
1923 | # which contain the code points that map to each, translated | |
1924 | # to native and turned into an inversion list. Thus each | |
1925 | # bucket is an inversion list of native code points that map | |
1926 | # to it or don't map to it. We use these to create an | |
1927 | # inversion map for the whole property. | |
1928 | ||
1929 | # As mentioned earlier, we use this procedure to not just | |
1930 | # remap the inversion list to native values, but also the maps | |
1931 | # of code points to native ones. In the latter case we have | |
1932 | # to look at the whole of the inversion map (or at least to | |
1933 | # above Unicode; as the maps of code points above that should | |
1934 | # all be to the default). | |
1935 | my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256; | |
1936 | ||
1937 | my %mapped_lists; # A hash whose keys are the buckets. | |
1938 | while (@invlist) { | |
1939 | last if $invlist[0] > $upper_limit; | |
1940 | ||
1941 | # This shouldn't actually happen, as prop_invmap() returns | |
1942 | # an extra element at the end that is beyond $upper_limit | |
1943 | die "inversion map that extends to infinity is unimplemented" unless @invlist > 1; | |
1944 | ||
1945 | my $bucket; | |
1946 | ||
1947 | # A hash key can't be a ref (we are only expecting arrays | |
1948 | # of scalars here), so convert any such to a string that | |
1949 | # will be converted back later (using a vertical tab as | |
1950 | # the separator). Even if the mapping is to code points, | |
1951 | # we don't translate to native here because the code | |
d8049362 | 1952 | # output_invmap() calls to output these arrays assumes the |
99f21fb9 KW |
1953 | # input is Unicode, not native. |
1954 | if (ref $invmap[0]) { | |
1955 | $bucket = join "\cK", @{$invmap[0]}; | |
1956 | } | |
1957 | elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) { | |
1958 | ||
1959 | # Do convert to native for maps to single code points. | |
1960 | # There are some properties that have a few outlier | |
1961 | # maps that aren't code points, so the above test | |
1962 | # skips those. | |
1963 | $bucket = a2n($invmap[0]); | |
1964 | } else { | |
1965 | $bucket = $invmap[0]; | |
1966 | } | |
1967 | ||
1968 | # We now have the bucket that all code points in the range | |
1969 | # map to, though possibly they need to be adjusted. Go | |
1970 | # through the range and put each translated code point in | |
1971 | # it into its bucket. | |
1972 | my $base_map = $invmap[0]; | |
1973 | for my $j ($invlist[0] .. $invlist[1] - 1) { | |
1974 | if ($to_adjust | |
1975 | # The 1st code point doesn't need adjusting | |
1976 | && $j > $invlist[0] | |
1977 | ||
1978 | # Skip any non-numeric maps: these are outliers | |
1979 | # that aren't code points. | |
1980 | && $base_map =~ $numeric_re | |
1981 | ||
1982 | # 'ne' because the default can be a string | |
1983 | && $base_map ne $map_default) | |
1984 | { | |
1985 | # We adjust, by incrementing each the bucket and | |
1986 | # the map. For code point maps, translate to | |
1987 | # native | |
1988 | $base_map++; | |
1989 | $bucket = ($maps_to_code_point) | |
1990 | ? a2n($base_map) | |
1991 | : $base_map; | |
1992 | } | |
1993 | ||
1994 | # Add the native code point to the bucket for the | |
1995 | # current map | |
1996 | push @{$mapped_lists{$bucket}}, a2n($j); | |
1997 | } # End of loop through all code points in the range | |
1998 | ||
1999 | # Get ready for the next range | |
2000 | shift @invlist; | |
2001 | shift @invmap; | |
2002 | } # End of loop through all ranges in the map. | |
2003 | ||
2004 | # Here, @invlist and @invmap retain all the ranges from the | |
2005 | # originals that start with code points above $upper_limit. | |
2006 | # Each bucket in %mapped_lists contains all the code points | |
2007 | # that map to that bucket. If the bucket is for a map to a | |
2008 | # single code point is a single code point, the bucket has | |
2009 | # been converted to native. If something else (including | |
2010 | # multiple code points), no conversion is done. | |
2011 | # | |
2012 | # Now we recreate the inversion map into %xlated, but this | |
2013 | # time for the native character set. | |
2014 | my %xlated; | |
2015 | foreach my $bucket (keys %mapped_lists) { | |
2016 | ||
2017 | # Sort and convert this bucket to an inversion list. The | |
2018 | # result will be that ranges that start with even-numbered | |
2019 | # indexes will be for code points that map to this bucket; | |
2020 | # odd ones map to some other bucket, and are discarded | |
2021 | # below. | |
2022 | @{$mapped_lists{$bucket}} | |
2023 | = sort{ $a <=> $b} @{$mapped_lists{$bucket}}; | |
2024 | @{$mapped_lists{$bucket}} | |
2025 | = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}}); | |
2026 | ||
2027 | # Add each even-numbered range in the bucket to %xlated; | |
2028 | # so that the keys of %xlated become the range start code | |
2029 | # points, and the values are their corresponding maps. | |
2030 | while (@{$mapped_lists{$bucket}}) { | |
2031 | my $range_start = $mapped_lists{$bucket}->[0]; | |
2032 | if ($bucket =~ /\cK/) { | |
2033 | @{$xlated{$range_start}} = split /\cK/, $bucket; | |
2034 | } | |
2035 | else { | |
2036 | $xlated{$range_start} = $bucket; | |
2037 | } | |
2038 | shift @{$mapped_lists{$bucket}}; # Discard odd ranges | |
2039 | shift @{$mapped_lists{$bucket}}; # Get ready for next | |
2040 | # iteration | |
2041 | } | |
2042 | } # End of loop through all the buckets. | |
2043 | ||
2044 | # Here %xlated's keys are the range starts of all the code | |
2045 | # points in the inversion map. Construct an inversion list | |
2046 | # from them. | |
2047 | my @new_invlist = sort { $a <=> $b } keys %xlated; | |
2048 | ||
2049 | # If the list is adjusted, we want to munge this list so that | |
2050 | # we only have one entry for where consecutive code points map | |
2051 | # to consecutive values. We just skip the subsequent entries | |
2052 | # where this is the case. | |
2053 | if ($to_adjust) { | |
2054 | my @temp; | |
2055 | for my $i (0 .. @new_invlist - 1) { | |
2056 | next if $i > 0 | |
2057 | && $new_invlist[$i-1] + 1 == $new_invlist[$i] | |
2058 | && $xlated{$new_invlist[$i-1]} =~ $numeric_re | |
2059 | && $xlated{$new_invlist[$i]} =~ $numeric_re | |
2060 | && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]}; | |
2061 | push @temp, $new_invlist[$i]; | |
2062 | } | |
2063 | @new_invlist = @temp; | |
2064 | } | |
2065 | ||
2066 | # The inversion map comes from %xlated's values. We can | |
2067 | # unshift each onto the front of the untouched portion, in | |
2068 | # reverse order of the portion we did process. | |
2069 | foreach my $start (reverse @new_invlist) { | |
2070 | unshift @invmap, $xlated{$start}; | |
2071 | } | |
2072 | ||
2073 | # Finally prepend the inversion list we have just constructed to the | |
2074 | # one that contains anything we didn't process. | |
2075 | unshift @invlist, @new_invlist; | |
2076 | } | |
2077 | } | |
2078 | ||
2079 | # prop_invmap() returns an extra final entry, which we can now | |
2080 | # discard. | |
2081 | if (@invmap) { | |
2082 | pop @invlist; | |
2083 | pop @invmap; | |
ceb1de32 | 2084 | } |
0f5e3c71 KW |
2085 | |
2086 | if ($l1_only) { | |
99f21fb9 | 2087 | die "Unimplemented to do a Latin-1 only inversion map" if @invmap; |
0f5e3c71 KW |
2088 | for my $i (0 .. @invlist - 1 - 1) { |
2089 | if ($invlist[$i] > 255) { | |
2090 | ||
2091 | # In an inversion list, even-numbered elements give the code | |
2092 | # points that begin ranges that match the property; | |
2093 | # odd-numbered give ones that begin ranges that don't match. | |
2094 | # If $i is odd, we are at the first code point above 255 that | |
2095 | # doesn't match, which means the range it is ending does | |
2096 | # match, and crosses the 255/256 boundary. We want to include | |
2097 | # this ending point, so increment $i, so the splice below | |
2098 | # includes it. Conversely, if $i is even, it is the first | |
2099 | # code point above 255 that matches, which means there was no | |
2100 | # matching range that crossed the boundary, and we don't want | |
2101 | # to include this code point, so splice before it. | |
2102 | $i++ if $i % 2 != 0; | |
2103 | ||
2104 | # Remove everything past this. | |
2105 | splice @invlist, $i; | |
99f21fb9 | 2106 | splice @invmap, $i if @invmap; |
0f5e3c71 KW |
2107 | last; |
2108 | } | |
0c4ecf42 KW |
2109 | } |
2110 | } | |
0f5e3c71 KW |
2111 | elsif ($nonl1_only) { |
2112 | my $found_nonl1 = 0; | |
2113 | for my $i (0 .. @invlist - 1 - 1) { | |
2114 | next if $invlist[$i] < 256; | |
2115 | ||
2116 | # Here, we have the first element in the array that indicates an | |
2117 | # element above Latin1. Get rid of all previous ones. | |
2118 | splice @invlist, 0, $i; | |
99f21fb9 | 2119 | splice @invmap, 0, $i if @invmap; |
0f5e3c71 KW |
2120 | |
2121 | # If this one's index is not divisible by 2, it means that this | |
2122 | # element is inverting away from being in the list, which means | |
99f21fb9 KW |
2123 | # all code points from 256 to this one are in this list (or |
2124 | # map to the default for inversion maps) | |
2125 | if ($i % 2 != 0) { | |
2126 | unshift @invlist, 256; | |
2127 | unshift @invmap, $map_default if @invmap; | |
2128 | } | |
0f5e3c71 | 2129 | $found_nonl1 = 1; |
3f427fd9 KW |
2130 | last; |
2131 | } | |
0f5e3c71 | 2132 | die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1; |
3f427fd9 | 2133 | } |
3f427fd9 | 2134 | |
0f5e3c71 | 2135 | output_invlist($prop_name, \@invlist, $charset); |
99f21fb9 | 2136 | output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap; |
0f5e3c71 | 2137 | } |
bffc0129 | 2138 | end_file_pound_if; |
0c4ecf42 | 2139 | print $out_fh "\n" . get_conditional_compile_line_end(); |
9d9177be KW |
2140 | } |
2141 | ||
973a28ed KW |
2142 | switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C'); |
2143 | ||
2144 | output_GCB_table(); | |
6b659339 | 2145 | output_LB_table(); |
7e54b87f | 2146 | output_WB_table(); |
6b659339 | 2147 | |
973a28ed KW |
2148 | end_file_pound_if; |
2149 | ||
2308ab83 | 2150 | my $sources_list = "lib/unicore/mktables.lst"; |
216b41c2 KW |
2151 | my @sources = ($0, qw(lib/unicore/mktables |
2152 | lib/Unicode/UCD.pm | |
2153 | regen/charset_translations.pl | |
2154 | )); | |
9a3da3ad FC |
2155 | { |
2156 | # Depend on mktables’ own sources. It’s a shorter list of files than | |
2157 | # those that Unicode::UCD uses. | |
1ae6ead9 | 2158 | if (! open my $mktables_list, '<', $sources_list) { |
2308ab83 KW |
2159 | |
2160 | # This should force a rebuild once $sources_list exists | |
2161 | push @sources, $sources_list; | |
2162 | } | |
2163 | else { | |
2164 | while(<$mktables_list>) { | |
2165 | last if /===/; | |
2166 | chomp; | |
2167 | push @sources, "lib/unicore/$_" if /^[^#]/; | |
2168 | } | |
9a3da3ad FC |
2169 | } |
2170 | } | |
6b659339 KW |
2171 | |
2172 | read_only_bottom_close_and_rename($out_fh, \@sources); |