This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Add support of 'sl' formats
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
0a07b44b 28my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be 29
99f21fb9
KW
30# integer or float
31my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
32
33# Matches valid C language enum names: begins with ASCII alphabetic, then any
34# ASCII \w
35my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
37my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
40
bffc0129 41my $in_file_pound_if = 0;
43b443dd 42
289ce9cc
KW
43my $max_hdr_len = 3; # In headings, how wide a name is allowed?
44
9d9177be
KW
45print $out_fh "/* See the generating file for comments */\n\n";
46
bffc0129
KW
47# The symbols generated by this program are all currently defined only in a
48# single dot c each. The code knows where most of them go, but this hash
49# gives overrides for the exceptions to the typical place
50my %exceptions_to_where_to_define =
51 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
52 AboveLatin1 => 'PERL_IN_REGCOMP_C',
53 Latin1 => 'PERL_IN_REGCOMP_C',
54 UpperLatin1 => 'PERL_IN_REGCOMP_C',
55 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
56 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
57 _Perl_IDCont => 'PERL_IN_UTF8_C',
58 _Perl_IDStart => 'PERL_IN_UTF8_C',
59 );
015bb97c 60
973a28ed
KW
61my %gcb_enums;
62my @gcb_short_enums;
289ce9cc 63my %gcb_abbreviations;
6b659339
KW
64my %lb_enums;
65my @lb_short_enums;
289ce9cc 66my %lb_abbreviations;
7e54b87f
KW
67my %wb_enums;
68my @wb_short_enums;
289ce9cc 69my %wb_abbreviations;
6b659339 70
99f21fb9
KW
71my @a2n;
72
73sub uniques {
74 # Returns non-duplicated input values. From "Perl Best Practices:
75 # Encapsulated Cleverness". p. 455 in first edition.
76
77 my %seen;
78 return grep { ! $seen{$_}++ } @_;
79}
80
81sub a2n($) {
82 my $cp = shift;
83
84 # Returns the input Unicode code point translated to native.
85
86 return $cp if $cp !~ $numeric_re || $cp > 255;
87 return $a2n[$cp];
88}
89
bffc0129
KW
90sub end_file_pound_if {
91 if ($in_file_pound_if) {
92 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
93 $in_file_pound_if = 0;
94 }
95}
96
97sub switch_pound_if ($$) {
98 my $name = shift;
99 my $new_pound_if = shift;
100
101 # Switch to new #if given by the 2nd argument. If there is an override
102 # for this, it instead switches to that. The 1st argument is the
103 # static's name, used to look up the overrides
104
105 if (exists $exceptions_to_where_to_define{$name}) {
106 $new_pound_if = $exceptions_to_where_to_define{$name};
107 }
108
109 # Exit current #if if the new one is different from the old
110 if ($in_file_pound_if
111 && $in_file_pound_if !~ /$new_pound_if/)
112 {
113 end_file_pound_if;
114 }
115
116 # Enter new #if, if not already in it.
117 if (! $in_file_pound_if) {
118 $in_file_pound_if = "defined($new_pound_if)";
119 print $out_fh "\n#if $in_file_pound_if\n";
43b443dd
KW
120 }
121}
122
0c4ecf42 123sub output_invlist ($$;$) {
9d9177be
KW
124 my $name = shift;
125 my $invlist = shift; # Reference to inversion list array
0c4ecf42 126 my $charset = shift // ""; # name of character set for comment
9d9177be 127
76d3994c 128 die "No inversion list for $name" unless defined $invlist
ad85f59a 129 && ref $invlist eq 'ARRAY';
76d3994c 130
9d9177be
KW
131 # Output the inversion list $invlist using the name $name for it.
132 # It is output in the exact internal form for inversion lists.
133
a0316a6c
KW
134 # Is the last element of the header 0, or 1 ?
135 my $zero_or_one = 0;
ad85f59a 136 if (@$invlist && $invlist->[0] != 0) {
a0316a6c 137 unshift @$invlist, 0;
9d9177be
KW
138 $zero_or_one = 1;
139 }
0a07b44b 140 my $count = @$invlist;
9d9177be 141
bffc0129 142 switch_pound_if ($name, 'PERL_IN_PERL_C');
43b443dd 143
0c4ecf42
KW
144 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
145 print $out_fh " /* for $charset */" if $charset;
146 print $out_fh "\n";
9d9177be 147
a0316a6c 148 print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be
KW
149 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
150 print $out_fh "\t", $zero_or_one,
a0316a6c
KW
151 ",\t/* 0 if the list starts at 0;",
152 "\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be
KW
153
154 # The main body are the UVs passed in to this routine. Do the final
155 # element separately
47d53124
KW
156 for my $i (0 .. @$invlist - 1) {
157 printf $out_fh "\t0x%X", $invlist->[$i];
158 print $out_fh "," if $i < @$invlist - 1;
159 print $out_fh "\n";
9d9177be
KW
160 }
161
9d9177be
KW
162 print $out_fh "};\n";
163}
164
99f21fb9
KW
165sub output_invmap ($$$$$$$) {
166 my $name = shift;
167 my $invmap = shift; # Reference to inversion map array
168 my $prop_name = shift;
169 my $input_format = shift; # The inversion map's format
170 my $default = shift; # The property value for code points who
171 # otherwise don't have a value specified.
172 my $extra_enums = shift; # comma-separated list of our additions to the
173 # property's standard possible values
174 my $charset = shift // ""; # name of character set for comment
175
176 # Output the inversion map $invmap for property $prop_name, but use $name
177 # as the actual data structure's name.
178
179 my $count = @$invmap;
180
181 my $output_format;
182 my $declaration_type;
183 my %enums;
184 my $name_prefix;
185
34623dbb 186 if ($input_format =~ / ^ s l? $ /x) {
02f811dd
KW
187 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
188 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
226b74db 189 my @input_enums;
f79a09fc 190
226b74db 191 # Find all the possible input values. These become the enum names
34623dbb
KW
192 # that comprise the inversion map. For inputs that don't have sub
193 # lists, we can just get the unique values. Otherwise, we have to
194 # expand the sublists first.
195 if ($input_format ne 'sl') {
226b74db 196 @input_enums = sort(uniques(@$invmap));
34623dbb
KW
197 }
198 else {
199 foreach my $element (@$invmap) {
200 if (ref $element) {
201 push @input_enums, @$element;
202 }
203 else {
204 push @input_enums, $element;
205 }
206 }
207 @input_enums = sort(uniques(@input_enums));
208 }
6b659339 209
226b74db
KW
210 # The internal enums come last, and in the order specified.
211 my @enums = @input_enums;
27a619f7
KW
212 my @extras;
213 if ($extra_enums ne "") {
214 @extras = split /,/, $extra_enums;
226b74db
KW
215
216 # Don't add if already there.
217 foreach my $this_extra (@extras) {
218 next if grep { $_ eq $this_extra } @enums;
219
220 push @enums, $this_extra;
221 }
27a619f7 222 }
289ce9cc 223
226b74db
KW
224 # Assign a value to each element of the enum type we are creating.
225 # The default value always gets 0; the others are arbitrarily
226 # assigned.
27a619f7
KW
227 my $enum_val = 0;
228 my $canonical_default = prop_value_aliases($prop_name, $default);
229 $default = $canonical_default if defined $canonical_default;
230 $enums{$default} = $enum_val++;
226b74db 231
27a619f7
KW
232 for my $enum (@enums) {
233 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
234 }
235
226b74db 236 # Calculate the data for the special tables output for these properties.
27a619f7
KW
237 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
238
226b74db
KW
239 # The data includes the hashes %gcb_enums, %lb_enums, etc.
240 # Similarly we calculate column headings for the tables.
241 #
27a619f7 242 # We use string evals to allow the same code to work on
226b74db 243 # all the tables
27a619f7
KW
244 my $type = lc $prop_name;
245
27a619f7
KW
246 my $placeholder = "a";
247
248 # Skip if we've already done this code, which populated
249 # this hash
250 if (eval "! \%${type}_enums") {
251
226b74db 252 # For each enum in the type ...
27a619f7
KW
253 foreach my $enum (sort keys %enums) {
254 my $value = $enums{$enum};
255 my $short;
256 my $abbreviated_from;
257
258 # Special case this wb property value to make the
259 # name more clear
260 if ($enum eq 'Perl_Tailored_HSpace') {
261 $short = 'hs';
262 $abbreviated_from = $enum;
263 }
27a619f7 264 else {
226b74db
KW
265
266 # Use the official short name, if found.
27a619f7
KW
267 ($short) = prop_value_aliases($type, $enum);
268
226b74db
KW
269 if (! defined $short) {
270
271 # But if there is no official name, use the name
272 # that came from the data (if any). Otherwise,
273 # the name had to come from the extras list.
274 # There are two types of values in that list.
275 #
276 # First are those enums that are not part of the
277 # property, but are defined by this code. By
278 # convention these have all-caps names of at least
279 # 4 characters. We use the lowercased name for
280 # thse.
281 #
282 # Second are enums that are needed to get
283 # regexec.c to compile, but don't exist in all
284 # Unicode releases. To get here, we must be
285 # compiling an earlier Unicode release that
286 # doesn't have that enum, so just use a unique
287 # anonymous name for it.
288 if (grep { $_ eq $enum } @input_enums) {
289 $short = $enum
290 }
291 elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
292 $short = $placeholder++;
293 }
294 else {
295 $short = lc $enum;
296 }
297 }
27a619f7
KW
298 }
299
300 # If our short name is too long, or we already
301 # know that the name is an abbreviation, truncate
302 # to make sure it's short enough, and remember
226b74db
KW
303 # that we did this so we can later add a comment in the
304 # generated file
27a619f7
KW
305 if ( $abbreviated_from
306 || length $short > $max_hdr_len)
307 {
308 $short = substr($short, 0, $max_hdr_len);
309 $abbreviated_from = $enum
310 unless $abbreviated_from;
311 # If the name we are to display conflicts, try
312 # another.
313 while (eval "exists
314 \$${type}_abbreviations{$short}")
315 {
289ce9cc 316 die $@ if $@;
256fceb3
KW
317
318 # The increment operator on strings doesn't work
319 # on those containing an '_', so just use the
320 # final portion.
321 my @short = split '_', $short;
322 $short[-1]++;
323 $short = join "_", @short;
289ce9cc 324 }
19a5f1d5 325
27a619f7 326 eval "\$${type}_abbreviations{$short} = '$enum'";
19a5f1d5 327 die $@ if $@;
7e54b87f 328 }
27a619f7
KW
329
330 # Remember the mapping from the property value
331 # (enum) name to its value.
332 eval "\$${type}_enums{$enum} = $value";
333 die $@ if $@;
334
335 # Remember the inverse mapping to the short name
336 # so that we can properly label the generated
337 # table's rows and columns
338 eval "\$${type}_short_enums[$value] = '$short'";
339 die $@ if $@;
7e54b87f 340 }
99f21fb9 341 }
19a5f1d5 342 }
99f21fb9 343
19a5f1d5
KW
344 # Inversion map stuff is currently used only by regexec
345 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
346
347 # The short names tend to be two lower case letters, but it looks
348 # better for those if they are upper. XXX
349 $short_name = uc($short_name) if length($short_name) < 3
226b74db 350 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
19a5f1d5
KW
351 $name_prefix = "${short_name}_";
352 my $enum_count = keys %enums;
353 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
354
34623dbb
KW
355 if ($input_format eq 'sl') {
356 print $out_fh
357 "\n/* Negative enum values indicate the need to use an auxiliary"
358 . " table\n * consisting of the list of enums this one expands to."
359 . " The absolute\n * values of the negative enums are indices into"
360 . " a table of the auxiliary\n * tables' addresses */";
361 }
362
226b74db 363 # Start the enum definition for this map
19a5f1d5
KW
364 print $out_fh "\ntypedef enum {\n";
365 my @enum_list;
366 foreach my $enum (keys %enums) {
367 $enum_list[$enums{$enum}] = $enum;
99f21fb9 368 }
19a5f1d5 369 foreach my $i (0 .. @enum_list - 1) {
34623dbb
KW
370 print $out_fh ",\n" if $i > 0;
371
19a5f1d5
KW
372 my $name = $enum_list[$i];
373 print $out_fh "\t${name_prefix}$name = $i";
19a5f1d5 374 }
34623dbb
KW
375
376 # For an 'sl' property, we need extra enums, because some of the
377 # elements are lists. Each such distinct list is placed in its own
378 # auxiliary map table. Here, we go through the inversion map, and for
379 # each distinct list found, create an enum value for it, numbered -1,
380 # -2, ....
381 my %multiples;
382 my $aux_table_prefix = "AUX_TABLE_";
383 if ($input_format eq 'sl') {
384 foreach my $element (@$invmap) {
385
386 # A regular scalar is not one of the lists we're looking for
387 # at this stage.
388 next unless ref $element;
389
390 my $joined = join ",", sort @$element;
391 my $already_found = exists $multiples{$joined};
392
393 my $i;
394 if ($already_found) { # Use any existing one
395 $i = $multiples{$joined};
396 }
397 else { # Otherwise increment to get a new table number
398 $i = keys(%multiples) + 1;
399 $multiples{$joined} = $i;
400 }
401
402 # This changes the inversion map for this entry to not be the
403 # list
404 $element = "use_$aux_table_prefix$i";
405
406 # And add to the enum values
407 if (! $already_found) {
408 print $out_fh ",\n\t${name_prefix}$element = -$i";
409 }
410 }
411 }
412
413 print $out_fh "\n";
19a5f1d5
KW
414 $declaration_type = "${name_prefix}enum";
415 print $out_fh "} $declaration_type;\n";
226b74db 416 # Finished with the enum defintion.
19a5f1d5
KW
417
418 $output_format = "${name_prefix}%s";
34623dbb
KW
419
420 # If there are auxiliary tables, output them.
421 if (%multiples) {
422
423 print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n";
424
425 # Invert keys and values
426 my %inverted_mults;
427 while (my ($key, $value) = each %multiples) {
428 $inverted_mults{$value} = $key;
429 }
430
431 # Output them in sorted order
432 my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults;
433
434 # Keep track of how big each aux table is
435 my @aux_counts;
436
437 # Output each aux table.
438 foreach my $table_number (@sorted_table_list) {
439 my $table = $inverted_mults{$table_number};
440 print $out_fh "\nstatic const $declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
441
442 # Earlier, we joined the elements of this table together with a comma
443 my @elements = split ",", $table;
444
445 $aux_counts[$table_number] = scalar @elements;
446 for my $i (0 .. @elements - 1) {
447 print $out_fh ",\n" if $i > 0;
448 print $out_fh "\t${name_prefix}$elements[$i]";
449 }
450 print $out_fh "\n};\n";
451 }
452
453 # Output the table that is indexed by the absolute value of the
454 # aux table enum and contains pointers to the tables output just
455 # above
456 print $out_fh "\nstatic const $declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
457 print $out_fh "\tNULL,\t/* Placeholder */\n";
458 for my $i (1 .. @sorted_table_list) {
459 print $out_fh ",\n" if $i > 1;
460 print $out_fh "\t$name_prefix$aux_table_prefix$i";
461 }
462 print $out_fh "\n};\n";
463
464 print $out_fh
465 "\n/* Parallel table to the above, giving the number of elements"
466 . " in each table\n * pointed to */\n";
467 print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
468 print $out_fh "\t0,\t/* Placeholder */\n";
469 for my $i (1 .. @sorted_table_list) {
470 print $out_fh ",\n" if $i > 1;
471 print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
472 }
473 print $out_fh "\n};\n";
474 } # End of outputting the auxiliary and associated tables
99f21fb9
KW
475 }
476 else {
477 die "'$input_format' invmap() format for '$prop_name' unimplemented";
478 }
479
480 die "No inversion map for $prop_name" unless defined $invmap
481 && ref $invmap eq 'ARRAY'
482 && $count;
483
226b74db 484 # Now output the inversion map proper
99f21fb9
KW
485 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
486 print $out_fh " /* for $charset */" if $charset;
487 print $out_fh "\n";
488
489 # The main body are the scalars passed in to this routine.
490 for my $i (0 .. $count - 1) {
491 my $element = $invmap->[$i];
02f811dd
KW
492 my $full_element_name = prop_value_aliases($prop_name, $element);
493 $element = $full_element_name if defined $full_element_name;
494 $element = $name_prefix . $element;
99f21fb9
KW
495 print $out_fh "\t$element";
496 print $out_fh "," if $i < $count - 1;
497 print $out_fh "\n";
498 }
499 print $out_fh "};\n";
99f21fb9
KW
500}
501
5a7e5385 502sub mk_invlist_from_sorted_cp_list {
a02047bf
KW
503
504 # Returns an inversion list constructed from the sorted input array of
505 # code points
506
507 my $list_ref = shift;
508
99f21fb9
KW
509 return unless @$list_ref;
510
a02047bf
KW
511 # Initialize to just the first element
512 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
513
514 # For each succeeding element, if it extends the previous range, adjust
515 # up, otherwise add it.
516 for my $i (1 .. @$list_ref - 1) {
517 if ($invlist[-1] == $list_ref->[$i]) {
518 $invlist[-1]++;
519 }
520 else {
521 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
522 }
523 }
524 return @invlist;
525}
526
527# Read in the Case Folding rules, and construct arrays of code points for the
528# properties we need.
529my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
530die "Could not find inversion map for Case_Folding" unless defined $format;
531die "Incorrect format '$format' for Case_Folding inversion map"
347b9066
KW
532 unless $format eq 'al'
533 || $format eq 'a';
a02047bf
KW
534my @has_multi_char_fold;
535my @is_non_final_fold;
536
537for my $i (0 .. @$folds_ref - 1) {
538 next unless ref $folds_ref->[$i]; # Skip single-char folds
539 push @has_multi_char_fold, $cp_ref->[$i];
540
b6a6e956 541 # Add to the non-finals list each code point that is in a non-final
a02047bf
KW
542 # position
543 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
544 push @is_non_final_fold, $folds_ref->[$i][$j]
545 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
546 }
547}
548
a02047bf
KW
549sub _Perl_Non_Final_Folds {
550 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
5a7e5385 551 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
a02047bf
KW
552}
553
99f21fb9
KW
554sub prop_name_for_cmp ($) { # Sort helper
555 my $name = shift;
556
557 # Returns the input lowercased, with non-alphas removed, as well as
558 # everything starting with a comma
559
560 $name =~ s/,.*//;
561 $name =~ s/[[:^alpha:]]//g;
562 return lc $name;
563}
564
892d8259 565sub UpperLatin1 {
5a7e5385 566 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
892d8259
KW
567}
568
289ce9cc
KW
569sub output_table_common {
570
571 # Common subroutine to actually output the generated rules table.
572
573 my ($property,
574 $table_value_defines_ref,
575 $table_ref,
576 $names_ref,
577 $abbreviations_ref) = @_;
578 my $size = @$table_ref;
579
580 # Output the #define list, sorted by numeric value
581 if ($table_value_defines_ref) {
582 my $max_name_length = 0;
583 my @defines;
584
585 # Put in order, and at the same time find the longest name
586 while (my ($enum, $value) = each %$table_value_defines_ref) {
587 $defines[$value] = $enum;
588
589 my $length = length $enum;
590 $max_name_length = $length if $length > $max_name_length;
591 }
592
593 print $out_fh "\n";
594
595 # Output, so that the values are vertically aligned in a column after
596 # the longest name
597 foreach my $i (0 .. @defines - 1) {
598 next unless defined $defines[$i];
599 printf $out_fh "#define %-*s %2d\n",
600 $max_name_length,
601 $defines[$i],
602 $i;
603 }
604 }
605
606 my $column_width = 2; # We currently allow 2 digits for the number
607
608 # If the maximum value in the table is 1, it can be a bool. (Being above
609 # a U8 is not currently handled
610 my $max_element = 0;
611 for my $i (0 .. $size - 1) {
612 for my $j (0 .. $size - 1) {
613 next if $max_element >= $table_ref->[$i][$j];
614 $max_element = $table_ref->[$i][$j];
615 }
616 }
617 die "Need wider table column width given '$max_element"
618 if length $max_element > $column_width;
619
620 my $table_type = ($max_element == 1)
621 ? 'bool'
622 : 'U8';
623
624 # If a name is longer than the width set aside for a column, its column
625 # needs to have increased spacing so that the name doesn't get truncated
626 # nor run into an adjacent column
627 my @spacers;
628
629 # If we are being compiled on a Unicode version earlier than that which
630 # this file was designed for, it may be that some of the property values
631 # aren't in the current release, and so would be undefined if we didn't
632 # define them ourselves. Earlier code has done this, making them
633 # lowercase characters of length one. We look to see if any exist, so
634 # that we can add an annotation to the output table
635 my $has_placeholder = 0;
636
637 for my $i (0 .. $size - 1) {
638 no warnings 'numeric';
639 $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
640 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
641 }
642
643 print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
644
645 # Calculate the column heading line
646 my $header_line = "/* "
647 . (" " x $max_hdr_len) # We let the row heading meld to
648 # the '*/' for those that are at
649 # the max
650 . " " x 3; # Space for '*/ '
651 # Now each column
652 for my $i (0 .. $size - 1) {
653 $header_line .= sprintf "%s%*s",
654 $spacers[$i],
655 $column_width + 1, # 1 for the ','
656 $names_ref->[$i];
657 }
658 $header_line .= " */\n";
659
660 # If we have annotations, output it now.
661 if ($has_placeholder || scalar %$abbreviations_ref) {
662 my $text = "";
663 foreach my $abbr (sort keys %$abbreviations_ref) {
664 $text .= "; " if $text;
665 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
666 }
667 if ($has_placeholder) {
668 $text .= "; other " if $text;
669 $text .= "lowercase names are placeholders for"
670 . " property values not defined until a later Unicode"
671 . " release, so are irrelevant in this one, as they are"
672 . " not assigned to any code points";
673 }
674
675 my $indent = " " x 3;
676 $text = $indent . "/* $text */";
677
678 # Wrap the text so that it is no wider than the table, which the
679 # header line gives.
680 my $output_width = length $header_line;
681 while (length $text > $output_width) {
682 my $cur_line = substr($text, 0, $output_width);
683
684 # Find the first blank back from the right end to wrap at.
685 for (my $i = $output_width -1; $i > 0; $i--) {
686 if (substr($text, $i, 1) eq " ") {
687 print $out_fh substr($text, 0, $i), "\n";
688
689 # Set so will look at just the remaining tail (which will
690 # be indented and have a '*' after the indent
691 $text = $indent . " * " . substr($text, $i + 1);
692 last;
693 }
694 }
695 }
696
697 # And any remaining
698 print $out_fh $text, "\n" if $text;
699 }
700
701 # We calculated the header line earlier just to get its width so that we
702 # could make sure the annotations fit into that.
703 print $out_fh $header_line;
704
705 # Now output the bulk of the table.
706 for my $i (0 .. $size - 1) {
707
708 # First the row heading.
709 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
710 print $out_fh "{"; # Then the brace for this row
711
712 # Then each column
713 for my $j (0 .. $size -1) {
714 print $out_fh $spacers[$j];
715 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
716 print $out_fh "," if $j < $size - 1;
717 }
718 print $out_fh " }";
719 print $out_fh "," if $i < $size - 1;
720 print $out_fh "\n";
721 }
722
723 print $out_fh "};\n";
724}
725
973a28ed
KW
726sub output_GCB_table() {
727
728 # Create and output the pair table for use in determining Grapheme Cluster
729 # Breaks, given in http://www.unicode.org/reports/tr29/.
b0e24409
KW
730 my %gcb_actions = (
731 GCB_NOBREAK => 0,
732 GCB_BREAKABLE => 1,
733 GCB_RI_then_RI => 2, # Rules 12 and 13
734 GCB_EX_then_EM => 3, # Rule 10
735 );
973a28ed
KW
736
737 # The table is constructed in reverse order of the rules, to make the
738 # lower-numbered, higher priority ones override the later ones, as the
739 # algorithm stops at the earliest matching rule
740
741 my @gcb_table;
742 my $table_size = @gcb_short_enums;
743
744 # Otherwise, break everywhere.
b0e24409 745 # GB99 Any ÷ Any
973a28ed
KW
746 for my $i (0 .. $table_size - 1) {
747 for my $j (0 .. $table_size - 1) {
748 $gcb_table[$i][$j] = 1;
749 }
750 }
751
b0e24409
KW
752 # Do not break within emoji flag sequences. That is, do not break between
753 # regional indicator (RI) symbols if there is an odd number of RI
754 # characters before the break point. Must be resolved in runtime code.
755 #
c492f156 756 # GB12 sot (RI RI)* RI × RI
b0e24409
KW
757 # GB13 [^RI] (RI RI)* RI × RI
758 $gcb_table[$gcb_enums{'Regional_Indicator'}]
759 [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
760
761 # Do not break within emoji modifier sequences or emoji zwj sequences.
762 # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
763 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
764 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
765
766 # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
767 $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}]
768 = $gcb_actions{GCB_EX_then_EM};
769 $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0;
770 $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0;
771
772 # Do not break before extending characters or ZWJ.
973a28ed 773 # Do not break before SpacingMarks, or after Prepend characters.
973a28ed 774 # GB9b Prepend ×
b0e24409
KW
775 # GB9a × SpacingMark
776 # GB9 × ( Extend | ZWJ )
973a28ed 777 for my $i (0 .. @gcb_table - 1) {
289ce9cc 778 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
b0e24409
KW
779 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
780 $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
781 $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0;
973a28ed
KW
782 }
783
973a28ed
KW
784 # Do not break Hangul syllable sequences.
785 # GB8 ( LVT | T) × T
786 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
787 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
788
789 # GB7 ( LV | V ) × ( V | T )
790 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
791 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
792 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
793 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
794
795 # GB6 L × ( L | V | LV | LVT )
796 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
797 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
798 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
799 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
800
289ce9cc
KW
801 # Do not break between a CR and LF. Otherwise, break before and after
802 # controls.
973a28ed
KW
803 # GB5 ÷ ( Control | CR | LF )
804 # GB4 ( Control | CR | LF ) ÷
805 for my $i (0 .. @gcb_table - 1) {
289ce9cc 806 $gcb_table[$i][$gcb_enums{'Control'}] = 1;
973a28ed
KW
807 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
808 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
289ce9cc 809 $gcb_table[$gcb_enums{'Control'}][$i] = 1;
973a28ed
KW
810 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
811 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
812 }
813
814 # GB3 CR × LF
815 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
816
b0e24409 817 # Break at the start and end of text, unless the text is empty
973a28ed
KW
818 # GB1 sot ÷
819 # GB2 ÷ eot
820 for my $i (0 .. @gcb_table - 1) {
289ce9cc
KW
821 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
822 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
973a28ed 823 }
289ce9cc 824 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
973a28ed 825
b0e24409 826 output_table_common('GCB', \%gcb_actions,
289ce9cc 827 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
973a28ed
KW
828}
829
6b659339
KW
830sub output_LB_table() {
831
832 # Create and output the enums, #defines, and pair table for use in
833 # determining Line Breaks. This uses the default line break algorithm,
834 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
835 # in that page, as the Unicode-furnished tests assume that tailoring.
836
6b659339
KW
837 # The result is really just true or false. But we follow along with tr14,
838 # creating a rule which is false for something like X SP* X. That gets
839 # encoding 2. The rest of the actions are synthetic ones that indicate
840 # some context handling is required. These each are added to the
841 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
842 # value can be retrieved. Actually only rules from 7 through 18 (which
843 # are the ones where space matter) are possible to have 2 added to them.
844 # The others below add just 0 or 1. It might be possible for one
845 # synthetic rule to be added to another, yielding a larger value. This
846 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
847 # names of the middle grouping below, it is impossible for that to occur
848 # for them because they all start with mutually exclusive classes. That
849 # the final rule can't be added to any of the others isn't obvious from
850 # its name, so it is assigned a power of 2 higher than the others can get
851 # to so any addition would preserve all data. (And the code will reach an
852 # assert(0) on debugging builds should this happen.)
853 my %lb_actions = (
854 LB_NOBREAK => 0,
855 LB_BREAKABLE => 1,
856 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
857
b0e24409 858 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
859 LB_SP_foo => 6, # Rule 18
860 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
861 LB_SY_or_IS_then_various => 11, # Rule 25
862 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 863 LB_RI_then_RI => 15, # Rule 30a
6b659339 864
b0e24409 865 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
866 );
867
6b659339
KW
868 # Construct the LB pair table. This is based on the rules in
869 # http://www.unicode.org/reports/tr14/, but modified as those rules are
870 # designed for someone taking a string of text and sequentially going
871 # through it to find the break opportunities, whereas, Perl requires
872 # determining if a given random spot is a break opportunity, without
873 # knowing all the entire string before it.
874 #
875 # The table is constructed in reverse order of the rules, to make the
876 # lower-numbered, higher priority ones override the later ones, as the
877 # algorithm stops at the earliest matching rule
878
879 my @lb_table;
880 my $table_size = @lb_short_enums;
881
882 # LB31. Break everywhere else
883 for my $i (0 .. $table_size - 1) {
884 for my $j (0 .. $table_size - 1) {
885 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
886 }
887 }
888
b0e24409
KW
889 # LB30b Do not break between an emoji base and an emoji modifier.
890 # EB × EM
891 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
892 = $lb_actions{'LB_NOBREAK'};
893
894 # LB30a Break between two regional indicator symbols if and only if there
895 # are an even number of regional indicators preceding the position of the
896 # break.
897 # sot (RI RI)* RI × RI
898 # [^RI] (RI RI)* RI × RI
289ce9cc 899 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 900 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
901
902 # LB30 Do not break between letters, numbers, or ordinary symbols and
903 # opening or closing parentheses.
904 # (AL | HL | NU) × OP
289ce9cc
KW
905 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
906 = $lb_actions{'LB_NOBREAK'};
907 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
908 = $lb_actions{'LB_NOBREAK'};
909 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
910 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
911
912 # CP × (AL | HL | NU)
289ce9cc
KW
913 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
914 = $lb_actions{'LB_NOBREAK'};
915 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
916 = $lb_actions{'LB_NOBREAK'};
917 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
918 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
919
920 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
921 # IS × (AL | HL)
289ce9cc
KW
922 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
923 = $lb_actions{'LB_NOBREAK'};
924 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
925 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
926
927 # LB28 Do not break between alphabetics (“at”).
928 # (AL | HL) × (AL | HL)
289ce9cc
KW
929 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
930 = $lb_actions{'LB_NOBREAK'};
931 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
932 = $lb_actions{'LB_NOBREAK'};
933 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
934 = $lb_actions{'LB_NOBREAK'};
935 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
936 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
937
938 # LB27 Treat a Korean Syllable Block the same as ID.
939 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
940 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
941 = $lb_actions{'LB_NOBREAK'};
942 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
943 = $lb_actions{'LB_NOBREAK'};
944 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
945 = $lb_actions{'LB_NOBREAK'};
946 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
947 = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
949 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
950
951 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
952 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
953 = $lb_actions{'LB_NOBREAK'};
954 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
955 = $lb_actions{'LB_NOBREAK'};
956 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
957 = $lb_actions{'LB_NOBREAK'};
958 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
959 = $lb_actions{'LB_NOBREAK'};
960 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
962
963 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
964 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
965 = $lb_actions{'LB_NOBREAK'};
966 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
967 = $lb_actions{'LB_NOBREAK'};
968 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
969 = $lb_actions{'LB_NOBREAK'};
970 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
971 = $lb_actions{'LB_NOBREAK'};
972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
973 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
974
975 # LB26 Do not break a Korean syllable.
976 # JL × (JL | JV | H2 | H3)
977 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
978 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
979 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
980 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
981
982 # (JV | H2) × (JV | JT)
983 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
984 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
985 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
986 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
987
988 # (JT | H3) × JT
989 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
991
992 # LB25 Do not break between the following pairs of classes relevant to
993 # numbers, as tailored by example 7 in
994 # http://www.unicode.org/reports/tr14/#Examples
995 # We follow that tailoring because Unicode's test cases expect it
996 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
997 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
998 = $lb_actions{'LB_NOBREAK'};
999 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1000 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1001
1002 # Given that (OP | HY )? is optional, we have to test for it in code.
1003 # We add in the action (instead of overriding) for this, so that in
1004 # the code we can recover the underlying break value.
289ce9cc 1005 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1006 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1007 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 1008 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1009 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 1010 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 1011 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
1012 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1013
1014 # ( OP | HY ) × NU
289ce9cc
KW
1015 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1016 = $lb_actions{'LB_NOBREAK'};
1017 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1018 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1019
1020 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1021 # which can be rewritten as:
1022 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
1023 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1024 = $lb_actions{'LB_NOBREAK'};
1025 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1026 = $lb_actions{'LB_NOBREAK'};
1027 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1028 = $lb_actions{'LB_NOBREAK'};
1029 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1032 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1033
1034 # Like earlier where we have to test in code, we add in the action so
1035 # that we can recover the underlying values. This is done in rules
1036 # below, as well. The code assumes that we haven't added 2 actions.
1037 # Shoul a later Unicode release break that assumption, then tests
1038 # should start failing.
289ce9cc 1039 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 1040 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1041 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 1042 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1043 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 1044 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1045 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 1046 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1047 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 1048 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1049 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 1050 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1051 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 1052 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1053 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 1054 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1055 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 1056 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 1057 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
1058 += $lb_actions{'LB_SY_or_IS_then_various'};
1059
1060 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1061 # which can be rewritten as:
1062 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
1063 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1064 = $lb_actions{'LB_NOBREAK'};
1065 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1066 = $lb_actions{'LB_NOBREAK'};
6b659339 1067
289ce9cc 1068 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1069 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1070 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1071 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1072 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 1073 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1074 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
1075 += $lb_actions{'LB_various_then_PO_or_PR'};
1076
289ce9cc 1077 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1078 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1079 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1080 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1081 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 1082 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 1083 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
1084 += $lb_actions{'LB_various_then_PO_or_PR'};
1085
b0e24409
KW
1086 # LB24 Do not break between numeric prefix/postfix and letters, or between
1087 # letters and prefix/postfix.
1088 # (PR | PO) × (AL | HL)
289ce9cc
KW
1089 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1090 = $lb_actions{'LB_NOBREAK'};
1091 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1092 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
1093 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1094 = $lb_actions{'LB_NOBREAK'};
1095 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1096 = $lb_actions{'LB_NOBREAK'};
6b659339 1097
b0e24409
KW
1098 # (AL | HL) × (PR | PO)
1099 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1100 = $lb_actions{'LB_NOBREAK'};
1101 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1102 = $lb_actions{'LB_NOBREAK'};
1103 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1104 = $lb_actions{'LB_NOBREAK'};
1105 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1106 = $lb_actions{'LB_NOBREAK'};
1107
1108 # LB23a Do not break between numeric prefixes and ideographs, or between
1109 # ideographs and numeric postfixes.
1110 # PR × (ID | EB | EM)
1111 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1112 = $lb_actions{'LB_NOBREAK'};
1113 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1114 = $lb_actions{'LB_NOBREAK'};
1115 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1116 = $lb_actions{'LB_NOBREAK'};
1117
1118 # (ID | EB | EM) × PO
289ce9cc
KW
1119 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1120 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1121 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1122 = $lb_actions{'LB_NOBREAK'};
1123 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1124 = $lb_actions{'LB_NOBREAK'};
6b659339 1125
b0e24409 1126 # LB23 Do not break between digits and letters
6b659339 1127 # (AL | HL) × NU
289ce9cc
KW
1128 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1129 = $lb_actions{'LB_NOBREAK'};
1130 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1131 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1132
1133 # NU × (AL | HL)
289ce9cc
KW
1134 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1135 = $lb_actions{'LB_NOBREAK'};
1136 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1137 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1138
1139 # LB22 Do not break between two ellipses, or between letters, numbers or
1140 # exclamations and ellipsis.
1141 # (AL | HL) × IN
289ce9cc
KW
1142 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1143 = $lb_actions{'LB_NOBREAK'};
1144 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1145 = $lb_actions{'LB_NOBREAK'};
6b659339 1146
289ce9cc
KW
1147 # Exclamation × IN
1148 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1149 = $lb_actions{'LB_NOBREAK'};
6b659339 1150
b0e24409 1151 # (ID | EB | EM) × IN
289ce9cc
KW
1152 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1153 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1154 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1155 = $lb_actions{'LB_NOBREAK'};
1156 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1157 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1158
1159 # IN × IN
289ce9cc
KW
1160 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1161 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1162
1163 # NU × IN
289ce9cc
KW
1164 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1165 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1166
1167 # LB21b Don’t break between Solidus and Hebrew letters.
1168 # SY × HL
289ce9cc
KW
1169 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1170 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1171
1172 # LB21a Don't break after Hebrew + Hyphen.
1173 # HL (HY | BA) ×
1174 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1175 $lb_table[$lb_enums{'Hyphen'}][$i]
1176 += $lb_actions{'LB_HY_or_BA_then_foo'};
1177 $lb_table[$lb_enums{'Break_After'}][$i]
1178 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1179 }
1180
1181 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1182 # spaces, small kana, and other non-starters, or after acute accents.
1183 # × BA
1184 # × HY
1185 # × NS
1186 # BB ×
1187 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1188 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1189 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1190 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1191 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1192 }
1193
1194 # LB20 Break before and after unresolved CB.
1195 # ÷ CB
1196 # CB ÷
1197 # Conditional breaks should be resolved external to the line breaking
1198 # rules. However, the default action is to treat unresolved CB as breaking
1199 # before and after.
1200 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1201 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1202 = $lb_actions{'LB_BREAKABLE'};
1203 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1204 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1205 }
1206
1207 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1208 # × QU
1209 # QU ×
1210 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1211 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1212 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1213 }
1214
1215 # LB18 Break after spaces
1216 # SP ÷
1217 for my $i (0 .. @lb_table - 1) {
289ce9cc 1218 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1219 }
1220
1221 # LB17 Do not break within ‘——’, even with intervening spaces.
1222 # B2 SP* × B2
289ce9cc 1223 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1224 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1225
1226 # LB16 Do not break between closing punctuation and a nonstarter even with
1227 # intervening spaces.
1228 # (CL | CP) SP* × NS
289ce9cc 1229 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1230 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1231 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1232 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1233
1234
1235 # LB15 Do not break within ‘”[’, even with intervening spaces.
1236 # QU SP* × OP
289ce9cc 1237 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1238 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1239
1240 # LB14 Do not break after ‘[’, even after spaces.
1241 # OP SP* ×
1242 for my $i (0 .. @lb_table - 1) {
289ce9cc 1243 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1244 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1245 }
1246
1247 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1248 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1249 # [^NU] × CL
1250 # [^NU] × CP
1251 # × EX
1252 # [^NU] × IS
1253 # [^NU] × SY
1254 for my $i (0 .. @lb_table - 1) {
289ce9cc 1255 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1256 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1257
289ce9cc 1258 next if $i == $lb_enums{'Numeric'};
6b659339 1259
289ce9cc 1260 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1261 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1262 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1263 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1264 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1265 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1266 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1267 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1268 }
1269
1270 # LB12a Do not break before NBSP and related characters, except after
1271 # spaces and hyphens.
1272 # [^SP BA HY] × GL
1273 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1274 next if $i == $lb_enums{'Space'}
1275 || $i == $lb_enums{'Break_After'}
1276 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1277
1278 # We don't break, but if a property above has said don't break even
1279 # with space between, don't override that (also in the next few rules)
289ce9cc 1280 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1281 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1282 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1283 }
1284
1285 # LB12 Do not break after NBSP and related characters.
1286 # GL ×
1287 for my $i (0 .. @lb_table - 1) {
289ce9cc 1288 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1289 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1290 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1291 }
1292
1293 # LB11 Do not break before or after Word joiner and related characters.
1294 # × WJ
1295 # WJ ×
1296 for my $i (0 .. @lb_table - 1) {
289ce9cc 1297 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1298 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1299 {
289ce9cc 1300 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1301 }
289ce9cc 1302 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1303 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1304 {
289ce9cc 1305 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1306 }
1307 }
1308
1309 # Special case this here to avoid having to do a special case in the code,
1310 # by making this the same as other things with a SP in front of them that
1311 # don't break, we avoid an extra test
289ce9cc 1312 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1313 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1314
1315 # LB9 and LB10 are done in the same loop
1316 #
1317 # LB9 Do not break a combining character sequence; treat it as if it has
1318 # the line breaking class of the base character in all of the
b0e24409
KW
1319 # higher-numbered rules. Treat ZWJ as if it were CM
1320 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1321 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1322
b0e24409
KW
1323 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1324 # case where a CM or ZWJ is the first character on the line or follows SP,
1325 # BK, CR, LF, NL, or ZW.
6b659339
KW
1326 for my $i (0 .. @lb_table - 1) {
1327
b0e24409
KW
1328 # When the CM or ZWJ is the first in the pair, we don't know without
1329 # looking behind whether the CM or ZWJ is going to attach to an
1330 # earlier character, or not. So have to figure this out at runtime in
1331 # the code
1332 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1333 = $lb_actions{'LB_CM_ZWJ_foo'};
1334 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1335
1336 if ( $i == $lb_enums{'Mandatory_Break'}
1337 || $i == $lb_enums{'EDGE'}
1338 || $i == $lb_enums{'Carriage_Return'}
1339 || $i == $lb_enums{'Line_Feed'}
1340 || $i == $lb_enums{'Next_Line'}
1341 || $i == $lb_enums{'Space'}
1342 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1343 {
1344 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1345 # whatever 'Alphabetic' would do.
1346 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1347 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1348 $lb_table[$i][$lb_enums{'ZWJ'}]
1349 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1350 }
1351 else {
b0e24409
KW
1352 # For these classes, the CM or ZWJ combines, so doesn't break,
1353 # inheriting the type of nobreak from the master character.
289ce9cc 1354 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1355 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1356 {
289ce9cc
KW
1357 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1358 = $lb_actions{'LB_NOBREAK'};
6b659339 1359 }
b0e24409
KW
1360 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1361 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1362 {
1363 $lb_table[$i][$lb_enums{'ZWJ'}]
1364 = $lb_actions{'LB_NOBREAK'};
1365 }
6b659339
KW
1366 }
1367 }
1368
b0e24409
KW
1369 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1370 # base or emoji modifier. This rule prevents breaks within emoji joiner
1371 # sequences.
1372 # ZWJ × (ID | EB | EM)
1373 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1374 = $lb_actions{'LB_NOBREAK'};
1375 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1376 = $lb_actions{'LB_NOBREAK'};
1377 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1378 = $lb_actions{'LB_NOBREAK'};
1379
6b659339
KW
1380 # LB8 Break before any character following a zero-width space, even if one
1381 # or more spaces intervene.
1382 # ZW SP* ÷
1383 for my $i (0 .. @lb_table - 1) {
289ce9cc 1384 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1385 }
1386
1387 # Because of LB8-10, we need to look at context for "SP x", and this must
1388 # be done in the code. So override the existing rules for that, by adding
1389 # a constant to get new rules that tell the code it needs to look at
1390 # context. By adding this action instead of replacing the existing one,
1391 # we can get back to the original rule if necessary.
1392 for my $i (0 .. @lb_table - 1) {
289ce9cc 1393 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1394 }
1395
1396 # LB7 Do not break before spaces or zero width space.
1397 # × SP
1398 # × ZW
1399 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1400 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1401 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1402 }
1403
1404 # LB6 Do not break before hard line breaks.
1405 # × ( BK | CR | LF | NL )
1406 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1407 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1408 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1409 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1410 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1411 }
1412
1413 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1414 # CR × LF
1415 # CR !
1416 # LF !
1417 # NL !
1418 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1419 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1420 = $lb_actions{'LB_BREAKABLE'};
1421 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1422 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1423 }
289ce9cc
KW
1424 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1425 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1426
1427 # LB4 Always break after hard line breaks.
1428 # BK !
1429 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1430 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1431 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1432 }
1433
6b659339
KW
1434 # LB3 Always break at the end of text.
1435 # ! eot
b0e24409
KW
1436 # LB2 Never break at the start of text.
1437 # sot ×
6b659339 1438 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1439 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1440 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1441 }
1442
1443 # LB1 Assign a line breaking class to each code point of the input.
1444 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1445 # depending on criteria outside the scope of this algorithm.
1446 #
1447 # In the absence of such criteria all characters with a specific
1448 # combination of original class and General_Category property value are
1449 # resolved as follows:
1450 # Original Resolved General_Category
1451 # AI, SG, XX AL Any
1452 # SA CM Only Mn or Mc
1453 # SA AL Any except Mn and Mc
1454 # CJ NS Any
1455 #
1456 # This is done in mktables, so we never see any of the remapped-from
1457 # classes.
1458
289ce9cc
KW
1459 output_table_common('LB', \%lb_actions,
1460 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1461}
1462
7e54b87f
KW
1463sub output_WB_table() {
1464
1465 # Create and output the enums, #defines, and pair table for use in
1466 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1467
1468 # This uses the same mechanism in the other bounds tables generated by
1469 # this file. The actions that could override a 0 or 1 are added to those
1470 # numbers; the actions that clearly don't depend on the underlying rule
1471 # simply overwrite
1472 my %wb_actions = (
1473 WB_NOBREAK => 0,
1474 WB_BREAKABLE => 1,
1475 WB_hs_then_hs => 2,
b0e24409 1476 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1477 WB_DQ_then_HL => 4,
1478 WB_HL_then_DQ => 6,
1479 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1480 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1481 WB_MB_or_MN_or_SQ_then_NU => 12,
1482 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1483 WB_RI_then_RI => 16,
7e54b87f
KW
1484 );
1485
7e54b87f
KW
1486 # Construct the WB pair table.
1487 # The table is constructed in reverse order of the rules, to make the
1488 # lower-numbered, higher priority ones override the later ones, as the
1489 # algorithm stops at the earliest matching rule
1490
1491 my @wb_table;
1492 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
39c4defe 1493 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
7e54b87f
KW
1494
1495 # Otherwise, break everywhere (including around ideographs).
b0e24409 1496 # WB99 Any ÷ Any
7e54b87f
KW
1497 for my $i (0 .. $table_size - 1) {
1498 for my $j (0 .. $table_size - 1) {
1499 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1500 }
1501 }
1502
b0e24409
KW
1503 # Do not break within emoji flag sequences. That is, do not break between
1504 # regional indicator (RI) symbols if there is an odd number of RI
1505 # characters before the break point.
1506 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1507 # WB15 sot (RI RI)* RI × RI
289ce9cc 1508 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1509 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1510
1511 # Do not break within emoji modifier sequences.
1512 # WB14 ( E_Base | EBG ) × E_Modifier
1513 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1514 = $wb_actions{'WB_NOBREAK'};
1515 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1516 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1517
1518 # Do not break from extenders.
1519 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1520 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1521 = $wb_actions{'WB_NOBREAK'};
1522 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1523 = $wb_actions{'WB_NOBREAK'};
1524 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1525 = $wb_actions{'WB_NOBREAK'};
1526 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1527 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1528
1529 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1530 # × # ExtendNumLet
289ce9cc
KW
1531 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1532 = $wb_actions{'WB_NOBREAK'};
1533 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1534 = $wb_actions{'WB_NOBREAK'};
1535 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1536 = $wb_actions{'WB_NOBREAK'};
1537 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1538 = $wb_actions{'WB_NOBREAK'};
1539 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1540 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1541
1542 # Do not break between Katakana.
1543 # WB13 Katakana × Katakana
289ce9cc
KW
1544 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1545 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1546
1547 # Do not break within sequences, such as “3.2” or “3,456.789”.
1548 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1549 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1550 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1551 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1552 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1553 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1554 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1555
1556 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1557 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1558 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1559 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1560 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1561 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1562 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1563
1564 # Do not break within sequences of digits, or digits adjacent to letters
1565 # (“3a”, or “A3”).
1566 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1567 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1568 = $wb_actions{'WB_NOBREAK'};
1569 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1570 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1571
1572 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1573 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1574 = $wb_actions{'WB_NOBREAK'};
1575 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1576 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1577
1578 # WB8 Numeric × Numeric
289ce9cc
KW
1579 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1580 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1581
1582 # Do not break letters across certain punctuation.
1583 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1584 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1585 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1586
1587 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1588 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1589 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1590
1591 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1592 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1593 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1594
1595 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1596 # × (ALetter | Hebrew_Letter)
289ce9cc 1597 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1598 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1599 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1600 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1601 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1602 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1603 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1604 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1605 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1606 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1607 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1608 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1609
1610 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1611 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1612 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1613 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1614 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1615 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1616 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1617 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1618 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1619 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1620 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1621 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1622 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1623 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1624
1625 # Do not break between most letters.
1626 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1627 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1628 = $wb_actions{'WB_NOBREAK'};
1629 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1630 = $wb_actions{'WB_NOBREAK'};
1631 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1632 = $wb_actions{'WB_NOBREAK'};
1633 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1634 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1635
b0e24409
KW
1636 # Ignore Format and Extend characters, except after sot, CR, LF, and
1637 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1638 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1639 for my $i (0 .. @wb_table - 1) {
289ce9cc 1640 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1641 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1642 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1643 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1644 $wb_table[$wb_enums{'ZWJ'}][$i]
1645 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1646 }
1647 for my $i (0 .. @wb_table - 1) {
1648 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1649 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1650 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1651 }
1652
1653 # Implied is that these attach to the character before them, except for
1654 # the characters that mark the end of a region of text. The rules below
1655 # override the ones set up here, for all the characters that need
1656 # overriding.
1657 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1658 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1659 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1660 }
1661
b0e24409
KW
1662 # Do not break within emoji zwj sequences.
1663 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1664 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1665 = $wb_actions{'WB_NOBREAK'};
1666 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1667 = $wb_actions{'WB_NOBREAK'};
1668
7e54b87f
KW
1669 # Break before and after white space
1670 # WB3b ÷ (Newline | CR | LF)
1671 # WB3a (Newline | CR | LF) ÷
1672 # et. al.
289ce9cc 1673 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1674 for my $j (0 .. @wb_table - 1) {
1675 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1676 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1677 }
1678 }
1679
1680 # But do not break within white space.
1681 # WB3 CR × LF
1682 # et.al.
289ce9cc
KW
1683 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1684 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1685 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1686 }
1687 }
1688
b0e24409 1689 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1690 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1691 = $wb_actions{'WB_NOBREAK'};
1692 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1693 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1694 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1695 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1696 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1697 [$wb_enums{'Perl_Tailored_HSpace'}]
1698 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1699
b0e24409
KW
1700 # Break at the start and end of text, unless the text is empty
1701 # WB2 Any ÷ eot
1702 # WB1 sot ÷ Any
7e54b87f 1703 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1704 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1705 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1706 }
289ce9cc 1707 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1708
289ce9cc
KW
1709 output_table_common('WB', \%wb_actions,
1710 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1711}
1712
9d9177be
KW
1713output_invlist("Latin1", [ 0, 256 ]);
1714output_invlist("AboveLatin1", [ 256 ]);
1715
bffc0129 1716end_file_pound_if;
43b443dd 1717
3f427fd9
KW
1718# We construct lists for all the POSIX and backslash sequence character
1719# classes in two forms:
1720# 1) ones which match only in the ASCII range
1721# 2) ones which match either in the Latin1 range, or the entire Unicode range
1722#
1723# These get compiled in, and hence affect the memory footprint of every Perl
1724# program, even those not using Unicode. To minimize the size, currently
1725# the Latin1 version is generated for the beyond ASCII range except for those
1726# lists that are quite small for the entire range, such as for \s, which is 22
1727# UVs long plus 4 UVs (currently) for the header.
1728#
1729# To save even more memory, the ASCII versions could be derived from the
1730# larger ones at runtime, saving some memory (minus the expense of the machine
1731# instructions to do so), but these are all small anyway, so their total is
1732# about 100 UVs.
1733#
1734# In the list of properties below that get generated, the L1 prefix is a fake
1735# property that means just the Latin1 range of the full property (whose name
1736# has an X prefix instead of L1).
a02047bf
KW
1737#
1738# An initial & means to use the subroutine from this file instead of an
1739# official inversion list.
3f427fd9 1740
0c4ecf42
KW
1741for my $charset (get_supported_code_pages()) {
1742 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1743
99f21fb9 1744 @a2n = @{get_a2n($charset)};
226b74db
KW
1745 # Below is the list of property names to generate. '&' means to use the
1746 # subroutine to generate the inversion list instead of the generic code
1747 # below. Some properties have a comma-separated list after the name,
1748 # These are extra enums to add to those found in the Unicode tables.
99f21fb9
KW
1749 no warnings 'qw';
1750 # Ignore non-alpha in sort
1751 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1752 Assigned
1c8c3428
KW
1753 ASCII
1754 Cased
1755 VertSpace
1756 XPerlSpace
1757 XPosixAlnum
1758 XPosixAlpha
1759 XPosixBlank
1760 XPosixCntrl
1761 XPosixDigit
1762 XPosixGraph
1763 XPosixLower
1764 XPosixPrint
1765 XPosixPunct
1766 XPosixSpace
1767 XPosixUpper
1768 XPosixWord
1769 XPosixXDigit
1770 _Perl_Any_Folds
1771 &NonL1_Perl_Non_Final_Folds
1772 _Perl_Folds_To_Multi_Char
1773 &UpperLatin1
1774 _Perl_IDStart
1775 _Perl_IDCont
226b74db
KW
1776 _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
1777 _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
1778 _Perl_SB,SContinue,CR,Extend,LF,EDGE
1779 _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
1c8c3428 1780 )
226b74db
KW
1781 # NOTE that the convention is that extra enum
1782 # values come after the property name, separated by
1783 # commas, with the enums that aren't ever defined
1784 # by Unicode coming last, at least 4 all-uppercase
1785 # characters. The others are enum names that are
1786 # needed by perl, but aren't in all Unicode
1787 # releases.
0f5e3c71
KW
1788 ) {
1789
1790 # For the Latin1 properties, we change to use the eXtended version of the
1791 # base property, then go through the result and get rid of everything not
1792 # in Latin1 (above 255). Actually, we retain the element for the range
1793 # that crosses the 255/256 boundary if it is one that matches the
1794 # property. For example, in the Word property, there is a range of code
1795 # points that start at U+00F8 and goes through U+02C1. Instead of
1796 # artificially cutting that off at 256 because 256 is the first code point
1797 # above Latin1, we let the range go to its natural ending. That gives us
1798 # extra information with no added space taken. But if the range that
1799 # crosses the boundary is one that doesn't match the property, we don't
1800 # start a new range above 255, as that could be construed as going to
1801 # infinity. For example, the Upper property doesn't include the character
1802 # at 255, but does include the one at 256. We don't include the 256 one.
1803 my $prop_name = $prop;
1804 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1805 my $extra_enums = "";
1806 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1807 my $lookup_prop = $prop_name;
1808 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1809 or $lookup_prop =~ s/^L1//);
1810 my $nonl1_only = 0;
1811 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
99f21fb9 1812 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1813
1814 my @invlist;
99f21fb9
KW
1815 my @invmap;
1816 my $map_format;
1817 my $map_default;
1818 my $maps_to_code_point;
1819 my $to_adjust;
0f5e3c71
KW
1820 if ($is_local_sub) {
1821 @invlist = eval $lookup_prop;
289ce9cc 1822 die $@ if $@;
0f5e3c71
KW
1823 }
1824 else {
1825 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1826 if (! @invlist) {
99f21fb9 1827
ad85f59a
KW
1828 # If couldn't find a non-empty inversion list, see if it is
1829 # instead an inversion map
1830 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1831 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1832 if (! $list_ref) {
1833 # An empty return here could mean an unknown property, or
1834 # merely that the original inversion list is empty. Call
1835 # in scalar context to differentiate
1836 my $count = prop_invlist($lookup_prop,
1837 '_perl_core_internal_ok');
1838 die "Could not find inversion list for '$lookup_prop'"
1839 unless defined $count;
1840 }
1841 else {
18b852b3
KW
1842 @invlist = @$list_ref;
1843 @invmap = @$map_ref;
1844 $map_format = $format;
1845 $map_default = $default;
1846 $maps_to_code_point = $map_format =~ /x/;
1847 $to_adjust = $map_format =~ /a/;
ad85f59a 1848 }
99f21fb9 1849 }
0f5e3c71 1850 }
ad85f59a
KW
1851
1852
1853 # Short-circuit an empty inversion list.
1854 if (! @invlist) {
1855 output_invlist($prop_name, \@invlist, $charset);
1856 next;
1857 }
ceb1de32 1858
99f21fb9
KW
1859 # Re-order the Unicode code points to native ones for this platform.
1860 # This is only needed for code points below 256, because native code
1861 # points are only in that range. For inversion maps of properties
1862 # where the mappings are adjusted (format =~ /a/), this reordering
1863 # could mess up the adjustment pattern that was in the input, so that
1864 # has to be dealt with.
1865 #
1866 # And inversion maps that map to code points need to eventually have
1867 # all those code points remapped to native, and it's better to do that
1868 # here, going through the whole list not just those below 256. This
1869 # is because some inversion maps have adjustments (format =~ /a/)
1870 # which may be affected by the reordering. This code needs to be done
1871 # both for when we are translating the inversion lists for < 256, and
1872 # for the inversion maps for everything. By doing both in this loop,
1873 # we can share that code.
1874 #
1875 # So, we go through everything for an inversion map to code points;
1876 # otherwise, we can skip any remapping at all if we are going to
1877 # output only the above-Latin1 values, or if the range spans the whole
1878 # of 0..256, as the remap will also include all of 0..256 (256 not
1879 # 255 because a re-ordering could cause 256 to need to be in the same
1880 # range as 255.)
1881 if ((@invmap && $maps_to_code_point)
1882 || (! $nonl1_only || ($invlist[0] < 256
1883 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1884 {
fb4554ea 1885
99f21fb9 1886 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1887 # Look at all the ranges that start before 257.
1888 my @latin1_list;
1889 while (@invlist) {
1890 last if $invlist[0] > 256;
1891 my $upper = @invlist > 1
1892 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1893
1894 # To infinity. You may want to stop much much
1895 # earlier; going this high may expose perl
1896 # deficiencies with very large numbers.
1897 : $Unicode::UCD::MAX_CP;
fb4554ea 1898 for my $j ($invlist[0] .. $upper) {
99f21fb9 1899 push @latin1_list, a2n($j);
0f5e3c71 1900 }
fb4554ea
KW
1901
1902 shift @invlist; # Shift off the range that's in the list
1903 shift @invlist; # Shift off the range not in the list
0c4ecf42 1904 }
fb4554ea
KW
1905
1906 # Here @invlist contains all the ranges in the original that start
1907 # at code points above 256, and @latin1_list contains all the
1908 # native code points for ranges that start with a Unicode code
1909 # point below 257. We sort the latter and convert it to inversion
1910 # list format. Then simply prepend it to the list of the higher
1911 # code points.
1912 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1913 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1914 unshift @invlist, @latin1_list;
99f21fb9
KW
1915 }
1916 else { # Is an inversion map
1917
1918 # This is a similar procedure as plain inversion list, but has
1919 # multiple buckets. A plain inversion list just has two
1920 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1921 # pretty much can ignore the 2nd bucket, as it is completely
1922 # defined by the 1st. But here, what we do is create buckets
1923 # which contain the code points that map to each, translated
1924 # to native and turned into an inversion list. Thus each
1925 # bucket is an inversion list of native code points that map
1926 # to it or don't map to it. We use these to create an
1927 # inversion map for the whole property.
1928
1929 # As mentioned earlier, we use this procedure to not just
1930 # remap the inversion list to native values, but also the maps
1931 # of code points to native ones. In the latter case we have
1932 # to look at the whole of the inversion map (or at least to
1933 # above Unicode; as the maps of code points above that should
1934 # all be to the default).
1935 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1936
1937 my %mapped_lists; # A hash whose keys are the buckets.
1938 while (@invlist) {
1939 last if $invlist[0] > $upper_limit;
1940
1941 # This shouldn't actually happen, as prop_invmap() returns
1942 # an extra element at the end that is beyond $upper_limit
1943 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1944
1945 my $bucket;
1946
1947 # A hash key can't be a ref (we are only expecting arrays
1948 # of scalars here), so convert any such to a string that
1949 # will be converted back later (using a vertical tab as
1950 # the separator). Even if the mapping is to code points,
1951 # we don't translate to native here because the code
d8049362 1952 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
1953 # input is Unicode, not native.
1954 if (ref $invmap[0]) {
1955 $bucket = join "\cK", @{$invmap[0]};
1956 }
1957 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1958
1959 # Do convert to native for maps to single code points.
1960 # There are some properties that have a few outlier
1961 # maps that aren't code points, so the above test
1962 # skips those.
1963 $bucket = a2n($invmap[0]);
1964 } else {
1965 $bucket = $invmap[0];
1966 }
1967
1968 # We now have the bucket that all code points in the range
1969 # map to, though possibly they need to be adjusted. Go
1970 # through the range and put each translated code point in
1971 # it into its bucket.
1972 my $base_map = $invmap[0];
1973 for my $j ($invlist[0] .. $invlist[1] - 1) {
1974 if ($to_adjust
1975 # The 1st code point doesn't need adjusting
1976 && $j > $invlist[0]
1977
1978 # Skip any non-numeric maps: these are outliers
1979 # that aren't code points.
1980 && $base_map =~ $numeric_re
1981
1982 # 'ne' because the default can be a string
1983 && $base_map ne $map_default)
1984 {
1985 # We adjust, by incrementing each the bucket and
1986 # the map. For code point maps, translate to
1987 # native
1988 $base_map++;
1989 $bucket = ($maps_to_code_point)
1990 ? a2n($base_map)
1991 : $base_map;
1992 }
1993
1994 # Add the native code point to the bucket for the
1995 # current map
1996 push @{$mapped_lists{$bucket}}, a2n($j);
1997 } # End of loop through all code points in the range
1998
1999 # Get ready for the next range
2000 shift @invlist;
2001 shift @invmap;
2002 } # End of loop through all ranges in the map.
2003
2004 # Here, @invlist and @invmap retain all the ranges from the
2005 # originals that start with code points above $upper_limit.
2006 # Each bucket in %mapped_lists contains all the code points
2007 # that map to that bucket. If the bucket is for a map to a
2008 # single code point is a single code point, the bucket has
2009 # been converted to native. If something else (including
2010 # multiple code points), no conversion is done.
2011 #
2012 # Now we recreate the inversion map into %xlated, but this
2013 # time for the native character set.
2014 my %xlated;
2015 foreach my $bucket (keys %mapped_lists) {
2016
2017 # Sort and convert this bucket to an inversion list. The
2018 # result will be that ranges that start with even-numbered
2019 # indexes will be for code points that map to this bucket;
2020 # odd ones map to some other bucket, and are discarded
2021 # below.
2022 @{$mapped_lists{$bucket}}
2023 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2024 @{$mapped_lists{$bucket}}
2025 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2026
2027 # Add each even-numbered range in the bucket to %xlated;
2028 # so that the keys of %xlated become the range start code
2029 # points, and the values are their corresponding maps.
2030 while (@{$mapped_lists{$bucket}}) {
2031 my $range_start = $mapped_lists{$bucket}->[0];
2032 if ($bucket =~ /\cK/) {
2033 @{$xlated{$range_start}} = split /\cK/, $bucket;
2034 }
2035 else {
2036 $xlated{$range_start} = $bucket;
2037 }
2038 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2039 shift @{$mapped_lists{$bucket}}; # Get ready for next
2040 # iteration
2041 }
2042 } # End of loop through all the buckets.
2043
2044 # Here %xlated's keys are the range starts of all the code
2045 # points in the inversion map. Construct an inversion list
2046 # from them.
2047 my @new_invlist = sort { $a <=> $b } keys %xlated;
2048
2049 # If the list is adjusted, we want to munge this list so that
2050 # we only have one entry for where consecutive code points map
2051 # to consecutive values. We just skip the subsequent entries
2052 # where this is the case.
2053 if ($to_adjust) {
2054 my @temp;
2055 for my $i (0 .. @new_invlist - 1) {
2056 next if $i > 0
2057 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2058 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2059 && $xlated{$new_invlist[$i]} =~ $numeric_re
2060 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2061 push @temp, $new_invlist[$i];
2062 }
2063 @new_invlist = @temp;
2064 }
2065
2066 # The inversion map comes from %xlated's values. We can
2067 # unshift each onto the front of the untouched portion, in
2068 # reverse order of the portion we did process.
2069 foreach my $start (reverse @new_invlist) {
2070 unshift @invmap, $xlated{$start};
2071 }
2072
2073 # Finally prepend the inversion list we have just constructed to the
2074 # one that contains anything we didn't process.
2075 unshift @invlist, @new_invlist;
2076 }
2077 }
2078
2079 # prop_invmap() returns an extra final entry, which we can now
2080 # discard.
2081 if (@invmap) {
2082 pop @invlist;
2083 pop @invmap;
ceb1de32 2084 }
0f5e3c71
KW
2085
2086 if ($l1_only) {
99f21fb9 2087 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
2088 for my $i (0 .. @invlist - 1 - 1) {
2089 if ($invlist[$i] > 255) {
2090
2091 # In an inversion list, even-numbered elements give the code
2092 # points that begin ranges that match the property;
2093 # odd-numbered give ones that begin ranges that don't match.
2094 # If $i is odd, we are at the first code point above 255 that
2095 # doesn't match, which means the range it is ending does
2096 # match, and crosses the 255/256 boundary. We want to include
2097 # this ending point, so increment $i, so the splice below
2098 # includes it. Conversely, if $i is even, it is the first
2099 # code point above 255 that matches, which means there was no
2100 # matching range that crossed the boundary, and we don't want
2101 # to include this code point, so splice before it.
2102 $i++ if $i % 2 != 0;
2103
2104 # Remove everything past this.
2105 splice @invlist, $i;
99f21fb9 2106 splice @invmap, $i if @invmap;
0f5e3c71
KW
2107 last;
2108 }
0c4ecf42
KW
2109 }
2110 }
0f5e3c71
KW
2111 elsif ($nonl1_only) {
2112 my $found_nonl1 = 0;
2113 for my $i (0 .. @invlist - 1 - 1) {
2114 next if $invlist[$i] < 256;
2115
2116 # Here, we have the first element in the array that indicates an
2117 # element above Latin1. Get rid of all previous ones.
2118 splice @invlist, 0, $i;
99f21fb9 2119 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2120
2121 # If this one's index is not divisible by 2, it means that this
2122 # element is inverting away from being in the list, which means
99f21fb9
KW
2123 # all code points from 256 to this one are in this list (or
2124 # map to the default for inversion maps)
2125 if ($i % 2 != 0) {
2126 unshift @invlist, 256;
2127 unshift @invmap, $map_default if @invmap;
2128 }
0f5e3c71 2129 $found_nonl1 = 1;
3f427fd9
KW
2130 last;
2131 }
0f5e3c71 2132 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2133 }
3f427fd9 2134
0f5e3c71 2135 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2136 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2137 }
bffc0129 2138 end_file_pound_if;
0c4ecf42 2139 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2140}
2141
973a28ed
KW
2142switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2143
2144output_GCB_table();
6b659339 2145output_LB_table();
7e54b87f 2146output_WB_table();
6b659339 2147
973a28ed
KW
2148end_file_pound_if;
2149
2308ab83 2150my $sources_list = "lib/unicore/mktables.lst";
216b41c2
KW
2151my @sources = ($0, qw(lib/unicore/mktables
2152 lib/Unicode/UCD.pm
2153 regen/charset_translations.pl
2154 ));
9a3da3ad
FC
2155{
2156 # Depend on mktables’ own sources. It’s a shorter list of files than
2157 # those that Unicode::UCD uses.
1ae6ead9 2158 if (! open my $mktables_list, '<', $sources_list) {
2308ab83
KW
2159
2160 # This should force a rebuild once $sources_list exists
2161 push @sources, $sources_list;
2162 }
2163 else {
2164 while(<$mktables_list>) {
2165 last if /===/;
2166 chomp;
2167 push @sources, "lib/unicore/$_" if /^[^#]/;
2168 }
9a3da3ad
FC
2169 }
2170}
6b659339
KW
2171
2172read_only_bottom_close_and_rename($out_fh, \@sources);