This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mk_invlists.pl: Simplify handling of early Unicode releases
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
99f21fb9
KW
5use Unicode::UCD qw(prop_aliases
6 prop_values
7 prop_value_aliases
8 prop_invlist
9 prop_invmap search_invlist
10 );
3d7c117d
MB
11require './regen/regen_lib.pl';
12require './regen/charset_translations.pl';
9d9177be
KW
13
14# This program outputs charclass_invlists.h, which contains various inversion
15# lists in the form of C arrays that are to be used as-is for inversion lists.
16# Thus, the lists it contains are essentially pre-compiled, and need only a
17# light-weight fast wrapper to make them usable at run-time.
18
19# As such, this code knows about the internal structure of these lists, and
20# any change made to that has to be done here as well. A random number stored
21# in the headers is used to minimize the possibility of things getting
22# out-of-sync, or the wrong data structure being passed. Currently that
23# random number is:
99f21fb9
KW
24
25# charclass_invlists.h now also has a partial implementation of inversion
26# maps; enough to generate tables for the line break properties, such as GCB
27
# Random number written into the header of every generated inversion list, so
# that a mismatch between this program and the internal inversion list
# structure can be noticed.
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# integer or float.  The fractional part is an optional, non-capturing group
# (it used to be spelled '(:?', which instead captured and allowed a stray
# colon before the decimal point).
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;

# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
36
9d9177be
KW
# Handle for the generated header; open_new() comes from regen_lib.pl
my $out_fh = open_new('charclass_invlists.h',
                      '>',
                      { style => '*',
                        by    => $0,
                        from  => "Unicode::UCD",
                      });

# The text of the file-level '#if' the output is currently inside of, or 0
# when it is not inside one.
my $in_file_pound_if = 0;

# In headings, how wide a name is allowed?
my $max_hdr_len = 3;

print {$out_fh} "/* See the generating file for comments */\n\n";

# The symbols generated by this program are all currently defined only in a
# single dot c each.  The code knows where most of them go, but this hash
# gives overrides for the exceptions to the typical place
my %exceptions_to_where_to_define = (
    ( map { $_ => 'PERL_IN_REGCOMP_C' } qw( NonL1_Perl_Non_Final_Folds
                                            AboveLatin1
                                            Latin1
                                            UpperLatin1
                                            _Perl_Any_Folds
                                            _Perl_Folds_To_Multi_Char
                                          ) ),
    ( map { $_ => 'PERL_IN_UTF8_C' }    qw( _Perl_IDCont
                                            _Perl_IDStart
                                          ) ),
);

# For each of the break properties: the enum name => value mapping, the short
# names indexed by enum value, and the abbreviation => enum name mapping.
# These are filled in by output_invmap().
my (%gcb_enums, @gcb_short_enums, %gcb_abbreviations);
my (%lb_enums,  @lb_short_enums,  %lb_abbreviations);
my (%wb_enums,  @wb_short_enums,  %wb_abbreviations);

# Lookup table used by a2n(), indexed by code point
my @a2n;
72
sub uniques {
    # Returns the input values with later repeats dropped; the first
    # occurrence of each value is kept, in its original position.

    my %already_returned;
    my @distinct;

    for my $value (@_) {
        next if $already_returned{$value}++;
        push @distinct, $value;
    }

    return @distinct;
}
80
sub a2n($) {
    my ($code_point) = @_;

    # Returns the input Unicode code point translated to native.  Only
    # numeric inputs no larger than 255 are looked up in @a2n; anything else
    # is handed back unchanged.

    if ($code_point =~ $numeric_re && $code_point <= 255) {
        return $a2n[$code_point];
    }

    return $code_point;
}
89
bffc0129
KW
sub end_file_pound_if {
    # Closes the file-level '#if' the output is currently inside of, if any,
    # and records that it is no longer inside one.

    return 0 unless $in_file_pound_if;

    print {$out_fh} "\n#endif\t/* $in_file_pound_if */\n";
    $in_file_pound_if = 0;
    return 0;
}
96
sub switch_pound_if ($$) {
    my $name = shift;
    my $new_pound_if = shift;

    # Switch to new #if given by the 2nd argument.  If there is an override
    # for this, it instead switches to that.  The 1st argument is the
    # static's name, used to look up the overrides

    if (exists $exceptions_to_where_to_define{$name}) {
        $new_pound_if = $exceptions_to_where_to_define{$name};
    }

    # This is exactly the text that gets stored in $in_file_pound_if when the
    # #if is entered, so a plain string comparison tells if we are already in
    # the right one.  (Interpolating the symbol into a pattern would treat it
    # as a regular expression and would also accept it as a mere substring of
    # a longer symbol.)
    my $wanted_pound_if = "defined($new_pound_if)";

    # Exit current #if if the new one is different from the old
    if ($in_file_pound_if && $in_file_pound_if ne $wanted_pound_if) {
        end_file_pound_if();
    }

    # Enter new #if, if not already in it.
    if (! $in_file_pound_if) {
        $in_file_pound_if = $wanted_pound_if;
        print $out_fh "\n#if $in_file_pound_if\n";
    }
}
122
sub output_invlist ($$;$) {
    my $name = shift;
    my $invlist = shift;        # Reference to inversion list array
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion list $invlist using the name $name for it.
    # It is output in the exact internal form for inversion lists.
    # Dies if $invlist is not an array reference.

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY';

    # Work on a copy, so that the 0 that may get prepended below doesn't show
    # up in the caller's array
    my @elements = @$invlist;

    # Is the last element of the header 0, or 1 ?
    my $zero_or_one = 0;
    if (@elements && $elements[0] != 0) {
        unshift @elements, 0;
        $zero_or_one = 1;
    }
    my $count = @elements;

    switch_pound_if ($name, 'PERL_IN_PERL_C');

    print $out_fh "\nstatic const UV ${name}_invlist[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The header
    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t 1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine.  Every element
    # but the final one is followed by a comma.
    for my $i (0 .. $#elements) {
        printf $out_fh "\t0x%X", $elements[$i];
        print $out_fh "," if $i < $#elements;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
164
99f21fb9
KW
sub output_invmap ($$$$$$$) {
    my $name = shift;
    my $invmap = shift;         # Reference to inversion map array
    my $prop_name = shift;
    my $input_format = shift;   # The inversion map's format
    my $default = shift;        # The property value for code points who
                                # otherwise don't have a value specified.
    my $extra_enums = shift;    # comma-separated list of our additions to the
                                # property's standard possible values
    my $charset = shift // "";  # name of character set for comment

    # Output the inversion map $invmap for property $prop_name, but use $name
    # as the actual data structure's name.
    #
    # Only the 's' input format is implemented; any other one dies.  For the
    # _Perl_GCB, _Perl_LB and _Perl_WB names, this also fills in, the first
    # time through, the file-level %xx_enums, @xx_short_enums and
    # %xx_abbreviations that the table generating routines use.

    # Check the map before anything dereferences it; otherwise a bad argument
    # gets caught by 'strict refs' with a much less helpful message.
    die "No inversion map for $prop_name" unless defined $invmap
                                             && ref $invmap eq 'ARRAY'
                                             && @$invmap;

    my $count = @$invmap;

    my $output_format;
    my $declaration_type;
    my %enums;
    my $name_prefix;

    if ($input_format eq 's') {
        $prop_name = (prop_aliases($prop_name))[1]
                     // $prop_name =~ s/^_Perl_//r; # Get full name
        my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
        my @input_enums;

        # Find all the possible input values.  These become the enum names
        # that comprise the inversion map.  (The comparison block is given
        # explicitly so that 'sort' can't take 'uniques' to be its
        # comparison routine.)
        @input_enums = sort { $a cmp $b } uniques(@$invmap);

        # The internal enums come last, and in the order specified.
        my @enums = @input_enums;
        my @extras;
        if ($extra_enums ne "") {
            @extras = split /,/, $extra_enums;

            # Don't add if already there.
            foreach my $this_extra (@extras) {
                next if grep { $_ eq $this_extra } @enums;

                push @enums, $this_extra;
            }
        }

        # Assign a value to each element of the enum type we are creating.
        # The default value always gets 0; the others are arbitrarily
        # assigned.
        my $enum_val = 0;
        my $canonical_default = prop_value_aliases($prop_name, $default);
        $default = $canonical_default if defined $canonical_default;
        $enums{$default} = $enum_val++;

        for my $enum (@enums) {
            $enums{$enum} = $enum_val++ unless exists $enums{$enum};
        }

        # Calculate the data for the special tables output for these
        # properties.
        if ($name =~ / ^  _Perl_ (?: GCB | LB | WB ) $ /x) {

            # The data includes the hashes %gcb_enums, %lb_enums, etc.
            # Similarly we calculate column headings for the tables.
            #
            # We use string evals to allow the same code to work on
            # all the tables
            my $type = lc $prop_name;

            my $placeholder = "a";

            # Skip if we've already done this code, which populated
            # this hash.
            # NOTE(review): $@ is not looked at here, so should this eval
            # fail to compile, the whole section is silently skipped.
            if (eval "! \%${type}_enums") {

                # For each enum in the type ...
                foreach my $enum (sort keys %enums) {
                    my $value = $enums{$enum};
                    my $short;
                    my $abbreviated_from;

                    # Special case this wb property value to make the
                    # name more clear
                    if ($enum eq 'Perl_Tailored_HSpace') {
                        $short = 'hs';
                        $abbreviated_from = $enum;
                    }
                    else {

                        # Use the official short name, if found.
                        ($short) = prop_value_aliases($type, $enum);

                        if (! defined $short) {

                            # But if there is no official name, use the name
                            # that came from the data (if any).  Otherwise,
                            # the name had to come from the extras list.
                            # There are two types of values in that list.
                            #
                            # First are those enums that are not part of the
                            # property, but are defined by this code.  By
                            # convention these have all-caps names of at least
                            # 4 characters.  We use the lowercased name for
                            # these.
                            #
                            # Second are enums that are needed to get
                            # regexec.c to compile, but don't exist in all
                            # Unicode releases.  To get here, we must be
                            # compiling an earlier Unicode release that
                            # doesn't have that enum, so just use a unique
                            # anonymous name for it.
                            if (grep { $_ eq $enum } @input_enums) {
                                $short = $enum
                            }
                            elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
                                $short = $placeholder++;
                            }
                            else {
                                $short = lc $enum;
                            }
                        }
                    }

                    # If our short name is too long, or we already
                    # know that the name is an abbreviation, truncate
                    # to make sure it's short enough, and remember
                    # that we did this so we can later add a comment in the
                    # generated file
                    if (   $abbreviated_from
                        || length $short > $max_hdr_len)
                    {
                        $short = substr($short, 0, $max_hdr_len);
                        $abbreviated_from = $enum
                                            unless $abbreviated_from;

                        # If the name we are to display conflicts, try
                        # another.
                        while (eval "exists \$${type}_abbreviations{$short}")
                        {
                            die $@ if $@;

                            # The increment operator on strings doesn't work
                            # on those containing an '_', so just use the
                            # final portion.
                            my @short = split '_', $short;
                            $short[-1]++;
                            $short = join "_", @short;
                        }

                        eval "\$${type}_abbreviations{$short} = '$enum'";
                        die $@ if $@;
                    }

                    # Remember the mapping from the property value
                    # (enum) name to its value.
                    eval "\$${type}_enums{$enum} = $value";
                    die $@ if $@;

                    # Remember the inverse mapping to the short name
                    # so that we can properly label the generated
                    # table's rows and columns
                    eval "\$${type}_short_enums[$value] = '$short'";
                    die $@ if $@;
                }
            }
        }

        # Inversion map stuff is currently used only by regexec
        switch_pound_if($name, 'PERL_IN_REGEXEC_C');

        # The short names tend to be two lower case letters, but it looks
        # better for those if they are upper. XXX
        $short_name = uc($short_name) if length($short_name) < 3
                                || substr($short_name, 0, 1) =~ /[[:lower:]]/;
        $name_prefix = "${short_name}_";
        my $enum_count = keys %enums;
        print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", $enum_count, "\n";

        # Start the enum definition for this map.  The names are listed in
        # the order of their values.
        print $out_fh "\ntypedef enum {\n";
        my @enum_list;
        foreach my $enum (keys %enums) {
            $enum_list[$enums{$enum}] = $enum;
        }
        foreach my $i (0 .. @enum_list - 1) {
            my $enum_name = $enum_list[$i];
            print $out_fh  "\t${name_prefix}$enum_name = $i";
            print $out_fh "," if $i < $enum_count - 1;
            print $out_fh "\n";
        }
        $declaration_type = "${name_prefix}enum";
        print $out_fh "} $declaration_type;\n";
        # Finished with the enum definition.

        $output_format = "${name_prefix}%s";
    }
    else {
        die "'$input_format' invmap() format for '$prop_name' unimplemented";
    }

    # Now output the inversion map proper
    print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    # The main body are the scalars passed in to this routine.  Each is
    # output under the name prop_value_aliases() gives for it, when it gives
    # one.
    for my $i (0 .. $count - 1) {
        my $element = $invmap->[$i];
        my $full_element_name = prop_value_aliases($prop_name, $element);
        $element = $full_element_name if defined $full_element_name;
        $element = $name_prefix . $element;
        print $out_fh "\t$element";
        print $out_fh "," if $i < $count - 1;
        print $out_fh  "\n";
    }
    print $out_fh "};\n";
}
383
sub mk_invlist_from_sorted_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points.  The single parameter is a reference to that array.  An
    # empty array yields an empty list.  A code point may be repeated in the
    # input; the repeats are ignored.

    my $list_ref = shift;

    return unless @$list_ref;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        my $cp = $list_ref->[$i];

        # Because the input is sorted, anything below the end of the final
        # range is a repeat of a code point already in that range.
        next if $cp < $invlist[-1];

        if ($invlist[-1] == $cp) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $cp, $cp + 1;
        }
    }
    return @invlist;
}
408
# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al'
                                                           || $format eq 'a';

# The $cp_ref entries whose fold is to more than one character
my @has_multi_char_fold;

# Each code point that appears somewhere other than last in such a fold,
# listed once, in the order first seen
my @is_non_final_fold;

{
    # Lets us tell in one lookup if a code point is already on the non-finals
    # list, instead of having to scan that list each time.
    my %already_non_final;

    for my $i (0 .. @$folds_ref - 1) {
        next unless ref $folds_ref->[$i];   # Skip single-char folds
        push @has_multi_char_fold, $cp_ref->[$i];

        # Add to the non-finals list each code point that is in a non-final
        # position
        my @non_finals = @{$folds_ref->[$i]};
        pop @non_finals;                    # Discard the final one
        for my $non_final (@non_finals) {
            push @is_non_final_fold, $non_final
                                    unless $already_non_final{$non_final}++;
        }
    }
}
430
a02047bf
KW
sub _Perl_Non_Final_Folds {
    # Returns the inversion list for the code points in @is_non_final_fold.
    # That array is left sorted into ascending numeric order.

    my $ascending = sub { $a <=> $b };
    @is_non_final_fold = sort $ascending @is_non_final_fold;

    my @invlist = mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
    return @invlist;
}
435
99f21fb9
KW
sub prop_name_for_cmp ($) { # Sort helper
    my ($name) = @_;

    # Returns the comparison key for the input: everything from the first
    # comma on is dropped, then every non-alphabetic is dropped, and what is
    # left is lowercased.

    my $before_comma = $name =~ s/,.*//r;
    my $alphas_only = $before_comma =~ s/[[:^alpha:]]//gr;

    return lc $alphas_only;
}
446
sub UpperLatin1 {
    # Returns the inversion list for the code points 128 through 255.

    my @upper_latin1_code_points = (0x80 .. 0xFF);
    return mk_invlist_from_sorted_cp_list(\@upper_latin1_code_points);
}
450
289ce9cc
KW
sub output_table_common {

    # Common subroutine to actually output the generated rules table.
    #
    # The parameters are:
    #   $property                 used to form the C name of the table
    #   $table_value_defines_ref  if true, a reference to a hash of names and
    #                             their values, each output as a #define
    #   $table_ref                reference to the table, an array of arrays
    #                             with as many columns as rows
    #   $names_ref                reference to the array of names used to
    #                             label the rows and columns
    #   $abbreviations_ref        reference to a hash whose keys are those
    #                             names that are abbreviations, and whose
    #                             values are what they stand for
    #
    # Dies if an element of the table is too wide for its column.

    my ($property,
        $table_value_defines_ref,
        $table_ref,
        $names_ref,
        $abbreviations_ref) = @_;
    my $size = @$table_ref;

    # Output the #define list, sorted by numeric value
    if ($table_value_defines_ref) {
        my $max_name_length = 0;
        my @defines;

        # Put in order, and at the same time find the longest name
        while (my ($enum, $value) = each %$table_value_defines_ref) {
            $defines[$value] = $enum;

            my $length = length $enum;
            $max_name_length = $length if $length > $max_name_length;
        }

        print $out_fh "\n";

        # Output, so that the values are vertically aligned in a column after
        # the longest name
        foreach my $i (0 .. @defines - 1) {
            next unless defined $defines[$i];
            printf $out_fh "#define %-*s %2d\n",
                                      $max_name_length,
                                      $defines[$i],
                                      $i;
        }
    }

    my $column_width = 2;   # We currently allow 2 digits for the number

    # If the maximum value in the table is 1, it can be a bool.  (Being above
    # a U8 is not currently handled.)
    my $max_element = 0;
    for my $i (0 .. $size - 1) {
        for my $j (0 .. $size - 1) {
            next if $max_element >= $table_ref->[$i][$j];
            $max_element = $table_ref->[$i][$j];
        }
    }
    die "Need wider table column width given '$max_element'"
                                    if length $max_element > $column_width;

    my $table_type = ($max_element == 1)
                     ? 'bool'
                     : 'U8';

    # If a name is longer than the width set aside for a column, its column
    # needs to have increased spacing so that the name doesn't get truncated
    # nor run into an adjacent column
    my @spacers;

    # If we are being compiled on a Unicode version earlier than that which
    # this file was designed for, it may be that some of the property values
    # aren't in the current release, and so would be undefined if we didn't
    # define them ourselves.  Earlier code has done this, making them
    # lowercase characters of length one.  We look to see if any exist, so
    # that we can add an annotation to the output table
    my $has_placeholder = 0;

    for my $i (0 .. $size - 1) {

        # A name no longer than the column gives a zero or negative repeat
        # count, which yields an empty spacer
        no warnings 'numeric';
        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
        $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
    }

    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";

    # Calculate the column heading line
    my $header_line = "/* "
                    . (" " x $max_hdr_len)  # We let the row heading meld to
                                            # the '*/' for those that are at
                                            # the max
                    . " " x 3;    # Space for '*/ '
    # Now each column
    for my $i (0 .. $size - 1) {
        $header_line .= sprintf "%s%*s",
                                $spacers[$i],
                                    $column_width + 1, # 1 for the ','
                                     $names_ref->[$i];
    }
    $header_line .= " */\n";

    # If we have annotations, output it now.
    if ($has_placeholder || scalar %$abbreviations_ref) {
        my $text = "";
        foreach my $abbr (sort keys %$abbreviations_ref) {
            $text .= "; " if $text;
            $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
        }
        if ($has_placeholder) {
            $text .= "; other " if $text;
            $text .= "lowercase names are placeholders for"
                  .  " property values not defined until a later Unicode"
                  .  " release, so are irrelevant in this one, as they are"
                  .  " not assigned to any code points";
        }

        my $indent = " " x 3;
        $text = $indent . "/* $text */";

        # Wrap the text so that it is no wider than the table, which the
        # header line gives.
        my $output_width = length $header_line;
        WRAP: while (length($text) > $output_width) {

            # Find the first blank back from the right end to wrap at.
            my $wrap_at;
            for (my $i = $output_width - 1; $i > 0; $i--) {
                if (substr($text, $i, 1) eq " ") {
                    $wrap_at = $i;
                    last;
                }
            }

            # Nowhere at all to wrap; the rest goes out as one long line
            last WRAP unless defined $wrap_at;

            # What will remain is the tail, indented and with a '*' after
            # the indent.
            my $remaining = $indent . " * " . substr($text, $wrap_at + 1);

            # The blank found may be one of those in the indent and '*' that
            # begin the line, which happens when a single word is wider than
            # the table.  Wrapping there wouldn't shorten what remains, and
            # we would never finish; so just let the line be too long.
            last WRAP unless length($remaining) < length($text);

            print $out_fh substr($text, 0, $wrap_at), "\n";
            $text = $remaining;
        }

        # And any remaining
        print $out_fh $text, "\n" if $text;
    }

    # We calculated the header line earlier just to get its width so that we
    # could make sure the annotations fit into that.
    print $out_fh $header_line;

    # Now output the bulk of the table.
    for my $i (0 .. $size - 1) {

        # First the row heading.
        printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
        print $out_fh "{";  # Then the brace for this row

        # Then each column
        for my $j (0 .. $size -1) {
            print $out_fh $spacers[$j];
            printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
            print $out_fh "," if $j < $size - 1;
        }
        print $out_fh " }";
        print $out_fh "," if $i < $size - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}
607
973a28ed
KW
sub output_GCB_table() {

    # Create and output the pair table for use in determining Grapheme Cluster
    # Breaks, given in http://www.unicode.org/reports/tr29/.
    #
    # Rows are indexed by the value of the character before the potential
    # break; columns by that of the one after it.
    my %gcb_actions = (
        GCB_NOBREAK                 => 0,
        GCB_BREAKABLE               => 1,
        GCB_RI_then_RI              => 2,   # Rules 12 and 13
        GCB_EX_then_EM              => 3,   # Rule 10
    );
    my $no_break  = $gcb_actions{GCB_NOBREAK};
    my $breakable = $gcb_actions{GCB_BREAKABLE};

    # The table is constructed in reverse order of the rules, to make the
    # lower-numbered, higher priority ones override the later ones, as the
    # algorithm stops at the earliest matching rule

    my @gcb_table;
    my $table_size = @gcb_short_enums;

    # Stores the action for each 'before × after' combination of the given
    # property value names
    my $set_pairs = sub {
        my ($befores_ref, $afters_ref, $action) = @_;

        for my $before (@$befores_ref) {
            for my $after (@$afters_ref) {
                $gcb_table[$gcb_enums{$before}][$gcb_enums{$after}] = $action;
            }
        }
    };

    # Stores the action in the entire row of each name in the first list,
    # and in the entire column of each name in the second
    my $set_whole_rows_and_columns = sub {
        my ($row_names_ref, $column_names_ref, $action) = @_;

        for my $i (0 .. $#gcb_table) {
            for my $row_name (@$row_names_ref) {
                $gcb_table[$gcb_enums{$row_name}][$i] = $action;
            }
            for my $column_name (@$column_names_ref) {
                $gcb_table[$i][$gcb_enums{$column_name}] = $action;
            }
        }
    };

    # Otherwise, break everywhere.
    # GB99   Any ÷  Any
    @gcb_table = map { [ ($breakable) x $table_size ] } 1 .. $table_size;

    # Do not break within emoji flag sequences.  That is, do not break between
    # regional indicator (RI) symbols if there is an odd number of RI
    # characters before the break point.  Must be resolved in runtime code.
    #
    # GB12 sot (RI RI)* RI × RI
    # GB13 [^RI] (RI RI)* RI × RI
    $set_pairs->([ 'Regional_Indicator' ], [ 'Regional_Indicator' ],
                 $gcb_actions{GCB_RI_then_RI});

    # Do not break within emoji modifier sequences or emoji zwj sequences.
    # GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
    $set_pairs->([ 'ZWJ' ], [ 'Glue_After_Zwj', 'E_Base_GAZ' ], $no_break);

    # GB10  ( E_Base | E_Base_GAZ ) Extend* ×  E_Modifier
    $set_pairs->([ 'Extend' ], [ 'E_Modifier' ],
                 $gcb_actions{GCB_EX_then_EM});
    $set_pairs->([ 'E_Base', 'E_Base_GAZ' ], [ 'E_Modifier' ], $no_break);

    # Do not break before extending characters or ZWJ.
    # Do not break before SpacingMarks, or after Prepend characters.
    # GB9b  Prepend  ×
    # GB9a  × SpacingMark
    # GB9   ×  ( Extend | ZWJ )
    $set_whole_rows_and_columns->([ 'Prepend' ],
                                  [ 'SpacingMark', 'Extend', 'ZWJ' ],
                                  $no_break);

    # Do not break Hangul syllable sequences.
    # GB8  ( LVT | T)  ×  T
    $set_pairs->([ 'LVT', 'T' ], [ 'T' ], $no_break);

    # GB7  ( LV | V )  ×  ( V | T )
    $set_pairs->([ 'LV', 'V' ], [ 'V', 'T' ], $no_break);

    # GB6  L  ×  ( L | V | LV | LVT )
    $set_pairs->([ 'L' ], [ 'L', 'V', 'LV', 'LVT' ], $no_break);

    # Do not break between a CR and LF.  Otherwise, break before and after
    # controls.
    # GB5   ÷  ( Control | CR | LF )
    # GB4  ( Control | CR | LF )  ÷
    $set_whole_rows_and_columns->([ 'Control', 'CR', 'LF' ],
                                  [ 'Control', 'CR', 'LF' ],
                                  $breakable);

    # GB3  CR  ×  LF
    $set_pairs->([ 'CR' ], [ 'LF' ], $no_break);

    # Break at the start and end of text, unless the text is empty
    # GB1  sot  ÷
    # GB2   ÷  eot
    $set_whole_rows_and_columns->([ 'EDGE' ], [ 'EDGE' ], $breakable);
    $set_pairs->([ 'EDGE' ], [ 'EDGE' ], $no_break);

    output_table_common('GCB', \%gcb_actions,
                        \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
}
711
6b659339
KW
712sub output_LB_table() {
713
714 # Create and output the enums, #defines, and pair table for use in
715 # determining Line Breaks. This uses the default line break algorithm,
716 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
717 # in that page, as the Unicode-furnished tests assume that tailoring.
718
6b659339
KW
719 # The result is really just true or false. But we follow along with tr14,
720 # creating a rule which is false for something like X SP* X. That gets
721 # encoding 2. The rest of the actions are synthetic ones that indicate
722 # some context handling is required. These each are added to the
723 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
724 # value can be retrieved. Actually only rules from 7 through 18 (which
725 # are the ones where space matter) are possible to have 2 added to them.
726 # The others below add just 0 or 1. It might be possible for one
727 # synthetic rule to be added to another, yielding a larger value. This
728 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
729 # names of the middle grouping below, it is impossible for that to occur
730 # for them because they all start with mutually exclusive classes. That
731 # the final rule can't be added to any of the others isn't obvious from
732 # its name, so it is assigned a power of 2 higher than the others can get
733 # to so any addition would preserve all data. (And the code will reach an
734 # assert(0) on debugging builds should this happen.)
735 my %lb_actions = (
736 LB_NOBREAK => 0,
737 LB_BREAKABLE => 1,
738 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
739
b0e24409 740 LB_CM_ZWJ_foo => 3, # Rule 9
6b659339
KW
741 LB_SP_foo => 6, # Rule 18
742 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
743 LB_SY_or_IS_then_various => 11, # Rule 25
744 LB_HY_or_BA_then_foo => 13, # Rule 21
b0e24409 745 LB_RI_then_RI => 15, # Rule 30a
6b659339 746
b0e24409 747 LB_various_then_PO_or_PR => (1<<5), # Rule 25
6b659339
KW
748 );
749
6b659339
KW
750 # Construct the LB pair table. This is based on the rules in
751 # http://www.unicode.org/reports/tr14/, but modified as those rules are
752 # designed for someone taking a string of text and sequentially going
753 # through it to find the break opportunities, whereas, Perl requires
754 # determining if a given random spot is a break opportunity, without
755 # knowing all the entire string before it.
756 #
757 # The table is constructed in reverse order of the rules, to make the
758 # lower-numbered, higher priority ones override the later ones, as the
759 # algorithm stops at the earliest matching rule
760
761 my @lb_table;
762 my $table_size = @lb_short_enums;
763
764 # LB31. Break everywhere else
765 for my $i (0 .. $table_size - 1) {
766 for my $j (0 .. $table_size - 1) {
767 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
768 }
769 }
770
b0e24409
KW
771 # LB30b Do not break between an emoji base and an emoji modifier.
772 # EB × EM
773 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
774 = $lb_actions{'LB_NOBREAK'};
775
776 # LB30a Break between two regional indicator symbols if and only if there
777 # are an even number of regional indicators preceding the position of the
778 # break.
779 # sot (RI RI)* RI × RI
780 # [^RI] (RI RI)* RI × RI
289ce9cc 781 $lb_table[$lb_enums{'Regional_Indicator'}]
b0e24409 782 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
6b659339
KW
783
784 # LB30 Do not break between letters, numbers, or ordinary symbols and
785 # opening or closing parentheses.
786 # (AL | HL | NU) × OP
289ce9cc
KW
787 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
788 = $lb_actions{'LB_NOBREAK'};
789 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
790 = $lb_actions{'LB_NOBREAK'};
791 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
792 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
793
794 # CP × (AL | HL | NU)
289ce9cc
KW
795 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
796 = $lb_actions{'LB_NOBREAK'};
797 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
798 = $lb_actions{'LB_NOBREAK'};
799 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
800 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
801
802 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
803 # IS × (AL | HL)
289ce9cc
KW
804 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
805 = $lb_actions{'LB_NOBREAK'};
806 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
807 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
808
809 # LB28 Do not break between alphabetics (“at”).
810 # (AL | HL) × (AL | HL)
289ce9cc
KW
811 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
812 = $lb_actions{'LB_NOBREAK'};
813 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
814 = $lb_actions{'LB_NOBREAK'};
815 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
816 = $lb_actions{'LB_NOBREAK'};
817 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
818 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
819
820 # LB27 Treat a Korean Syllable Block the same as ID.
821 # (JL | JV | JT | H2 | H3) × IN
289ce9cc
KW
822 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
823 = $lb_actions{'LB_NOBREAK'};
824 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
825 = $lb_actions{'LB_NOBREAK'};
826 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
827 = $lb_actions{'LB_NOBREAK'};
828 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
829 = $lb_actions{'LB_NOBREAK'};
830 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
831 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
832
833 # (JL | JV | JT | H2 | H3) × PO
289ce9cc
KW
834 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
835 = $lb_actions{'LB_NOBREAK'};
836 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
837 = $lb_actions{'LB_NOBREAK'};
838 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
839 = $lb_actions{'LB_NOBREAK'};
840 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
841 = $lb_actions{'LB_NOBREAK'};
842 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
843 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
844
845 # PR × (JL | JV | JT | H2 | H3)
289ce9cc
KW
846 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
847 = $lb_actions{'LB_NOBREAK'};
848 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
849 = $lb_actions{'LB_NOBREAK'};
850 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
851 = $lb_actions{'LB_NOBREAK'};
852 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
853 = $lb_actions{'LB_NOBREAK'};
854 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
855 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
856
857 # LB26 Do not break a Korean syllable.
858 # JL × (JL | JV | H2 | H3)
859 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
860 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
861 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
862 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
863
864 # (JV | H2) × (JV | JT)
865 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
866 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
867 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
868 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
869
870 # (JT | H3) × JT
871 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
872 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
873
874 # LB25 Do not break between the following pairs of classes relevant to
875 # numbers, as tailored by example 7 in
876 # http://www.unicode.org/reports/tr14/#Examples
877 # We follow that tailoring because Unicode's test cases expect it
878 # (PR | PO) × ( OP | HY )? NU
289ce9cc
KW
879 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
880 = $lb_actions{'LB_NOBREAK'};
881 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
882 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
883
884 # Given that (OP | HY )? is optional, we have to test for it in code.
885 # We add in the action (instead of overriding) for this, so that in
886 # the code we can recover the underlying break value.
289ce9cc 887 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 888 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 889 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
6b659339 890 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 891 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339 892 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
289ce9cc 893 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
6b659339
KW
894 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
895
896 # ( OP | HY ) × NU
289ce9cc
KW
897 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
898 = $lb_actions{'LB_NOBREAK'};
899 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
900 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
901
902 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
903 # which can be rewritten as:
904 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
289ce9cc
KW
905 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
906 = $lb_actions{'LB_NOBREAK'};
907 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
908 = $lb_actions{'LB_NOBREAK'};
909 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
910 = $lb_actions{'LB_NOBREAK'};
911 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
912 = $lb_actions{'LB_NOBREAK'};
913 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
914 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
915
916 # Like earlier where we have to test in code, we add in the action so
917 # that we can recover the underlying values. This is done in rules
918 # below, as well. The code assumes that we haven't added 2 actions.
919 # Should a later Unicode release break that assumption, then tests
920 # should start failing.
289ce9cc 921 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
6b659339 922 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 923 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
6b659339 924 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 925 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
6b659339 926 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 927 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
6b659339 928 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 929 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
6b659339 930 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 931 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
6b659339 932 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 933 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
6b659339 934 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 935 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
6b659339 936 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 937 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
6b659339 938 += $lb_actions{'LB_SY_or_IS_then_various'};
289ce9cc 939 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
6b659339
KW
940 += $lb_actions{'LB_SY_or_IS_then_various'};
941
942 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
943 # which can be rewritten as:
944 # NU (SY | IS)* (CL | CP)? × (PO | PR)
289ce9cc
KW
945 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
946 = $lb_actions{'LB_NOBREAK'};
947 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
948 = $lb_actions{'LB_NOBREAK'};
6b659339 949
289ce9cc 950 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
6b659339 951 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 952 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
6b659339 953 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 954 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
6b659339 955 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 956 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
6b659339
KW
957 += $lb_actions{'LB_various_then_PO_or_PR'};
958
289ce9cc 959 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
6b659339 960 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 961 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
6b659339 962 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 963 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
6b659339 964 += $lb_actions{'LB_various_then_PO_or_PR'};
289ce9cc 965 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
6b659339
KW
966 += $lb_actions{'LB_various_then_PO_or_PR'};
967
b0e24409
KW
968 # LB24 Do not break between numeric prefix/postfix and letters, or between
969 # letters and prefix/postfix.
970 # (PR | PO) × (AL | HL)
289ce9cc
KW
971 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
972 = $lb_actions{'LB_NOBREAK'};
973 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
974 = $lb_actions{'LB_NOBREAK'};
289ce9cc
KW
975 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
976 = $lb_actions{'LB_NOBREAK'};
977 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
978 = $lb_actions{'LB_NOBREAK'};
6b659339 979
b0e24409
KW
980 # (AL | HL) × (PR | PO)
981 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
982 = $lb_actions{'LB_NOBREAK'};
983 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
984 = $lb_actions{'LB_NOBREAK'};
985 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
986 = $lb_actions{'LB_NOBREAK'};
987 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
988 = $lb_actions{'LB_NOBREAK'};
989
990 # LB23a Do not break between numeric prefixes and ideographs, or between
991 # ideographs and numeric postfixes.
992 # PR × (ID | EB | EM)
993 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
994 = $lb_actions{'LB_NOBREAK'};
995 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
996 = $lb_actions{'LB_NOBREAK'};
997 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
998 = $lb_actions{'LB_NOBREAK'};
999
1000 # (ID | EB | EM) × PO
289ce9cc
KW
1001 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1002 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1003 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1004 = $lb_actions{'LB_NOBREAK'};
1005 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1006 = $lb_actions{'LB_NOBREAK'};
6b659339 1007
b0e24409 1008 # LB23 Do not break between digits and letters
6b659339 1009 # (AL | HL) × NU
289ce9cc
KW
1010 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1011 = $lb_actions{'LB_NOBREAK'};
1012 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1013 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1014
1015 # NU × (AL | HL)
289ce9cc
KW
1016 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1017 = $lb_actions{'LB_NOBREAK'};
1018 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1019 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1020
1021 # LB22 Do not break between two ellipses, or between letters, numbers or
1022 # exclamations and ellipsis.
1023 # (AL | HL) × IN
289ce9cc
KW
1024 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1025 = $lb_actions{'LB_NOBREAK'};
1026 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1027 = $lb_actions{'LB_NOBREAK'};
6b659339 1028
289ce9cc
KW
1029 # Exclamation × IN
1030 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1031 = $lb_actions{'LB_NOBREAK'};
6b659339 1032
b0e24409 1033 # (ID | EB | EM) × IN
289ce9cc
KW
1034 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1035 = $lb_actions{'LB_NOBREAK'};
b0e24409
KW
1036 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1037 = $lb_actions{'LB_NOBREAK'};
1038 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1039 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1040
1041 # IN × IN
289ce9cc
KW
1042 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1043 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1044
1045 # NU × IN
289ce9cc
KW
1046 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1047 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1048
1049 # LB21b Don’t break between Solidus and Hebrew letters.
1050 # SY × HL
289ce9cc
KW
1051 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1052 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1053
1054 # LB21a Don't break after Hebrew + Hyphen.
1055 # HL (HY | BA) ×
1056 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1057 $lb_table[$lb_enums{'Hyphen'}][$i]
1058 += $lb_actions{'LB_HY_or_BA_then_foo'};
1059 $lb_table[$lb_enums{'Break_After'}][$i]
1060 += $lb_actions{'LB_HY_or_BA_then_foo'};
6b659339
KW
1061 }
1062
1063 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1064 # spaces, small kana, and other non-starters, or after acute accents.
1065 # × BA
1066 # × HY
1067 # × NS
1068 # BB ×
1069 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1070 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1071 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1072 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1073 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1074 }
1075
1076 # LB20 Break before and after unresolved CB.
1077 # ÷ CB
1078 # CB ÷
1079 # Conditional breaks should be resolved external to the line breaking
1080 # rules. However, the default action is to treat unresolved CB as breaking
1081 # before and after.
1082 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1083 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1084 = $lb_actions{'LB_BREAKABLE'};
1085 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1086 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1087 }
1088
1089 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1090 # × QU
1091 # QU ×
1092 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1093 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1094 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1095 }
1096
1097 # LB18 Break after spaces
1098 # SP ÷
1099 for my $i (0 .. @lb_table - 1) {
289ce9cc 1100 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1101 }
1102
1103 # LB17 Do not break within ‘——’, even with intervening spaces.
1104 # B2 SP* × B2
289ce9cc 1105 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
6b659339
KW
1106 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1107
1108 # LB16 Do not break between closing punctuation and a nonstarter even with
1109 # intervening spaces.
1110 # (CL | CP) SP* × NS
289ce9cc 1111 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
6b659339 1112 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1113 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
6b659339
KW
1114 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1115
1116
1117 # LB15 Do not break within ‘”[’, even with intervening spaces.
1118 # QU SP* × OP
289ce9cc 1119 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
6b659339
KW
1120 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1121
1122 # LB14 Do not break after ‘[’, even after spaces.
1123 # OP SP* ×
1124 for my $i (0 .. @lb_table - 1) {
289ce9cc 1125 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
6b659339
KW
1126 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1127 }
1128
1129 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1130 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1131 # [^NU] × CL
1132 # [^NU] × CP
1133 # × EX
1134 # [^NU] × IS
1135 # [^NU] × SY
1136 for my $i (0 .. @lb_table - 1) {
289ce9cc 1137 $lb_table[$i][$lb_enums{'Exclamation'}]
6b659339
KW
1138 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1139
289ce9cc 1140 next if $i == $lb_enums{'Numeric'};
6b659339 1141
289ce9cc 1142 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
6b659339 1143 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1144 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
6b659339 1145 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1146 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
6b659339 1147 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1148 $lb_table[$i][$lb_enums{'Break_Symbols'}]
6b659339
KW
1149 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1150 }
1151
1152 # LB12a Do not break before NBSP and related characters, except after
1153 # spaces and hyphens.
1154 # [^SP BA HY] × GL
1155 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1156 next if $i == $lb_enums{'Space'}
1157 || $i == $lb_enums{'Break_After'}
1158 || $i == $lb_enums{'Hyphen'};
6b659339
KW
1159
1160 # We don't break, but if a property above has said don't break even
1161 # with space between, don't override that (also in the next few rules)
289ce9cc 1162 next if $lb_table[$i][$lb_enums{'Glue'}]
6b659339 1163 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1164 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1165 }
1166
1167 # LB12 Do not break after NBSP and related characters.
1168 # GL ×
1169 for my $i (0 .. @lb_table - 1) {
289ce9cc 1170 next if $lb_table[$lb_enums{'Glue'}][$i]
6b659339 1171 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
289ce9cc 1172 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1173 }
1174
1175 # LB11 Do not break before or after Word joiner and related characters.
1176 # × WJ
1177 # WJ ×
1178 for my $i (0 .. @lb_table - 1) {
289ce9cc 1179 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
6b659339
KW
1180 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1181 {
289ce9cc 1182 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
6b659339 1183 }
289ce9cc 1184 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
6b659339
KW
1185 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1186 {
289ce9cc 1187 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1188 }
1189 }
1190
1191 # Special case this here to avoid having to do a special case in the code,
1192 # by making this the same as other things with a SP in front of them that
1193 # don't break, we avoid an extra test
289ce9cc 1194 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
6b659339
KW
1195 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1196
1197 # LB9 and LB10 are done in the same loop
1198 #
1199 # LB9 Do not break a combining character sequence; treat it as if it has
1200 # the line breaking class of the base character in all of the
b0e24409
KW
1201 # higher-numbered rules. Treat ZWJ as if it were CM
1202 # Treat X (CM|ZWJ)* as if it were X.
6b659339
KW
1203 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1204
b0e24409
KW
1205 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1206 # case where a CM or ZWJ is the first character on the line or follows SP,
1207 # BK, CR, LF, NL, or ZW.
6b659339
KW
1208 for my $i (0 .. @lb_table - 1) {
1209
b0e24409
KW
1210 # When the CM or ZWJ is the first in the pair, we don't know without
1211 # looking behind whether the CM or ZWJ is going to attach to an
1212 # earlier character, or not. So have to figure this out at runtime in
1213 # the code
1214 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1215 = $lb_actions{'LB_CM_ZWJ_foo'};
1216 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
289ce9cc
KW
1217
1218 if ( $i == $lb_enums{'Mandatory_Break'}
1219 || $i == $lb_enums{'EDGE'}
1220 || $i == $lb_enums{'Carriage_Return'}
1221 || $i == $lb_enums{'Line_Feed'}
1222 || $i == $lb_enums{'Next_Line'}
1223 || $i == $lb_enums{'Space'}
1224 || $i == $lb_enums{'ZWSpace'})
6b659339
KW
1225 {
1226 # For these classes, a following CM doesn't combine, and should do
289ce9cc
KW
1227 # whatever 'Alphabetic' would do.
1228 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1229 = $lb_table[$i][$lb_enums{'Alphabetic'}];
b0e24409
KW
1230 $lb_table[$i][$lb_enums{'ZWJ'}]
1231 = $lb_table[$i][$lb_enums{'Alphabetic'}];
6b659339
KW
1232 }
1233 else {
b0e24409
KW
1234 # For these classes, the CM or ZWJ combines, so doesn't break,
1235 # inheriting the type of nobreak from the master character.
289ce9cc 1236 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
6b659339
KW
1237 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1238 {
289ce9cc
KW
1239 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1240 = $lb_actions{'LB_NOBREAK'};
6b659339 1241 }
b0e24409
KW
1242 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1243 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1244 {
1245 $lb_table[$i][$lb_enums{'ZWJ'}]
1246 = $lb_actions{'LB_NOBREAK'};
1247 }
6b659339
KW
1248 }
1249 }
1250
b0e24409
KW
1251 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1252 # base or emoji modifier. This rule prevents breaks within emoji joiner
1253 # sequences.
1254 # ZWJ × (ID | EB | EM)
1255 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1256 = $lb_actions{'LB_NOBREAK'};
1257 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1258 = $lb_actions{'LB_NOBREAK'};
1259 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1260 = $lb_actions{'LB_NOBREAK'};
1261
6b659339
KW
1262 # LB8 Break before any character following a zero-width space, even if one
1263 # or more spaces intervene.
1264 # ZW SP* ÷
1265 for my $i (0 .. @lb_table - 1) {
289ce9cc 1266 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1267 }
1268
1269 # Because of LB8-10, we need to look at context for "SP x", and this must
1270 # be done in the code. So override the existing rules for that, by adding
1271 # a constant to get new rules that tell the code it needs to look at
1272 # context. By adding this action instead of replacing the existing one,
1273 # we can get back to the original rule if necessary.
1274 for my $i (0 .. @lb_table - 1) {
289ce9cc 1275 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
6b659339
KW
1276 }
1277
1278 # LB7 Do not break before spaces or zero width space.
1279 # × SP
1280 # × ZW
1281 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1282 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1283 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1284 }
1285
1286 # LB6 Do not break before hard line breaks.
1287 # × ( BK | CR | LF | NL )
1288 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1289 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1290 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1291 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1292 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1293 }
1294
1295 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1296 # CR × LF
1297 # CR !
1298 # LF !
1299 # NL !
1300 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1301 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1302 = $lb_actions{'LB_BREAKABLE'};
1303 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1304 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
6b659339 1305 }
289ce9cc
KW
1306 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1307 = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1308
1309 # LB4 Always break after hard line breaks.
1310 # BK !
1311 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1312 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1313 = $lb_actions{'LB_BREAKABLE'};
6b659339
KW
1314 }
1315
6b659339
KW
1316 # LB3 Always break at the end of text.
1317 # ! eot
b0e24409
KW
1318 # LB2 Never break at the start of text.
1319 # sot ×
6b659339 1320 for my $i (0 .. @lb_table - 1) {
289ce9cc
KW
1321 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1322 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
6b659339
KW
1323 }
1324
1325 # LB1 Assign a line breaking class to each code point of the input.
1326 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1327 # depending on criteria outside the scope of this algorithm.
1328 #
1329 # In the absence of such criteria all characters with a specific
1330 # combination of original class and General_Category property value are
1331 # resolved as follows:
1332 # Original Resolved General_Category
1333 # AI, SG, XX AL Any
1334 # SA CM Only Mn or Mc
1335 # SA AL Any except Mn and Mc
1336 # CJ NS Any
1337 #
1338 # This is done in mktables, so we never see any of the remapped-from
1339 # classes.
1340
289ce9cc
KW
1341 output_table_common('LB', \%lb_actions,
1342 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
6b659339
KW
1343}
1344
7e54b87f
KW
1345sub output_WB_table() {
1346
1347 # Create and output the enums, #defines, and pair table for use in
1348 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1349
1350 # This uses the same mechanism in the other bounds tables generated by
1351 # this file. The actions that could override a 0 or 1 are added to those
1352 # numbers; the actions that clearly don't depend on the underlying rule
1353 # simply overwrite
1354 my %wb_actions = (
1355 WB_NOBREAK => 0,
1356 WB_BREAKABLE => 1,
1357 WB_hs_then_hs => 2,
b0e24409 1358 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
7e54b87f
KW
1359 WB_DQ_then_HL => 4,
1360 WB_HL_then_DQ => 6,
1361 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1362 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1363 WB_MB_or_MN_or_SQ_then_NU => 12,
1364 WB_NU_then_MB_or_MN_or_SQ => 14,
b0e24409 1365 WB_RI_then_RI => 16,
7e54b87f
KW
1366 );
1367
7e54b87f
KW
1368 # Construct the WB pair table.
1369 # The table is constructed in reverse order of the rules, to make the
1370 # lower-numbered, higher priority ones override the later ones, as the
1371 # algorithm stops at the earliest matching rule
1372
1373 my @wb_table;
1374 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
39c4defe 1375 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
7e54b87f
KW
1376
1377 # Otherwise, break everywhere (including around ideographs).
b0e24409 1378 # WB99 Any ÷ Any
7e54b87f
KW
1379 for my $i (0 .. $table_size - 1) {
1380 for my $j (0 .. $table_size - 1) {
1381 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1382 }
1383 }
1384
b0e24409
KW
1385 # Do not break within emoji flag sequences. That is, do not break between
1386 # regional indicator (RI) symbols if there is an odd number of RI
1387 # characters before the break point.
1388 # WB16 [^RI] (RI RI)* RI × RI
c492f156 1389 # WB15 sot (RI RI)* RI × RI
289ce9cc 1390 $wb_table[$wb_enums{'Regional_Indicator'}]
b0e24409
KW
1391 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1392
1393 # Do not break within emoji modifier sequences.
1394 # WB14 ( E_Base | EBG ) × E_Modifier
1395 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1396 = $wb_actions{'WB_NOBREAK'};
1397 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1398 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1399
1400 # Do not break from extenders.
1401 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
289ce9cc
KW
1402 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1403 = $wb_actions{'WB_NOBREAK'};
1404 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1405 = $wb_actions{'WB_NOBREAK'};
1406 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1407 = $wb_actions{'WB_NOBREAK'};
1408 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1409 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1410
1411 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1412 # × # ExtendNumLet
289ce9cc
KW
1413 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1414 = $wb_actions{'WB_NOBREAK'};
1415 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1416 = $wb_actions{'WB_NOBREAK'};
1417 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1418 = $wb_actions{'WB_NOBREAK'};
1419 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1420 = $wb_actions{'WB_NOBREAK'};
1421 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1422 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1423
1424 # Do not break between Katakana.
1425 # WB13 Katakana × Katakana
289ce9cc
KW
1426 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1427 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1428
1429 # Do not break within sequences, such as “3.2” or “3,456.789”.
1430 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
289ce9cc 1431 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
7e54b87f 1432 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1433 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
7e54b87f 1434 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
289ce9cc 1435 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1436 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1437
1438 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
289ce9cc 1439 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
7e54b87f 1440 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1441 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
7e54b87f 1442 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
289ce9cc 1443 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
7e54b87f
KW
1444 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1445
1446 # Do not break within sequences of digits, or digits adjacent to letters
1447 # (“3a”, or “A3”).
1448 # WB10 Numeric × (ALetter | Hebrew_Letter)
289ce9cc
KW
1449 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1450 = $wb_actions{'WB_NOBREAK'};
1451 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1452 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1453
1454 # WB9 (ALetter | Hebrew_Letter) × Numeric
289ce9cc
KW
1455 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1456 = $wb_actions{'WB_NOBREAK'};
1457 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1458 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1459
1460 # WB8 Numeric × Numeric
289ce9cc
KW
1461 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1462 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1463
1464 # Do not break letters across certain punctuation.
1465 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
289ce9cc
KW
1466 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1467 += $wb_actions{'WB_DQ_then_HL'};
7e54b87f
KW
1468
1469 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
289ce9cc
KW
1470 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1471 += $wb_actions{'WB_HL_then_DQ'};
7e54b87f
KW
1472
1473 # WB7a Hebrew_Letter × Single_Quote
289ce9cc
KW
1474 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1475 = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1476
1477 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1478 # × (ALetter | Hebrew_Letter)
289ce9cc 1479 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
7e54b87f 1480 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1481 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1482 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1483 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
7e54b87f 1484 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1485 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f 1486 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1487 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
7e54b87f 1488 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
289ce9cc 1489 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
7e54b87f
KW
1490 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1491
1492 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1493 # | Single_Quote) (ALetter | Hebrew_Letter)
289ce9cc 1494 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1495 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1496 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
7e54b87f 1497 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1498 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
7e54b87f 1499 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1500 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
7e54b87f 1501 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1502 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
7e54b87f 1503 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
289ce9cc 1504 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
7e54b87f
KW
1505 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1506
1507 # Do not break between most letters.
1508 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
289ce9cc
KW
1509 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1510 = $wb_actions{'WB_NOBREAK'};
1511 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1512 = $wb_actions{'WB_NOBREAK'};
1513 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1514 = $wb_actions{'WB_NOBREAK'};
1515 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1516 = $wb_actions{'WB_NOBREAK'};
7e54b87f 1517
b0e24409
KW
1518 # Ignore Format and Extend characters, except after sot, CR, LF, and
1519 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1520 # WB4 X (Extend | Format | ZWJ)* → X
7e54b87f 1521 for my $i (0 .. @wb_table - 1) {
289ce9cc 1522 $wb_table[$wb_enums{'Extend'}][$i]
b0e24409 1523 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
289ce9cc 1524 $wb_table[$wb_enums{'Format'}][$i]
b0e24409
KW
1525 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1526 $wb_table[$wb_enums{'ZWJ'}][$i]
1527 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1528 }
1529 for my $i (0 .. @wb_table - 1) {
1530 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1531 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1532 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1533 }
1534
1535 # Implied is that these attach to the character before them, except for
1536 # the characters that mark the end of a region of text. The rules below
1537 # override the ones set up here, for all the characters that need
1538 # overriding.
1539 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1540 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1541 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
7e54b87f
KW
1542 }
1543
b0e24409
KW
1544 # Do not break within emoji zwj sequences.
1545 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1546 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1547 = $wb_actions{'WB_NOBREAK'};
1548 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1549 = $wb_actions{'WB_NOBREAK'};
1550
7e54b87f
KW
1551 # Break before and after white space
1552 # WB3b ÷ (Newline | CR | LF)
1553 # WB3a (Newline | CR | LF) ÷
1554 # et al.
289ce9cc 1555 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1556 for my $j (0 .. @wb_table - 1) {
1557 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1558 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1559 }
1560 }
1561
1562 # But do not break within white space.
1563 # WB3 CR × LF
1564 # et al.
289ce9cc
KW
1565 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1566 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
7e54b87f
KW
1567 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1568 }
1569 }
1570
b0e24409 1571 # And do not break horizontal space followed by Extend or Format or ZWJ
289ce9cc
KW
1572 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1573 = $wb_actions{'WB_NOBREAK'};
1574 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1575 = $wb_actions{'WB_NOBREAK'};
b0e24409
KW
1576 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1577 = $wb_actions{'WB_NOBREAK'};
289ce9cc
KW
1578 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1579 [$wb_enums{'Perl_Tailored_HSpace'}]
1580 = $wb_actions{'WB_hs_then_hs'};
7e54b87f 1581
b0e24409
KW
1582 # Break at the start and end of text, unless the text is empty
1583 # WB2 Any ÷ eot
1584 # WB1 sot ÷ Any
7e54b87f 1585 for my $i (0 .. @wb_table - 1) {
289ce9cc
KW
1586 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1587 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
7e54b87f 1588 }
289ce9cc 1589 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
7e54b87f 1590
289ce9cc
KW
1591 output_table_common('WB', \%wb_actions,
1592 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
7e54b87f
KW
1593}
1594
9d9177be
KW
# Emit two fixed inversion lists.  In an inversion list the even-indexed
# elements begin ranges that are in the list and the odd-indexed ones begin
# ranges that are not, so [ 0, 256 ] covers code points 0..255 and [ 256 ]
# covers everything from 256 upwards.
for my $fixed_list ([ "Latin1", [ 0, 256 ] ], [ "AboveLatin1", [ 256 ] ]) {
    my ($list_name, $list_ref) = @$fixed_list;
    output_invlist($list_name, $list_ref);
}

# NOTE(review): presumably closes the '#if' section currently open in the
# generated header -- confirm against end_file_pound_if().
end_file_pound_if;
43b443dd 1599
3f427fd9
KW
1600# We construct lists for all the POSIX and backslash sequence character
1601# classes in two forms:
1602# 1) ones which match only in the ASCII range
1603# 2) ones which match either in the Latin1 range, or the entire Unicode range
1604#
1605# These get compiled in, and hence affect the memory footprint of every Perl
1606# program, even those not using Unicode. To minimize the size, currently
1607# the Latin1 version is generated for the beyond ASCII range except for those
1608# lists that are quite small for the entire range, such as for \s, which is 22
1609# UVs long plus 4 UVs (currently) for the header.
1610#
1611# To save even more memory, the ASCII versions could be derived from the
1612# larger ones at runtime, saving some memory (minus the expense of the machine
1613# instructions to do so), but these are all small anyway, so their total is
1614# about 100 UVs.
1615#
1616# In the list of properties below that get generated, the L1 prefix is a fake
1617# property that means just the Latin1 range of the full property (whose name
1618# has an X prefix instead of L1).
a02047bf
KW
1619#
1620# An initial & means to use the subroutine from this file instead of an
1621# official inversion list.
3f427fd9 1622
0c4ecf42
KW
# Main generation loop.  It runs once per code page returned by
# get_supported_code_pages().  Each pass prints the conditional-compile start
# line, sets @a2n from get_a2n($charset), and then handles every property in
# the sorted qw() list below:
#   1) obtain its inversion list, either by evaluating the local subroutine
#      named after a leading '&', or from prop_invlist(), falling back to
#      prop_invmap() when that comes back empty;
#   2) translate the code points to native ones with a2n();
#   3) trim to the Latin1-only or non-Latin1-only portion when the name asks
#      for it (L1 / NonL1 prefixes);
#   4) call output_invlist(), plus output_invmap() when there is a map.
# The pass ends by printing the conditional-compile end line.
1623for my $charset (get_supported_code_pages()) {
1624 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1625
99f21fb9 1626 @a2n = @{get_a2n($charset)};
226b74db
KW
1627 # Below is the list of property names to generate. '&' means to use the
1628 # subroutine to generate the inversion list instead of the generic code
1629 # below. Some properties have a comma-separated list after the name;
1630 # these are extra enums to add to those found in the Unicode tables.
99f21fb9
KW
1631 no warnings 'qw';
1632 # Ignore non-alpha in sort
1633 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
c0382778 1634 Assigned
1c8c3428
KW
1635 ASCII
1636 Cased
1637 VertSpace
1638 XPerlSpace
1639 XPosixAlnum
1640 XPosixAlpha
1641 XPosixBlank
1642 XPosixCntrl
1643 XPosixDigit
1644 XPosixGraph
1645 XPosixLower
1646 XPosixPrint
1647 XPosixPunct
1648 XPosixSpace
1649 XPosixUpper
1650 XPosixWord
1651 XPosixXDigit
1652 _Perl_Any_Folds
1653 &NonL1_Perl_Non_Final_Folds
1654 _Perl_Folds_To_Multi_Char
1655 &UpperLatin1
1656 _Perl_IDStart
1657 _Perl_IDCont
226b74db
KW
1658 _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
1659 _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
1660 _Perl_SB,SContinue,CR,Extend,LF,EDGE
1661 _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
1c8c3428 1662 )
226b74db
KW
1663 # NOTE that the convention is that extra enum
1664 # values come after the property name, separated by
1665 # commas, with the enums that aren't ever defined
1666 # by Unicode coming last, at least 4 all-uppercase
1667 # characters. The others are enum names that are
1668 # needed by perl, but aren't in all Unicode
1669 # releases.
0f5e3c71
KW
1670 ) {
1671
1672 # For the Latin1 properties, we change to use the eXtended version of the
1673 # base property, then go through the result and get rid of everything not
1674 # in Latin1 (above 255). Actually, we retain the element for the range
1675 # that crosses the 255/256 boundary if it is one that matches the
1676 # property. For example, in the Word property, there is a range of code
1677 # points that start at U+00F8 and goes through U+02C1. Instead of
1678 # artificially cutting that off at 256 because 256 is the first code point
1679 # above Latin1, we let the range go to its natural ending. That gives us
1680 # extra information with no added space taken. But if the range that
1681 # crosses the boundary is one that doesn't match the property, we don't
1682 # start a new range above 255, as that could be construed as going to
1683 # infinity. For example, the Upper property doesn't include the character
1684 # at 255, but does include the one at 256. We don't include the 256 one.
1685 my $prop_name = $prop;
1686 my $is_local_sub = $prop_name =~ s/^&//;
99f21fb9
KW
1687 my $extra_enums = "";
1688 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
0f5e3c71
KW
1689 my $lookup_prop = $prop_name;
1690 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1691 or $lookup_prop =~ s/^L1//);
1692 my $nonl1_only = 0;
1693 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
# NOTE(review): any ",..." suffix was already removed from $prop_name
# above, and the greedy (.*) would consume it in any case, so
# $has_suffixes looks like it is always undef.  It is not referenced
# again anywhere in this loop.
99f21fb9 1694 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
0f5e3c71
KW
1695
1696 my @invlist;
99f21fb9
KW
1697 my @invmap;
1698 my $map_format;
1699 my $map_default;
1700 my $maps_to_code_point;
1701 my $to_adjust;
0f5e3c71
KW
1702 if ($is_local_sub) {
# For '&' entries $lookup_prop is the name of a subroutine in this
# file; evaluating the string is what calls it.
1703 @invlist = eval $lookup_prop;
289ce9cc 1704 die $@ if $@;
0f5e3c71
KW
1705 }
1706 else {
1707 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
99f21fb9 1708 if (! @invlist) {
99f21fb9 1709
ad85f59a
KW
1710 # If couldn't find a non-empty inversion list, see if it is
1711 # instead an inversion map
1712 my ($list_ref, $map_ref, $format, $default)
99f21fb9 1713 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
ad85f59a
KW
1714 if (! $list_ref) {
1715 # An empty return here could mean an unknown property, or
1716 # merely that the original inversion list is empty. Call
1717 # in scalar context to differentiate
1718 my $count = prop_invlist($lookup_prop,
1719 '_perl_core_internal_ok');
1720 die "Could not find inversion list for '$lookup_prop'"
1721 unless defined $count;
1722 }
1723 else {
18b852b3
KW
1724 @invlist = @$list_ref;
1725 @invmap = @$map_ref;
1726 $map_format = $format;
1727 $map_default = $default;
1728 $maps_to_code_point = $map_format =~ /x/;
1729 $to_adjust = $map_format =~ /a/;
ad85f59a 1730 }
99f21fb9 1731 }
0f5e3c71 1732 }
ad85f59a
KW
1733
1734
1735 # Short-circuit an empty inversion list.
1736 if (! @invlist) {
1737 output_invlist($prop_name, \@invlist, $charset);
1738 next;
1739 }
ceb1de32 1740
99f21fb9
KW
1741 # Re-order the Unicode code points to native ones for this platform.
1742 # This is only needed for code points below 256, because native code
1743 # points are only in that range. For inversion maps of properties
1744 # where the mappings are adjusted (format =~ /a/), this reordering
1745 # could mess up the adjustment pattern that was in the input, so that
1746 # has to be dealt with.
1747 #
1748 # And inversion maps that map to code points need to eventually have
1749 # all those code points remapped to native, and it's better to do that
1750 # here, going through the whole list not just those below 256. This
1751 # is because some inversion maps have adjustments (format =~ /a/)
1752 # which may be affected by the reordering. This code needs to be done
1753 # both for when we are translating the inversion lists for < 256, and
1754 # for the inversion maps for everything. By doing both in this loop,
1755 # we can share that code.
1756 #
1757 # So, we go through everything for an inversion map to code points;
1758 # otherwise, we can skip any remapping at all if we are going to
1759 # output only the above-Latin1 values, or if the range spans the whole
1760 # of 0..256, as the remap will also include all of 0..256 (256 not
1761 # 255 because a re-ordering could cause 256 to need to be in the same
1762 # range as 255.)
1763 if ((@invmap && $maps_to_code_point)
1764 || (! $nonl1_only || ($invlist[0] < 256
1765 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
ceb1de32 1766 {
fb4554ea 1767
99f21fb9 1768 if (! @invmap) { # Straight inversion list
fb4554ea
KW
1769 # Look at all the ranges that start before 257.
1770 my @latin1_list;
1771 while (@invlist) {
1772 last if $invlist[0] > 256;
1773 my $upper = @invlist > 1
1774 ? $invlist[1] - 1 # In range
8a6c81cf
KW
1775
1776 # To infinity. You may want to stop much much
1777 # earlier; going this high may expose perl
1778 # deficiencies with very large numbers.
1779 : $Unicode::UCD::MAX_CP;
fb4554ea 1780 for my $j ($invlist[0] .. $upper) {
99f21fb9 1781 push @latin1_list, a2n($j);
0f5e3c71 1782 }
fb4554ea
KW
1783
1784 shift @invlist; # Shift off the range that's in the list
1785 shift @invlist; # Shift off the range not in the list
0c4ecf42 1786 }
fb4554ea
KW
1787
1788 # Here @invlist contains all the ranges in the original that start
1789 # at code points above 256, and @latin1_list contains all the
1790 # native code points for ranges that start with a Unicode code
1791 # point below 257. We sort the latter and convert it to inversion
1792 # list format. Then simply prepend it to the list of the higher
1793 # code points.
1794 @latin1_list = sort { $a <=> $b } @latin1_list;
5a7e5385 1795 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
fb4554ea 1796 unshift @invlist, @latin1_list;
99f21fb9
KW
1797 }
1798 else { # Is an inversion map
1799
1800 # This is a similar procedure as plain inversion list, but has
1801 # multiple buckets. A plain inversion list just has two
1802 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1803 # pretty much can ignore the 2nd bucket, as it is completely
1804 # defined by the 1st. But here, what we do is create buckets
1805 # which contain the code points that map to each, translated
1806 # to native and turned into an inversion list. Thus each
1807 # bucket is an inversion list of native code points that map
1808 # to it or don't map to it. We use these to create an
1809 # inversion map for the whole property.
1810
1811 # As mentioned earlier, we use this procedure to not just
1812 # remap the inversion list to native values, but also the maps
1813 # of code points to native ones. In the latter case we have
1814 # to look at the whole of the inversion map (or at least to
1815 # above Unicode; as the maps of code points above that should
1816 # all be to the default).
1817 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1818
1819 my %mapped_lists; # A hash whose keys are the buckets.
1820 while (@invlist) {
1821 last if $invlist[0] > $upper_limit;
1822
1823 # This shouldn't actually happen, as prop_invmap() returns
1824 # an extra element at the end that is beyond $upper_limit
1825 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1826
1827 my $bucket;
1828
1829 # A hash key can't be a ref (we are only expecting arrays
1830 # of scalars here), so convert any such to a string that
1831 # will be converted back later (using a vertical tab as
1832 # the separator). Even if the mapping is to code points,
1833 # we don't translate to native here because the code
d8049362 1834 # output_invmap() calls to output these arrays assumes the
99f21fb9
KW
1835 # input is Unicode, not native.
1836 if (ref $invmap[0]) {
1837 $bucket = join "\cK", @{$invmap[0]};
1838 }
1839 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1840
1841 # Do convert to native for maps to single code points.
1842 # There are some properties that have a few outlier
1843 # maps that aren't code points, so the above test
1844 # skips those.
1845 $bucket = a2n($invmap[0]);
1846 } else {
1847 $bucket = $invmap[0];
1848 }
1849
1850 # We now have the bucket that all code points in the range
1851 # map to, though possibly they need to be adjusted. Go
1852 # through the range and put each translated code point in
1853 # it into its bucket.
1854 my $base_map = $invmap[0];
1855 for my $j ($invlist[0] .. $invlist[1] - 1) {
1856 if ($to_adjust
1857 # The 1st code point doesn't need adjusting
1858 && $j > $invlist[0]
1859
1860 # Skip any non-numeric maps: these are outliers
1861 # that aren't code points.
1862 && $base_map =~ $numeric_re
1863
1864 # 'ne' because the default can be a string
1865 && $base_map ne $map_default)
1866 {
1867 # We adjust, by incrementing both the bucket and
1868 # the map. For code point maps, translate to
1869 # native
1870 $base_map++;
1871 $bucket = ($maps_to_code_point)
1872 ? a2n($base_map)
1873 : $base_map;
1874 }
1875
1876 # Add the native code point to the bucket for the
1877 # current map
1878 push @{$mapped_lists{$bucket}}, a2n($j);
1879 } # End of loop through all code points in the range
1880
1881 # Get ready for the next range
1882 shift @invlist;
1883 shift @invmap;
1884 } # End of loop through all ranges in the map.
1885
1886 # Here, @invlist and @invmap retain all the ranges from the
1887 # originals that start with code points above $upper_limit.
1888 # Each bucket in %mapped_lists contains all the code points
1889 # that map to that bucket. If the bucket is for a map to a
1890 # single code point, the bucket has
1891 # been converted to native. If something else (including
1892 # multiple code points), no conversion is done.
1893 #
1894 # Now we recreate the inversion map into %xlated, but this
1895 # time for the native character set.
1896 my %xlated;
1897 foreach my $bucket (keys %mapped_lists) {
1898
1899 # Sort and convert this bucket to an inversion list. The
1900 # result will be that ranges that start with even-numbered
1901 # indexes will be for code points that map to this bucket;
1902 # odd ones map to some other bucket, and are discarded
1903 # below.
1904 @{$mapped_lists{$bucket}}
1905 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
1906 @{$mapped_lists{$bucket}}
1907 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
1908
1909 # Add each even-numbered range in the bucket to %xlated;
1910 # so that the keys of %xlated become the range start code
1911 # points, and the values are their corresponding maps.
1912 while (@{$mapped_lists{$bucket}}) {
1913 my $range_start = $mapped_lists{$bucket}->[0];
1914 if ($bucket =~ /\cK/) {
1915 @{$xlated{$range_start}} = split /\cK/, $bucket;
1916 }
1917 else {
1918 $xlated{$range_start} = $bucket;
1919 }
1920 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
1921 shift @{$mapped_lists{$bucket}}; # Get ready for next
1922 # iteration
1923 }
1924 } # End of loop through all the buckets.
1925
1926 # Here %xlated's keys are the range starts of all the code
1927 # points in the inversion map. Construct an inversion list
1928 # from them.
1929 my @new_invlist = sort { $a <=> $b } keys %xlated;
1930
1931 # If the list is adjusted, we want to munge this list so that
1932 # we only have one entry for where consecutive code points map
1933 # to consecutive values. We just skip the subsequent entries
1934 # where this is the case.
1935 if ($to_adjust) {
1936 my @temp;
1937 for my $i (0 .. @new_invlist - 1) {
1938 next if $i > 0
1939 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
1940 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
1941 && $xlated{$new_invlist[$i]} =~ $numeric_re
1942 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
1943 push @temp, $new_invlist[$i];
1944 }
1945 @new_invlist = @temp;
1946 }
1947
1948 # The inversion map comes from %xlated's values. We can
1949 # unshift each onto the front of the untouched portion, in
1950 # reverse order of the portion we did process.
1951 foreach my $start (reverse @new_invlist) {
1952 unshift @invmap, $xlated{$start};
1953 }
1954
1955 # Finally prepend the inversion list we have just constructed to the
1956 # one that contains anything we didn't process.
1957 unshift @invlist, @new_invlist;
1958 }
1959 }
1960
1961 # prop_invmap() returns an extra final entry, which we can now
1962 # discard.
1963 if (@invmap) {
1964 pop @invlist;
1965 pop @invmap;
ceb1de32 1966 }
0f5e3c71
KW
1967
1968 if ($l1_only) {
99f21fb9 1969 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
0f5e3c71
KW
1970 for my $i (0 .. @invlist - 1 - 1) {
1971 if ($invlist[$i] > 255) {
1972
1973 # In an inversion list, even-numbered elements give the code
1974 # points that begin ranges that match the property;
1975 # odd-numbered give ones that begin ranges that don't match.
1976 # If $i is odd, we are at the first code point above 255 that
1977 # doesn't match, which means the range it is ending does
1978 # match, and crosses the 255/256 boundary. We want to include
1979 # this ending point, so increment $i, so the splice below
1980 # includes it. Conversely, if $i is even, it is the first
1981 # code point above 255 that matches, which means there was no
1982 # matching range that crossed the boundary, and we don't want
1983 # to include this code point, so splice before it.
1984 $i++ if $i % 2 != 0;
1985
1986 # Remove everything past this.
1987 splice @invlist, $i;
# @invmap is necessarily empty at this point (see the die at the
# top of this branch), so this second splice never runs.
99f21fb9 1988 splice @invmap, $i if @invmap;
0f5e3c71
KW
1989 last;
1990 }
0c4ecf42
KW
1991 }
1992 }
0f5e3c71
KW
1993 elsif ($nonl1_only) {
1994 my $found_nonl1 = 0;
1995 for my $i (0 .. @invlist - 1 - 1) {
1996 next if $invlist[$i] < 256;
1997
1998 # Here, we have the first element in the array that indicates an
1999 # element above Latin1. Get rid of all previous ones.
2000 splice @invlist, 0, $i;
99f21fb9 2001 splice @invmap, 0, $i if @invmap;
0f5e3c71
KW
2002
2003 # If this one's index is not divisible by 2, it means that this
2004 # element is inverting away from being in the list, which means
99f21fb9
KW
2005 # all code points from 256 to this one are in this list (or
2006 # map to the default for inversion maps)
2007 if ($i % 2 != 0) {
2008 unshift @invlist, 256;
2009 unshift @invmap, $map_default if @invmap;
2010 }
0f5e3c71 2011 $found_nonl1 = 1;
3f427fd9
KW
2012 last;
2013 }
0f5e3c71 2014 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9 2015 }
3f427fd9 2016
0f5e3c71 2017 output_invlist($prop_name, \@invlist, $charset);
99f21fb9 2018 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
0f5e3c71 2019 }
bffc0129 2020 end_file_pound_if;
0c4ecf42 2021 print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be
KW
2022}
2023
973a28ed
KW
# The boundary-pair tables are emitted under the 'Boundary_pair_tables'
# section.
# NOTE(review): presumably this makes them visible only when
# PERL_IN_REGEXEC_C is defined -- confirm against switch_pound_if().
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');

# Emit the tables in a fixed order: GCB, then LB, then WB.
for my $emit_table (\&output_GCB_table,
                    \&output_LB_table,
                    \&output_WB_table)
{
    $emit_table->();
}

end_file_pound_if;
2031
# Files that the generated header depends on: this script itself, plus the
# fixed set of files below, plus whatever mktables lists as its own inputs.
my $sources_list = "lib/unicore/mktables.lst";
my @sources = (
    $0,
    'lib/unicore/mktables',
    'lib/Unicode/UCD.pm',
    'regen/charset_translations.pl',
);

# Depend on mktables' own sources.  It's a shorter list of files than those
# that Unicode::UCD uses.
if (open my $list_fh, '<', $sources_list) {

    # Take every entry up to the first line containing '===', ignoring
    # empty lines and lines that begin with '#'.
    while (<$list_fh>) {
        last if /===/;
        chomp;
        next unless /^[^#]/;
        push @sources, "lib/unicore/$_";
    }
}
else {

    # The list could not be opened.  Depending on the list file itself
    # should force a rebuild once $sources_list exists.
    push @sources, $sources_list;
}

read_only_bottom_close_and_rename($out_fh, \@sources);