5 use Unicode::UCD qw(prop_aliases
9 prop_invmap search_invlist
11 require './regen/regen_lib.pl';
12 require './regen/charset_translations.pl';
14 # This program outputs charclass_invlists.h, which contains various inversion
15 # lists in the form of C arrays that are to be used as-is for inversion lists.
16 # Thus, the lists it contains are essentially pre-compiled, and need only a
17 # light-weight fast wrapper to make them usable at run-time.
19 # As such, this code knows about the internal structure of these lists, and
20 # any change made to that has to be done here as well. A random number stored
21 # in the headers is used to minimize the possibility of things getting
22 # out-of-sync, or the wrong data structure being passed. Currently that
25 # charclass_invlists.h now also has a partial implementation of inversion
26 # maps; enough to generate tables for the line break properties, such as GCB
28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
# Matches an integer or a decimal number, with an optional leading minus sign.
# The fractional part is a non-capturing group.  (It was previously written
# '(:?', a transposition of '(?:' that created a capturing group and allowed a
# stray ':' before the '.', so that e.g. "1:.5" was accepted as numeric.)
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;
33 # Matches valid C language enum names: begins with ASCII alphabetic, then any
35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
37 my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
41 my $in_file_pound_if = 0;
43 my $max_hdr_len = 3; # In headings, how wide a name is allowed?
45 print $out_fh "/* See the generating file for comments */\n\n";
47 # The symbols generated by this program are each currently defined in only a
48 # single .c file. The code knows where most of them go, but this hash
49 # gives overrides for the exceptions to the typical place
50 my %exceptions_to_where_to_define =
51 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
52 AboveLatin1 => 'PERL_IN_REGCOMP_C',
53 Latin1 => 'PERL_IN_REGCOMP_C',
54 UpperLatin1 => 'PERL_IN_REGCOMP_C',
55 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
56 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
57 _Perl_IDCont => 'PERL_IN_UTF8_C',
58 _Perl_IDStart => 'PERL_IN_UTF8_C',
61 # This hash contains the properties with enums that have hard-coded references
62 # to them in C code. It is needed to make sure that if perl is compiled
63 # with an older Unicode data set, that all the enum values the code is
64 # expecting will still be in the enum typedef. Thus the code doesn't have to
65 # change. The Unicode version won't have any code points that have the enum
66 # values not in that version, so the code that handles them will not get
67 # exercised. This is far better than having to #ifdef things. The names here
68 # should be the long names of the respective property values. The reason for
69 # this is because regexec.c uses them as case labels, and the long name is
70 # generally more understandable than the short.
71 my %hard_coded_enums =
126 'Regional_Indicator',
169 'Perl_Tailored_HSpace',
170 'Regional_Indicator',
178 my %gcb_abbreviations;
181 my %lb_abbreviations;
184 my %wb_abbreviations;
189 # Returns non-duplicated input values. From "Perl Best Practices:
190 # Encapsulated Cleverness". p. 455 in first edition.
193 return grep { ! $seen{$_}++ } @_;
199 # Returns the input Unicode code point translated to native.
201 return $cp if $cp !~ $numeric_re || $cp > 255;
205 sub end_file_pound_if {
206 if ($in_file_pound_if) {
207 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
208 $in_file_pound_if = 0;
212 sub switch_pound_if ($$) {
214 my $new_pound_if = shift;
216 # Switch to new #if given by the 2nd argument. If there is an override
217 # for this, it instead switches to that. The 1st argument is the
218 # static's name, used to look up the overrides
220 if (exists $exceptions_to_where_to_define{$name}) {
221 $new_pound_if = $exceptions_to_where_to_define{$name};
224 # Exit current #if if the new one is different from the old
225 if ($in_file_pound_if
226 && $in_file_pound_if !~ /$new_pound_if/)
231 # Enter new #if, if not already in it.
232 if (! $in_file_pound_if) {
233 $in_file_pound_if = "defined($new_pound_if)";
234 print $out_fh "\n#if $in_file_pound_if\n";
238 sub output_invlist ($$;$) {
240 my $invlist = shift; # Reference to inversion list array
241 my $charset = shift // ""; # name of character set for comment
243 die "No inversion list for $name" unless defined $invlist
244 && ref $invlist eq 'ARRAY';
246 # Output the inversion list $invlist using the name $name for it.
247 # It is output in the exact internal form for inversion lists.
249 # Is the last element of the header 0, or 1 ?
251 if (@$invlist && $invlist->[0] != 0) {
252 unshift @$invlist, 0;
255 my $count = @$invlist;
257 switch_pound_if ($name, 'PERL_IN_PERL_C');
259 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
260 print $out_fh " /* for $charset */" if $charset;
263 print $out_fh "\t$count,\t/* Number of elements */\n";
264 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
265 print $out_fh "\t", $zero_or_one,
266 ",\t/* 0 if the list starts at 0;",
267 "\n\t\t 1 if it starts at the element beyond 0 */\n";
269 # The main body are the UVs passed in to this routine. Do the final
271 for my $i (0 .. @$invlist - 1) {
272 printf $out_fh "\t0x%X", $invlist->[$i];
273 print $out_fh "," if $i < @$invlist - 1;
277 print $out_fh "};\n";
280 sub output_invmap ($$$$$$$) {
282 my $invmap = shift; # Reference to inversion map array
283 my $prop_name = shift;
284 my $input_format = shift; # The inversion map's format
285 my $default = shift; # The property value for code points who
286 # otherwise don't have a value specified.
287 my $extra_enums = shift; # comma-separated list of our additions to the
288 # property's standard possible values
289 my $charset = shift // ""; # name of character set for comment
291 # Output the inversion map $invmap for property $prop_name, but use $name
292 # as the actual data structure's name.
294 my $count = @$invmap;
297 my $declaration_type;
301 if ($input_format eq 's') {
302 my $orig_prop_name = $prop_name;
303 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
304 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
306 if ($orig_prop_name eq $prop_name) {
307 @enums = prop_values($prop_name);
310 @enums = uniques(@$invmap);
314 die "Only enum properties are currently handled; '$prop_name' isn't one";
317 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
318 my @canonical_input_enums;
319 if (@expected_enums) {
320 if (@expected_enums < @enums) {
321 die 'You need to update %hard_coded_enums to reflect new'
322 . " entries in this Unicode version\n"
323 . "Expected: " . join(", ", sort @expected_enums) . "\n"
324 . " Got: " . join(", ", sort @enums);
327 if (! defined prop_aliases($prop_name)) {
329 # Convert the input enums into canonical form and
331 @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
334 @enums = sort @expected_enums;
337 # The internal enums come last, and in the order specified
339 if ($extra_enums ne "") {
340 @extras = split /,/, $extra_enums;
341 push @enums, @extras;
344 # Assign a value to each element of the enum. The default
345 # value always gets 0; the others are arbitrarily assigned.
347 my $canonical_default = prop_value_aliases($prop_name, $default);
348 $default = $canonical_default if defined $canonical_default;
349 $enums{$default} = $enum_val++;
350 for my $enum (@enums) {
351 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
354 # Calculate the enum values for certain properties like
355 # _Perl_GCB and _Perl_LB, because we output special tables for
357 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
359 # We use string evals to allow the same code to work on
360 # all tables we're doing.
361 my $type = lc $prop_name;
363 # We use lowercase single letter names for any property
364 # values not in the release of Unicode being compiled now.
365 my $placeholder = "a";
367 # Skip if we've already done this code, which populated
369 if (eval "! \%${type}_enums") {
372 foreach my $enum (sort keys %enums) {
373 my $value = $enums{$enum};
375 my $abbreviated_from;
377 # Special case this wb property value to make the
379 if ($enum eq 'Perl_Tailored_HSpace') {
381 $abbreviated_from = $enum;
383 elsif (grep { $_ eq $enum } @extras) {
385 # The 'short' name for one of the property
386 # values added by this file is just the
390 elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
391 @canonical_input_enums)
392 { # On Unicode versions that predate the
393 # official property, we have set up this array
394 # to be the canonical form of each enum in the
395 # substitute property. If the enum we're
396 # looking at is canonically the same as one of
397 # these, use its name instead of generating a
398 # placeholder one in the next clause (which
399 # will happen because prop_value_aliases()
400 # will fail because it only works on official
405 # Use the official short name for the other
406 # property values, which should all be
408 ($short) = prop_value_aliases($type, $enum);
410 # But create a placeholder for ones not in
411 # this Unicode version.
412 $short = $placeholder++ unless defined $short;
415 # If our short name is too long, or we already
416 # know that the name is an abbreviation, truncate
417 # to make sure it's short enough, and remember
418 # that we did this so we can later place in a
419 # comment in the generated file
420 if ( $abbreviated_from
421 || length $short > $max_hdr_len)
423 $short = substr($short, 0, $max_hdr_len);
424 $abbreviated_from = $enum
425 unless $abbreviated_from;
426 # If the name we are to display conflicts, try
429 \$${type}_abbreviations{$short}")
435 eval "\$${type}_abbreviations{$short} = '$enum'";
439 # Remember the mapping from the property value
440 # (enum) name to its value.
441 eval "\$${type}_enums{$enum} = $value";
444 # Remember the inverse mapping to the short name
445 # so that we can properly label the generated
446 # table's rows and columns
447 eval "\$${type}_short_enums[$value] = '$short'";
454 # Inversion map stuff is currently used only by regexec
455 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
458 # The short names tend to be two lower case letters, but it looks
459 # better for those if they are upper. XXX
460 $short_name = uc($short_name) if length($short_name) < 3
461 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
462 $name_prefix = "${short_name}_";
463 my $enum_count = keys %enums;
464 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
466 print $out_fh "\ntypedef enum {\n";
468 foreach my $enum (keys %enums) {
469 $enum_list[$enums{$enum}] = $enum;
471 foreach my $i (0 .. @enum_list - 1) {
472 my $name = $enum_list[$i];
473 print $out_fh "\t${name_prefix}$name = $i";
474 print $out_fh "," if $i < $enum_count - 1;
477 $declaration_type = "${name_prefix}enum";
478 print $out_fh "} $declaration_type;\n";
480 $output_format = "${name_prefix}%s";
484 die "'$input_format' invmap() format for '$prop_name' unimplemented";
487 die "No inversion map for $prop_name" unless defined $invmap
488 && ref $invmap eq 'ARRAY'
491 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
492 print $out_fh " /* for $charset */" if $charset;
495 # The main body consists of the scalars passed in to this routine.
496 for my $i (0 .. $count - 1) {
497 my $element = $invmap->[$i];
498 my $full_element_name = prop_value_aliases($prop_name, $element);
499 $element = $full_element_name if defined $full_element_name;
500 $element = $name_prefix . $element;
501 print $out_fh "\t$element";
502 print $out_fh "," if $i < $count - 1;
505 print $out_fh "};\n";
508 sub mk_invlist_from_sorted_cp_list {
510 # Returns an inversion list constructed from the sorted input array of
513 my $list_ref = shift;
515 return unless @$list_ref;
517 # Initialize to just the first element
518 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
520 # For each succeeding element, if it extends the previous range, adjust
521 # up, otherwise add it.
522 for my $i (1 .. @$list_ref - 1) {
523 if ($invlist[-1] == $list_ref->[$i]) {
527 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
533 # Read in the Case Folding rules, and construct arrays of code points for the
534 # properties we need.
535 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
536 die "Could not find inversion map for Case_Folding" unless defined $format;
537 die "Incorrect format '$format' for Case_Folding inversion map"
538 unless $format eq 'al'
540 my @has_multi_char_fold;
541 my @is_non_final_fold;
543 for my $i (0 .. @$folds_ref - 1) {
544 next unless ref $folds_ref->[$i]; # Skip single-char folds
545 push @has_multi_char_fold, $cp_ref->[$i];
547 # Add to the non-finals list each code point that is in a non-final
549 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
550 push @is_non_final_fold, $folds_ref->[$i][$j]
551 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
555 sub _Perl_Non_Final_Folds {
556 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
557 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
560 sub prop_name_for_cmp ($) { # Sort helper
563 # Returns the input lowercased, with non-alphas removed, as well as
564 # everything starting with a comma
567 $name =~ s/[[:^alpha:]]//g;
572 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
575 sub output_table_common {
577 # Common subroutine to actually output the generated rules table.
580 $table_value_defines_ref,
583 $abbreviations_ref) = @_;
584 my $size = @$table_ref;
586 # Output the #define list, sorted by numeric value
587 if ($table_value_defines_ref) {
588 my $max_name_length = 0;
591 # Put in order, and at the same time find the longest name
592 while (my ($enum, $value) = each %$table_value_defines_ref) {
593 $defines[$value] = $enum;
595 my $length = length $enum;
596 $max_name_length = $length if $length > $max_name_length;
601 # Output, so that the values are vertically aligned in a column after
603 foreach my $i (0 .. @defines - 1) {
604 next unless defined $defines[$i];
605 printf $out_fh "#define %-*s %2d\n",
612 my $column_width = 2; # We currently allow 2 digits for the number
614 # If the maximum value in the table is 1, it can be a bool. (Being above
615 # a U8 is not currently handled.)
617 for my $i (0 .. $size - 1) {
618 for my $j (0 .. $size - 1) {
619 next if $max_element >= $table_ref->[$i][$j];
620 $max_element = $table_ref->[$i][$j];
623 die "Need wider table column width given '$max_element"
624 if length $max_element > $column_width;
626 my $table_type = ($max_element == 1)
630 # If a name is longer than the width set aside for a column, its column
631 # needs to have increased spacing so that the name doesn't get truncated
632 # nor run into an adjacent column
635 # If we are being compiled on a Unicode version earlier than that which
636 # this file was designed for, it may be that some of the property values
637 # aren't in the current release, and so would be undefined if we didn't
638 # define them ourselves. Earlier code has done this, making them
639 # lowercase characters of length one. We look to see if any exist, so
640 # that we can add an annotation to the output table
641 my $has_placeholder = 0;
643 for my $i (0 .. $size - 1) {
644 no warnings 'numeric';
645 $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
646 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
649 print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
651 # Calculate the column heading line
652 my $header_line = "/* "
653 . (" " x $max_hdr_len) # We let the row heading meld to
654 # the '*/' for those that are at
656 . " " x 3; # Space for '*/ '
658 for my $i (0 .. $size - 1) {
659 $header_line .= sprintf "%s%*s",
661 $column_width + 1, # 1 for the ','
664 $header_line .= " */\n";
666 # If we have annotations, output it now.
667 if ($has_placeholder || scalar %$abbreviations_ref) {
669 foreach my $abbr (sort keys %$abbreviations_ref) {
670 $text .= "; " if $text;
671 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
673 if ($has_placeholder) {
674 $text .= "; other " if $text;
675 $text .= "lowercase names are placeholders for"
676 . " property values not defined until a later Unicode"
677 . " release, so are irrelevant in this one, as they are"
678 . " not assigned to any code points";
681 my $indent = " " x 3;
682 $text = $indent . "/* $text */";
684 # Wrap the text so that it is no wider than the table, which the
686 my $output_width = length $header_line;
687 while (length $text > $output_width) {
688 my $cur_line = substr($text, 0, $output_width);
690 # Find the first blank back from the right end to wrap at.
691 for (my $i = $output_width -1; $i > 0; $i--) {
692 if (substr($text, $i, 1) eq " ") {
693 print $out_fh substr($text, 0, $i), "\n";
695 # Set so that we look at just the remaining tail (which will
696 # be indented and have a '*' after the indent)
697 $text = $indent . " * " . substr($text, $i + 1);
704 print $out_fh $text, "\n" if $text;
707 # We calculated the header line earlier just to get its width so that we
708 # could make sure the annotations fit into that.
709 print $out_fh $header_line;
711 # Now output the bulk of the table.
712 for my $i (0 .. $size - 1) {
714 # First the row heading.
715 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
716 print $out_fh "{"; # Then the brace for this row
719 for my $j (0 .. $size -1) {
720 print $out_fh $spacers[$j];
721 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
722 print $out_fh "," if $j < $size - 1;
725 print $out_fh "," if $i < $size - 1;
729 print $out_fh "};\n";
732 sub output_GCB_table() {
734 # Create and output the pair table for use in determining Grapheme Cluster
735 # Breaks, given in http://www.unicode.org/reports/tr29/.
739 GCB_RI_then_RI => 2, # Rules 12 and 13
740 GCB_EX_then_EM => 3, # Rule 10
743 # The table is constructed in reverse order of the rules, to make the
744 # lower-numbered, higher priority ones override the later ones, as the
745 # algorithm stops at the earliest matching rule
748 my $table_size = @gcb_short_enums;
750 # Otherwise, break everywhere.
752 for my $i (0 .. $table_size - 1) {
753 for my $j (0 .. $table_size - 1) {
754 $gcb_table[$i][$j] = 1;
758 # Do not break within emoji flag sequences. That is, do not break between
759 # regional indicator (RI) symbols if there is an odd number of RI
760 # characters before the break point. Must be resolved in runtime code.
762 # GB12 ^ (RI RI)* RI × RI
763 # GB13 [^RI] (RI RI)* RI × RI
764 $gcb_table[$gcb_enums{'Regional_Indicator'}]
765 [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
767 # Do not break within emoji modifier sequences or emoji zwj sequences.
768 # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
769 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
770 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
772 # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
773 $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}]
774 = $gcb_actions{GCB_EX_then_EM};
775 $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0;
776 $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0;
778 # Do not break before extending characters or ZWJ.
779 # Do not break before SpacingMarks, or after Prepend characters.
782 # GB9 × ( Extend | ZWJ )
783 for my $i (0 .. @gcb_table - 1) {
784 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
785 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
786 $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
787 $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0;
790 # Do not break Hangul syllable sequences.
792 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
793 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
795 # GB7 ( LV | V ) × ( V | T )
796 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
797 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
798 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
799 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
801 # GB6 L × ( L | V | LV | LVT )
802 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
803 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
804 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
805 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
807 # Do not break between a CR and LF. Otherwise, break before and after
809 # GB5 ÷ ( Control | CR | LF )
810 # GB4 ( Control | CR | LF ) ÷
811 for my $i (0 .. @gcb_table - 1) {
812 $gcb_table[$i][$gcb_enums{'Control'}] = 1;
813 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
814 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
815 $gcb_table[$gcb_enums{'Control'}][$i] = 1;
816 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
817 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
821 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
823 # Break at the start and end of text, unless the text is empty
826 for my $i (0 .. @gcb_table - 1) {
827 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
828 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
830 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
832 output_table_common('GCB', \%gcb_actions,
833 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
836 sub output_LB_table() {
838 # Create and output the enums, #defines, and pair table for use in
839 # determining Line Breaks. This uses the default line break algorithm,
840 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
841 # in that page, as the Unicode-furnished tests assume that tailoring.
843 # The result is really just true or false. But we follow along with tr14,
844 # creating a rule which is false for something like X SP* X. That gets
845 # encoding 2. The rest of the actions are synthetic ones that indicate
846 # some context handling is required. These each are added to the
847 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
848 # value can be retrieved. Actually only rules from 7 through 18 (which
849 # are the ones where spaces matter) can have 2 added to them.
850 # The others below add just 0 or 1. It might be possible for one
851 # synthetic rule to be added to another, yielding a larger value. This
852 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
853 # names of the middle grouping below, it is impossible for that to occur
854 # for them because they all start with mutually exclusive classes. That
855 # the final rule can't be added to any of the others isn't obvious from
856 # its name, so it is assigned a power of 2 higher than the others can get
857 # to so any addition would preserve all data. (And the code will reach an
858 # assert(0) on debugging builds should this happen.)
862 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
864 LB_CM_ZWJ_foo => 3, # Rule 9
865 LB_SP_foo => 6, # Rule 18
866 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
867 LB_SY_or_IS_then_various => 11, # Rule 25
868 LB_HY_or_BA_then_foo => 13, # Rule 21
869 LB_RI_then_RI => 15, # Rule 30a
871 LB_various_then_PO_or_PR => (1<<5), # Rule 25
874 # Construct the LB pair table. This is based on the rules in
875 # http://www.unicode.org/reports/tr14/, but modified as those rules are
876 # designed for someone taking a string of text and sequentially going
877 # through it to find the break opportunities, whereas, Perl requires
878 # determining if a given random spot is a break opportunity, without
879 # knowing the entire string before it.
881 # The table is constructed in reverse order of the rules, to make the
882 # lower-numbered, higher priority ones override the later ones, as the
883 # algorithm stops at the earliest matching rule
886 my $table_size = @lb_short_enums;
888 # LB31. Break everywhere else
889 for my $i (0 .. $table_size - 1) {
890 for my $j (0 .. $table_size - 1) {
891 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
895 # LB30b Do not break between an emoji base and an emoji modifier.
897 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
898 = $lb_actions{'LB_NOBREAK'};
900 # LB30a Break between two regional indicator symbols if and only if there
901 # are an even number of regional indicators preceding the position of the
903 # sot (RI RI)* RI × RI
904 # [^RI] (RI RI)* RI × RI
905 $lb_table[$lb_enums{'Regional_Indicator'}]
906 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
908 # LB30 Do not break between letters, numbers, or ordinary symbols and
909 # opening or closing parentheses.
910 # (AL | HL | NU) × OP
911 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
912 = $lb_actions{'LB_NOBREAK'};
913 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
914 = $lb_actions{'LB_NOBREAK'};
915 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
916 = $lb_actions{'LB_NOBREAK'};
918 # CP × (AL | HL | NU)
919 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
920 = $lb_actions{'LB_NOBREAK'};
921 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
922 = $lb_actions{'LB_NOBREAK'};
923 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
924 = $lb_actions{'LB_NOBREAK'};
926 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
928 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
929 = $lb_actions{'LB_NOBREAK'};
930 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
931 = $lb_actions{'LB_NOBREAK'};
933 # LB28 Do not break between alphabetics (“at”).
934 # (AL | HL) × (AL | HL)
935 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
936 = $lb_actions{'LB_NOBREAK'};
937 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
938 = $lb_actions{'LB_NOBREAK'};
939 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
940 = $lb_actions{'LB_NOBREAK'};
941 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
942 = $lb_actions{'LB_NOBREAK'};
944 # LB27 Treat a Korean Syllable Block the same as ID.
945 # (JL | JV | JT | H2 | H3) × IN
946 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
947 = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
949 = $lb_actions{'LB_NOBREAK'};
950 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
951 = $lb_actions{'LB_NOBREAK'};
952 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
953 = $lb_actions{'LB_NOBREAK'};
954 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
955 = $lb_actions{'LB_NOBREAK'};
957 # (JL | JV | JT | H2 | H3) × PO
958 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
959 = $lb_actions{'LB_NOBREAK'};
960 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
962 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
963 = $lb_actions{'LB_NOBREAK'};
964 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
965 = $lb_actions{'LB_NOBREAK'};
966 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
967 = $lb_actions{'LB_NOBREAK'};
969 # PR × (JL | JV | JT | H2 | H3)
970 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
971 = $lb_actions{'LB_NOBREAK'};
972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
973 = $lb_actions{'LB_NOBREAK'};
974 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
975 = $lb_actions{'LB_NOBREAK'};
976 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
977 = $lb_actions{'LB_NOBREAK'};
978 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
979 = $lb_actions{'LB_NOBREAK'};
981 # LB26 Do not break a Korean syllable.
982 # JL × (JL | JV | H2 | H3)
983 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
984 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
985 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
986 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
988 # (JV | H2) × (JV | JT)
989 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
991 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
995 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
996 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
998 # LB25 Do not break between the following pairs of classes relevant to
999 # numbers, as tailored by example 7 in
1000 # http://www.unicode.org/reports/tr14/#Examples
1001 # We follow that tailoring because Unicode's test cases expect it
1002 # (PR | PO) × ( OP | HY )? NU
1003 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1004 = $lb_actions{'LB_NOBREAK'};
1005 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1006 = $lb_actions{'LB_NOBREAK'};
1008 # Given that (OP | HY )? is optional, we have to test for it in code.
1009 # We add in the action (instead of overriding) for this, so that in
1010 # the code we can recover the underlying break value.
1011 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1012 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1013 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1014 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1015 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
1016 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1017 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
1018 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1021 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1022 = $lb_actions{'LB_NOBREAK'};
1023 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1024 = $lb_actions{'LB_NOBREAK'};
1026 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1027 # which can be rewritten as:
1028 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
1029 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1032 = $lb_actions{'LB_NOBREAK'};
1033 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1034 = $lb_actions{'LB_NOBREAK'};
1035 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1036 = $lb_actions{'LB_NOBREAK'};
1037 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1038 = $lb_actions{'LB_NOBREAK'};
1040 # Like earlier where we have to test in code, we add in the action so
1041 # that we can recover the underlying values. This is done in rules
1042 # below, as well. The code assumes that we haven't added 2 actions.
1043 # Should a later Unicode release break that assumption, then tests
1044 # should start failing.
1045 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
1046 += $lb_actions{'LB_SY_or_IS_then_various'};
1047 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
1048 += $lb_actions{'LB_SY_or_IS_then_various'};
1049 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
1050 += $lb_actions{'LB_SY_or_IS_then_various'};
1051 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
1052 += $lb_actions{'LB_SY_or_IS_then_various'};
1053 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
1054 += $lb_actions{'LB_SY_or_IS_then_various'};
1055 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
1056 += $lb_actions{'LB_SY_or_IS_then_various'};
1057 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
1058 += $lb_actions{'LB_SY_or_IS_then_various'};
1059 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
1060 += $lb_actions{'LB_SY_or_IS_then_various'};
1061 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
1062 += $lb_actions{'LB_SY_or_IS_then_various'};
1063 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
1064 += $lb_actions{'LB_SY_or_IS_then_various'};
1066 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1067 # which can be rewritten as:
1068 # NU (SY | IS)* (CL | CP)? × (PO | PR)
1069 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1070 = $lb_actions{'LB_NOBREAK'};
1071 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1072 = $lb_actions{'LB_NOBREAK'};
1074 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
1075 += $lb_actions{'LB_various_then_PO_or_PR'};
1076 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
1077 += $lb_actions{'LB_various_then_PO_or_PR'};
1078 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
1079 += $lb_actions{'LB_various_then_PO_or_PR'};
1080 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
1081 += $lb_actions{'LB_various_then_PO_or_PR'};
1083 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
1084 += $lb_actions{'LB_various_then_PO_or_PR'};
1085 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
1086 += $lb_actions{'LB_various_then_PO_or_PR'};
1087 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
1088 += $lb_actions{'LB_various_then_PO_or_PR'};
1089 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
1090 += $lb_actions{'LB_various_then_PO_or_PR'};
1092 # LB24 Do not break between numeric prefix/postfix and letters, or between
1093 # letters and prefix/postfix.
1094 # (PR | PO) × (AL | HL)
1095 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1096 = $lb_actions{'LB_NOBREAK'};
1097 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1098 = $lb_actions{'LB_NOBREAK'};
1099 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1100 = $lb_actions{'LB_NOBREAK'};
1101 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1102 = $lb_actions{'LB_NOBREAK'};
1104 # (AL | HL) × (PR | PO)
1105 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1106 = $lb_actions{'LB_NOBREAK'};
1107 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1108 = $lb_actions{'LB_NOBREAK'};
1109 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1110 = $lb_actions{'LB_NOBREAK'};
1111 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1112 = $lb_actions{'LB_NOBREAK'};
1114 # LB23a Do not break between numeric prefixes and ideographs, or between
1115 # ideographs and numeric postfixes.
1116 # PR × (ID | EB | EM)
1117 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1118 = $lb_actions{'LB_NOBREAK'};
1119 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1120 = $lb_actions{'LB_NOBREAK'};
1121 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1122 = $lb_actions{'LB_NOBREAK'};
1124 # (ID | EB | EM) × PO
1125 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1126 = $lb_actions{'LB_NOBREAK'};
1127 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1128 = $lb_actions{'LB_NOBREAK'};
1129 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1130 = $lb_actions{'LB_NOBREAK'};
1132 # LB23 Do not break between digits and letters
1134 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1135 = $lb_actions{'LB_NOBREAK'};
1136 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1137 = $lb_actions{'LB_NOBREAK'};
1140 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1141 = $lb_actions{'LB_NOBREAK'};
1142 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1143 = $lb_actions{'LB_NOBREAK'};
1145 # LB22 Do not break between two ellipses, or between letters, numbers or
1146 # exclamations and ellipsis.
1148 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1149 = $lb_actions{'LB_NOBREAK'};
1150 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1151 = $lb_actions{'LB_NOBREAK'};
1154 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1155 = $lb_actions{'LB_NOBREAK'};
1157 # (ID | EB | EM) × IN
1158 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1159 = $lb_actions{'LB_NOBREAK'};
1160 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1161 = $lb_actions{'LB_NOBREAK'};
1162 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1163 = $lb_actions{'LB_NOBREAK'};
1166 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1167 = $lb_actions{'LB_NOBREAK'};
1170 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1171 = $lb_actions{'LB_NOBREAK'};
1173 # LB21b Don’t break between Solidus and Hebrew letters.
1175 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1176 = $lb_actions{'LB_NOBREAK'};
1178 # LB21a Don't break after Hebrew + Hyphen.
1180 for my $i (0 .. @lb_table - 1) {
1181 $lb_table[$lb_enums{'Hyphen'}][$i]
1182 += $lb_actions{'LB_HY_or_BA_then_foo'};
1183 $lb_table[$lb_enums{'Break_After'}][$i]
1184 += $lb_actions{'LB_HY_or_BA_then_foo'};
1187 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1188 # spaces, small kana, and other non-starters, or after acute accents.
1193 for my $i (0 .. @lb_table - 1) {
1194 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1195 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1196 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1197 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
1200 # LB20 Break before and after unresolved CB.
1203 # Conditional breaks should be resolved external to the line breaking
1204 # rules. However, the default action is to treat unresolved CB as breaking
1206 for my $i (0 .. @lb_table - 1) {
1207 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1208 = $lb_actions{'LB_BREAKABLE'};
1209 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1210 = $lb_actions{'LB_BREAKABLE'};
1213 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1216 for my $i (0 .. @lb_table - 1) {
1217 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1218 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
1221 # LB18 Break after spaces
1223 for my $i (0 .. @lb_table - 1) {
1224 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
1227 # LB17 Do not break within ‘——’, even with intervening spaces.
1229 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
1230 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1232 # LB16 Do not break between closing punctuation and a nonstarter even with
1233 # intervening spaces.
1234 # (CL | CP) SP* × NS
1235 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
1236 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1237 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
1238 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1241 # LB15 Do not break within ‘”[’, even with intervening spaces.
1243 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
1244 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1246 # LB14 Do not break after ‘[’, even after spaces.
1248 for my $i (0 .. @lb_table - 1) {
1249 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
1250 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1253 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1254 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1260 for my $i (0 .. @lb_table - 1) {
1261 $lb_table[$i][$lb_enums{'Exclamation'}]
1262 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1264 next if $i == $lb_enums{'Numeric'};
1266 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
1267 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1268 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
1269 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1270 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
1271 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1272 $lb_table[$i][$lb_enums{'Break_Symbols'}]
1273 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1276 # LB12a Do not break before NBSP and related characters, except after
1277 # spaces and hyphens.
1279 for my $i (0 .. @lb_table - 1) {
1280 next if $i == $lb_enums{'Space'}
1281 || $i == $lb_enums{'Break_After'}
1282 || $i == $lb_enums{'Hyphen'};
1284 # We don't break, but if a property above has said don't break even
1285 # with space between, don't override that (also in the next few rules)
1286 next if $lb_table[$i][$lb_enums{'Glue'}]
1287 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1288 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
1291 # LB12 Do not break after NBSP and related characters.
1293 for my $i (0 .. @lb_table - 1) {
1294 next if $lb_table[$lb_enums{'Glue'}][$i]
1295 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1296 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
1299 # LB11 Do not break before or after Word joiner and related characters.
1302 for my $i (0 .. @lb_table - 1) {
1303 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
1304 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1306 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
1308 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
1309 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1311 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
1315 # Special case this here to avoid having to do a special case in the code;
1316 # by making this the same as other things with a SP in front of them that
1317 # don't break, we avoid an extra test.
1318 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
1319 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1321 # LB9 and LB10 are done in the same loop
1323 # LB9 Do not break a combining character sequence; treat it as if it has
1324 # the line breaking class of the base character in all of the
1325 # higher-numbered rules. Treat ZWJ as if it were CM
1326 # Treat X (CM|ZWJ)* as if it were X.
1327 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1329 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1330 # case where a CM or ZWJ is the first character on the line or follows SP,
1331 # BK, CR, LF, NL, or ZW.
1332 for my $i (0 .. @lb_table - 1) {
1334 # When the CM or ZWJ is the first in the pair, we don't know without
1335 # looking behind whether the CM or ZWJ is going to attach to an
1336 # earlier character, or not. So have to figure this out at runtime in
1338 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1339 = $lb_actions{'LB_CM_ZWJ_foo'};
1340 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
1342 if ( $i == $lb_enums{'Mandatory_Break'}
1343 || $i == $lb_enums{'EDGE'}
1344 || $i == $lb_enums{'Carriage_Return'}
1345 || $i == $lb_enums{'Line_Feed'}
1346 || $i == $lb_enums{'Next_Line'}
1347 || $i == $lb_enums{'Space'}
1348 || $i == $lb_enums{'ZWSpace'})
1350 # For these classes, a following CM doesn't combine, and should do
1351 # whatever 'Alphabetic' would do.
1352 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1353 = $lb_table[$i][$lb_enums{'Alphabetic'}];
1354 $lb_table[$i][$lb_enums{'ZWJ'}]
1355 = $lb_table[$i][$lb_enums{'Alphabetic'}];
1358 # For these classes, the CM or ZWJ combines, so doesn't break,
1359 # inheriting the type of nobreak from the master character.
1360 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
1361 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1363 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1364 = $lb_actions{'LB_NOBREAK'};
1366 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1367 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1369 $lb_table[$i][$lb_enums{'ZWJ'}]
1370 = $lb_actions{'LB_NOBREAK'};
1375 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1376 # base or emoji modifier. This rule prevents breaks within emoji joiner
1378 # ZWJ × (ID | EB | EM)
1379 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1380 = $lb_actions{'LB_NOBREAK'};
1381 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1382 = $lb_actions{'LB_NOBREAK'};
1383 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1384 = $lb_actions{'LB_NOBREAK'};
1386 # LB8 Break before any character following a zero-width space, even if one
1387 # or more spaces intervene.
1389 for my $i (0 .. @lb_table - 1) {
1390 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
1393 # Because of LB8-10, we need to look at context for "SP x", and this must
1394 # be done in the code. So override the existing rules for that, by adding
1395 # a constant to get new rules that tell the code it needs to look at
1396 # context. By adding this action instead of replacing the existing one,
1397 # we can get back to the original rule if necessary.
1398 for my $i (0 .. @lb_table - 1) {
1399 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
1402 # LB7 Do not break before spaces or zero width space.
1405 for my $i (0 .. @lb_table - 1) {
1406 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1407 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
1410 # LB6 Do not break before hard line breaks.
1411 # × ( BK | CR | LF | NL )
1412 for my $i (0 .. @lb_table - 1) {
1413 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1414 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1415 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1416 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
1419 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1424 for my $i (0 .. @lb_table - 1) {
1425 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1426 = $lb_actions{'LB_BREAKABLE'};
1427 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1428 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
1430 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1431 = $lb_actions{'LB_NOBREAK'};
1433 # LB4 Always break after hard line breaks.
1435 for my $i (0 .. @lb_table - 1) {
1436 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1437 = $lb_actions{'LB_BREAKABLE'};
1440 # LB3 Always break at the end of text.
1442 # LB2 Never break at the start of text.
1444 for my $i (0 .. @lb_table - 1) {
1445 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1446 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
1449 # LB1 Assign a line breaking class to each code point of the input.
1450 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1451 # depending on criteria outside the scope of this algorithm.
1453 # In the absence of such criteria all characters with a specific
1454 # combination of original class and General_Category property value are
1455 # resolved as follows:
1456 # Original Resolved General_Category
1458 # SA CM Only Mn or Mc
1459 # SA AL Any except Mn and Mc
1462 # This is done in mktables, so we never see any of the remapped-from
1465 output_table_common('LB', \%lb_actions,
1466 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
1469 sub output_WB_table() {
1471 # Create and output the enums, #defines, and pair table for use in
1472 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1474 # This uses the same mechanism as the other bounds tables generated by
1475 # this file. The actions that could override a 0 or 1 are added to those
1476 # numbers; the actions that clearly don't depend on the underlying rule
1482 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
1485 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1486 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1487 WB_MB_or_MN_or_SQ_then_NU => 12,
1488 WB_NU_then_MB_or_MN_or_SQ => 14,
1489 WB_RI_then_RI => 16,
1492 # Construct the WB pair table.
1493 # The table is constructed in reverse order of the rules, to make the
1494 # lower-numbered, higher priority ones override the later ones, as the
1495 # algorithm stops at the earliest matching rule
1498 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1500 # Otherwise, break everywhere (including around ideographs).
1502 for my $i (0 .. $table_size - 1) {
1503 for my $j (0 .. $table_size - 1) {
1504 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1508 # Do not break within emoji flag sequences. That is, do not break between
1509 # regional indicator (RI) symbols if there is an odd number of RI
1510 # characters before the break point.
1511 # WB16 [^RI] (RI RI)* RI × RI
1512 # WB15 ^ (RI RI)* RI × RI
1513 $wb_table[$wb_enums{'Regional_Indicator'}]
1514 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1516 # Do not break within emoji modifier sequences.
1517 # WB14 ( E_Base | EBG ) × E_Modifier
1518 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1519 = $wb_actions{'WB_NOBREAK'};
1520 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1521 = $wb_actions{'WB_NOBREAK'};
1523 # Do not break from extenders.
1524 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
1525 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1526 = $wb_actions{'WB_NOBREAK'};
1527 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1528 = $wb_actions{'WB_NOBREAK'};
1529 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1530 = $wb_actions{'WB_NOBREAK'};
1531 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1532 = $wb_actions{'WB_NOBREAK'};
1534 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1536 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1537 = $wb_actions{'WB_NOBREAK'};
1538 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1539 = $wb_actions{'WB_NOBREAK'};
1540 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1541 = $wb_actions{'WB_NOBREAK'};
1542 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1543 = $wb_actions{'WB_NOBREAK'};
1544 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1545 = $wb_actions{'WB_NOBREAK'};
1547 # Do not break between Katakana.
1548 # WB13 Katakana × Katakana
1549 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1550 = $wb_actions{'WB_NOBREAK'};
1552 # Do not break within sequences, such as “3.2” or “3,456.789”.
1553 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
1554 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
1555 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1556 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
1557 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1558 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
1559 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1561 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
1562 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
1563 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1564 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
1565 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1566 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
1567 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1569 # Do not break within sequences of digits, or digits adjacent to letters
1571 # WB10 Numeric × (ALetter | Hebrew_Letter)
1572 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1573 = $wb_actions{'WB_NOBREAK'};
1574 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1575 = $wb_actions{'WB_NOBREAK'};
1577 # WB9 (ALetter | Hebrew_Letter) × Numeric
1578 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1579 = $wb_actions{'WB_NOBREAK'};
1580 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1581 = $wb_actions{'WB_NOBREAK'};
1583 # WB8 Numeric × Numeric
1584 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1585 = $wb_actions{'WB_NOBREAK'};
1587 # Do not break letters across certain punctuation.
1588 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
1589 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1590 += $wb_actions{'WB_DQ_then_HL'};
1592 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
1593 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1594 += $wb_actions{'WB_HL_then_DQ'};
1596 # WB7a Hebrew_Letter × Single_Quote
1597 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1598 = $wb_actions{'WB_NOBREAK'};
1600 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1601 # × (ALetter | Hebrew_Letter)
1602 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
1603 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1604 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
1605 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1606 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
1607 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1608 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
1609 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1610 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
1611 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1612 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
1613 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1615 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1616 # | Single_Quote) (ALetter | Hebrew_Letter)
1617 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
1618 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1619 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
1620 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1621 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
1622 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1623 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
1624 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1625 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
1626 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1627 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1628 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1630 # Do not break between most letters.
1631 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
1632 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1633 = $wb_actions{'WB_NOBREAK'};
1634 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1635 = $wb_actions{'WB_NOBREAK'};
1636 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1637 = $wb_actions{'WB_NOBREAK'};
1638 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1639 = $wb_actions{'WB_NOBREAK'};
1641 # Ignore Format and Extend characters, except after sot, CR, LF, and
1642 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1643 # WB4 X (Extend | Format | ZWJ)* → X
1644 for my $i (0 .. @wb_table - 1) {
1645 $wb_table[$wb_enums{'Extend'}][$i]
1646 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1647 $wb_table[$wb_enums{'Format'}][$i]
1648 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1649 $wb_table[$wb_enums{'ZWJ'}][$i]
1650 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1652 for my $i (0 .. @wb_table - 1) {
1653 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1654 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1655 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
1658 # Implied is that these attach to the character before them, except for
1659 # the characters that mark the end of a region of text. The rules below
1660 # override the ones set up here, for all the characters that need
1662 for my $i (0 .. @wb_table - 1) {
1663 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1664 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1667 # Do not break within emoji zwj sequences.
1668 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1669 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1670 = $wb_actions{'WB_NOBREAK'};
1671 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1672 = $wb_actions{'WB_NOBREAK'};
1674 # Break before and after white space
1675 # WB3b ÷ (Newline | CR | LF)
1676 # WB3a (Newline | CR | LF) ÷
1678 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1679 for my $j (0 .. @wb_table - 1) {
1680 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1681 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1685 # But do not break within white space.
1688 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1689 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1690 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1694 # And do not break horizontal space followed by Extend or Format or ZWJ
1695 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1696 = $wb_actions{'WB_NOBREAK'};
1697 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1698 = $wb_actions{'WB_NOBREAK'};
1699 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1700 = $wb_actions{'WB_NOBREAK'};
1701 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1702 [$wb_enums{'Perl_Tailored_HSpace'}]
1703 = $wb_actions{'WB_hs_then_hs'};
1705 # Break at the start and end of text, unless the text is empty
1708 for my $i (0 .. @wb_table - 1) {
1709 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1710 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
1712 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
1714 output_table_common('WB', \%wb_actions,
1715 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
1718 output_invlist("Latin1", [ 0, 256 ]);
1719 output_invlist("AboveLatin1", [ 256 ]);
1723 # We construct lists for all the POSIX and backslash sequence character
1724 # classes in two forms:
1725 # 1) ones which match only in the ASCII range
1726 # 2) ones which match either in the Latin1 range, or the entire Unicode range
1728 # These get compiled in, and hence affect the memory footprint of every Perl
1729 # program, even those not using Unicode. To minimize the size, currently
1730 # the Latin1 version is generated for the beyond ASCII range except for those
1731 # lists that are quite small for the entire range, such as for \s, which is 22
1732 # UVs long plus 4 UVs (currently) for the header.
1734 # To save even more memory, the ASCII versions could be derived from the
1735 # larger ones at runtime, saving some memory (minus the expense of the machine
1736 # instructions to do so), but these are all small anyway, so their total is
1739 # In the list of properties below that get generated, the L1 prefix is a fake
1740 # property that means just the Latin1 range of the full property (whose name
1741 # has an X prefix instead of L1).
1743 # An initial & means to use the subroutine from this file instead of an
1744 # official inversion list.
1746 for my $charset (get_supported_code_pages()) {
1747 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1749 @a2n = @{get_a2n($charset)};
1751 # Ignore non-alpha in sort
1752 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1772 &NonL1_Perl_Non_Final_Folds
1773 _Perl_Folds_To_Multi_Char
1780 _Perl_WB,EDGE,UNKNOWN
1784 # For the Latin1 properties, we change to use the eXtended version of the
1785 # base property, then go through the result and get rid of everything not
1786 # in Latin1 (above 255). Actually, we retain the element for the range
1787 # that crosses the 255/256 boundary if it is one that matches the
1788 # property. For example, in the Word property, there is a range of code
1789 # points that start at U+00F8 and goes through U+02C1. Instead of
1790 # artificially cutting that off at 256 because 256 is the first code point
1791 # above Latin1, we let the range go to its natural ending. That gives us
1792 # extra information with no added space taken. But if the range that
1793 # crosses the boundary is one that doesn't match the property, we don't
1794 # start a new range above 255, as that could be construed as going to
1795 # infinity. For example, the Upper property doesn't include the character
1796 # at 255, but does include the one at 256. We don't include the 256 one.
1797 my $prop_name = $prop;
1798 my $is_local_sub = $prop_name =~ s/^&//;
1799 my $extra_enums = "";
1800 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
1801 my $lookup_prop = $prop_name;
1802 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1803 or $lookup_prop =~ s/^L1//);
1805 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
1806 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
1812 my $maps_to_code_point;
1814 if ($is_local_sub) {
1815 @invlist = eval $lookup_prop;
1819 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
1822 # If we couldn't find a non-empty inversion list, see if it is
1823 # instead an inversion map
1824 my ($list_ref, $map_ref, $format, $default)
1825 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
1827 # An empty return here could mean an unknown property, or
1828 # merely that the original inversion list is empty. Call
1829 # in scalar context to differentiate
1830 my $count = prop_invlist($lookup_prop,
1831 '_perl_core_internal_ok');
1832 die "Could not find inversion list for '$lookup_prop'"
1833 unless defined $count;
1836 @invlist = @$list_ref;
1837 @invmap = @$map_ref;
1838 $map_format = $format;
1839 $map_default = $default;
1840 $maps_to_code_point = $map_format =~ /x/;
1841 $to_adjust = $map_format =~ /a/;
1847 # Short-circuit an empty inversion list.
1849 output_invlist($prop_name, \@invlist, $charset);
1853 # Re-order the Unicode code points to native ones for this platform.
1854 # This is only needed for code points below 256, because native code
1855 # points are only in that range. For inversion maps of properties
1856 # where the mappings are adjusted (format =~ /a/), this reordering
1857 # could mess up the adjustment pattern that was in the input, so that
1858 # has to be dealt with.
1860 # And inversion maps that map to code points need to eventually have
1861 # all those code points remapped to native, and it's better to do that
1862 # here, going through the whole list not just those below 256. This
1863 # is because some inversion maps have adjustments (format =~ /a/)
1864 # which may be affected by the reordering. This code needs to be done
1865 # both for when we are translating the inversion lists for < 256, and
1866 # for the inversion maps for everything. By doing both in this loop,
1867 # we can share that code.
1869 # So, we go through everything for an inversion map to code points;
1870 # otherwise, we can skip any remapping at all if we are going to
1871 # output only the above-Latin1 values, or if the range spans the whole
1872 # of 0..256, as the remap will also include all of 0..256 (256 not
1873 # 255 because a re-ordering could cause 256 to need to be in the same
1875 if ((@invmap && $maps_to_code_point)
1876 || (! $nonl1_only || ($invlist[0] < 256
1877 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
1880 if (! @invmap) { # Straight inversion list
1881 # Look at all the ranges that start before 257.
1884 last if $invlist[0] > 256;
1885 my $upper = @invlist > 1
1886 ? $invlist[1] - 1 # In range
1888 # To infinity. You may want to stop much much
1889 # earlier; going this high may expose perl
1890 # deficiencies with very large numbers.
1891 : $Unicode::UCD::MAX_CP;
1892 for my $j ($invlist[0] .. $upper) {
1893 push @latin1_list, a2n($j);
1896 shift @invlist; # Shift off the range that's in the list
1897 shift @invlist; # Shift off the range not in the list
1900 # Here @invlist contains all the ranges in the original that start
1901 # at code points above 256, and @latin1_list contains all the
1902 # native code points for ranges that start with a Unicode code
1903 # point below 257. We sort the latter and convert it to inversion
1904 # list format. Then simply prepend it to the list of the higher
1906 @latin1_list = sort { $a <=> $b } @latin1_list;
1907 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
1908 unshift @invlist, @latin1_list;
1910 else { # Is an inversion map
1912 # This is a similar procedure to that for a plain inversion list, but has
1913 # multiple buckets. A plain inversion list just has two
1914 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1915 # pretty much can ignore the 2nd bucket, as it is completely
1916 # defined by the 1st. But here, what we do is create buckets
1917 # which contain the code points that map to each, translated
1918 # to native and turned into an inversion list. Thus each
1919 # bucket is an inversion list of native code points that map
1920 # to it or don't map to it. We use these to create an
1921 # inversion map for the whole property.
1923 # As mentioned earlier, we use this procedure to not just
1924 # remap the inversion list to native values, but also the maps
1925 # of code points to native ones. In the latter case we have
1926 # to look at the whole of the inversion map (or at least to
1927 # above Unicode; as the maps of code points above that should
1928 # all be to the default).
1929 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1931 my %mapped_lists; # A hash whose keys are the buckets.
1933 last if $invlist[0] > $upper_limit;
1935 # This shouldn't actually happen, as prop_invmap() returns
1936 # an extra element at the end that is beyond $upper_limit
1937 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1941 # A hash key can't be a ref (we are only expecting arrays
1942 # of scalars here), so convert any such to a string that
1943 # will be converted back later (using a vertical tab as
1944 # the separator). Even if the mapping is to code points,
1945 # we don't translate to native here because the code
1946 # output_map() calls to output these arrays assumes the
1947 # input is Unicode, not native.
1948 if (ref $invmap[0]) {
1949 $bucket = join "\cK", @{$invmap[0]};
1951 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1953 # Do convert to native for maps to single code points.
1954 # There are some properties that have a few outlier
1955 # maps that aren't code points, so the above test skips those.
1957 $bucket = a2n($invmap[0]);
1959 $bucket = $invmap[0];
1962 # We now have the bucket that all code points in the range
1963 # map to, though possibly they need to be adjusted. Go
1964 # through the range and put each translated code point in
1965 # it into its bucket.
1966 my $base_map = $invmap[0];
1967 for my $j ($invlist[0] .. $invlist[1] - 1) {
1969 # The 1st code point doesn't need adjusting
1972 # Skip any non-numeric maps: these are outliers
1973 # that aren't code points.
1974 && $base_map =~ $numeric_re
1976 # 'ne' because the default can be a string
1977 && $base_map ne $map_default)
1979 # We adjust by incrementing both the bucket and
1980 # the map. For code point maps, translate to native.
1983 $bucket = ($maps_to_code_point)
1988 # Add the native code point to the bucket for the current map.
1990 push @{$mapped_lists{$bucket}}, a2n($j);
1991 } # End of loop through all code points in the range
1993 # Get ready for the next range
1996 } # End of loop through all ranges in the map.
1998 # Here, @invlist and @invmap retain all the ranges from the
1999 # originals that start with code points above $upper_limit.
2000 # Each bucket in %mapped_lists contains all the code points
2001 # that map to that bucket. If the bucket is for a map to a
2002 # single code point, the bucket has
2003 # been converted to native. If something else (including
2004 # multiple code points), no conversion is done.
2006 # Now we recreate the inversion map into %xlated, but this
2007 # time for the native character set.
2009 foreach my $bucket (keys %mapped_lists) {
2011 # Sort and convert this bucket to an inversion list. The
2012 # result will be that ranges that start with even-numbered
2013 # indexes will be for code points that map to this bucket;
2014 # odd ones map to some other bucket, and are discarded
2016 @{$mapped_lists{$bucket}}
2017 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2018 @{$mapped_lists{$bucket}}
2019 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2021 # Add each even-numbered range in the bucket to %xlated;
2022 # so that the keys of %xlated become the range start code
2023 # points, and the values are their corresponding maps.
2024 while (@{$mapped_lists{$bucket}}) {
2025 my $range_start = $mapped_lists{$bucket}->[0];
2026 if ($bucket =~ /\cK/) {
2027 @{$xlated{$range_start}} = split /\cK/, $bucket;
2030 $xlated{$range_start} = $bucket;
2032 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2033 shift @{$mapped_lists{$bucket}}; # Get ready for next
2036 } # End of loop through all the buckets.
2038 # Here %xlated's keys are the range starts of all the code
2039 # points in the inversion map. Construct an inversion list from them.
2041 my @new_invlist = sort { $a <=> $b } keys %xlated;
2043 # If the list is adjusted, we want to munge this list so that
2044 # we only have one entry for where consecutive code points map
2045 # to consecutive values. We just skip the subsequent entries
2046 # where this is the case.
2049 for my $i (0 .. @new_invlist - 1) {
2051 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2052 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2053 && $xlated{$new_invlist[$i]} =~ $numeric_re
2054 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2055 push @temp, $new_invlist[$i];
2057 @new_invlist = @temp;
2060 # The inversion map comes from %xlated's values. We can
2061 # unshift each onto the front of the untouched portion, in
2062 # reverse order of the portion we did process.
2063 foreach my $start (reverse @new_invlist) {
2064 unshift @invmap, $xlated{$start};
2067 # Finally prepend the inversion list we have just constructed to the
2068 # one that contains anything we didn't process.
2069 unshift @invlist, @new_invlist;
2073 # prop_invmap() returns an extra final entry, which we can now discard.
2081 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
2082 for my $i (0 .. @invlist - 1 - 1) {
2083 if ($invlist[$i] > 255) {
2085 # In an inversion list, even-numbered elements give the code
2086 # points that begin ranges that match the property;
2087 # odd-numbered give ones that begin ranges that don't match.
2088 # If $i is odd, we are at the first code point above 255 that
2089 # doesn't match, which means the range it is ending does
2090 # match, and crosses the 255/256 boundary. We want to include
2091 # this ending point, so increment $i, so the splice below
2092 # includes it. Conversely, if $i is even, it is the first
2093 # code point above 255 that matches, which means there was no
2094 # matching range that crossed the boundary, and we don't want
2095 # to include this code point, so splice before it.
2096 $i++ if $i % 2 != 0;
2098 # Remove everything past this.
2099 splice @invlist, $i;
2100 splice @invmap, $i if @invmap;
2105 elsif ($nonl1_only) {
2106 my $found_nonl1 = 0;
2107 for my $i (0 .. @invlist - 1 - 1) {
2108 next if $invlist[$i] < 256;
2110 # Here, we have the first element in the array that indicates an
2111 # element above Latin1. Get rid of all previous ones.
2112 splice @invlist, 0, $i;
2113 splice @invmap, 0, $i if @invmap;
2115 # If this one's index is not divisible by 2, it means that this
2116 # element is inverting away from being in the list, which means
2117 # all code points from 256 to this one are in this list (or
2118 # map to the default for inversion maps)
2120 unshift @invlist, 256;
2121 unshift @invmap, $map_default if @invmap;
2126 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
2129 output_invlist($prop_name, \@invlist, $charset);
2130 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
2133 print $out_fh "\n" . get_conditional_compile_line_end();
# Final stage of the script: select the conditional-compilation section for
# the boundary pair tables, collect the list of files the generated header
# depends on, then close out the output file.
#
# NOTE(review): switch_pound_if() is defined elsewhere in this file;
# presumably it ends any currently open #if section and starts one named
# 'Boundary_pair_tables' that is guarded by PERL_IN_REGEXEC_C -- confirm.
2136 switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
# Path of the file that lists mktables' own inputs; it is read below to
# extend @sources.
2144 my $sources_list = "lib/unicore/mktables.lst";
# @sources starts with this script itself ($0) followed by fixed
# repository-relative paths.
2145 my @sources = ($0, qw(lib/unicore/mktables
2147 regen/charset_translations.pl
2150 # Depend on mktables’ own sources. It’s a shorter list of files than
2151 # those that Unicode::UCD uses.
# Three-argument open with a lexical handle; the failure case is handled
# first.
2152 if (! open my $mktables_list, '<', $sources_list) {
2154 # This should force a rebuild once $sources_list exists
# The list file could not be opened, so record the list file itself as a
# dependency instead of its contents.
2155 push @sources, $sources_list;
# The list file was opened: read it line by line via $_.
2158 while(<$mktables_list>) {
# Keep only lines whose first character is not '#', prefixing each with the
# lib/unicore/ directory so the path is relative to the repository root.
2161 push @sources, "lib/unicore/$_" if /^[^#]/;
# Hand the output handle and the dependency list to the regen helper.
# NOTE(review): read_only_bottom_close_and_rename() is not defined in the
# visible code (presumably it comes from regen/regen_lib.pl); judging by its
# name it writes the read-only trailer, closes the handle and renames the
# file into place -- confirm against the helper.
2166 read_only_bottom_close_and_rename($out_fh, \@sources);