5 use Unicode::UCD qw(prop_aliases
9 prop_invmap search_invlist
12 require './regen/regen_lib.pl';
13 require './regen/charset_translations.pl';
15 # This program outputs charclass_invlists.h, which contains various inversion
16 # lists in the form of C arrays that are to be used as-is for inversion lists.
17 # Thus, the lists it contains are essentially pre-compiled, and need only a
18 # light-weight fast wrapper to make them usable at run-time.
20 # As such, this code knows about the internal structure of these lists, and
21 # any change made to that has to be done here as well. A random number stored
22 # in the headers is used to minimize the possibility of things getting
23 # out-of-sync, or the wrong data structure being passed. Currently that
26 # charclass_invlists.h now also has a partial implementation of inversion
27 # maps; enough to generate tables for the line break properties, such as GCB
29 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
32 my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
34 # Matches valid C language enum names: begins with ASCII alphabetic, then any
36 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
38 my $out_fh = open_new('charclass_invlists.h', '>',
39 {style => '*', by => $0,
40 from => "Unicode::UCD"});
42 my $in_file_pound_if = 0;
44 my $max_hdr_len = 3; # In headings, how wide a name is allowed?
46 print $out_fh "/* See the generating file for comments */\n\n";
48 # The symbols generated by this program are all currently defined only in a
49 # single dot c each. The code knows where most of them go, but this hash
50 # gives overrides for the exceptions to the typical place
51 my %exceptions_to_where_to_define =
52 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
53 AboveLatin1 => 'PERL_IN_REGCOMP_C',
54 Latin1 => 'PERL_IN_REGCOMP_C',
55 UpperLatin1 => 'PERL_IN_REGCOMP_C',
56 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
57 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
58 _Perl_IDCont => 'PERL_IN_UTF8_C',
59 _Perl_IDStart => 'PERL_IN_UTF8_C',
60 Currency_Symbol => 'PERL_IN_LOCALE_C',
62 my %where_to_define_enums = ();
66 my %gcb_abbreviations;
77 # Returns non-duplicated input values. From "Perl Best Practices:
78 # Encapsulated Cleverness". p. 455 in first edition.
81 return grep { ! $seen{$_}++ } @_;
87 # Returns the input Unicode code point translated to native.
89 return $cp if $cp !~ $numeric_re || $cp > 255;
93 sub end_file_pound_if {
94 if ($in_file_pound_if) {
95 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
96 $in_file_pound_if = 0;
100 sub switch_pound_if ($$) {
102 my $new_pound_if = shift;
103 my @new_pound_if = ref ($new_pound_if)
107 # Switch to new #if given by the 2nd argument. If there is an override
108 # for this, it instead switches to that. The 1st argument is the
109 # static's name, used to look up the overrides
111 if (exists $exceptions_to_where_to_define{$name}) {
112 @new_pound_if = $exceptions_to_where_to_define{$name};
114 $new_pound_if = join "", @new_pound_if;
116 # Exit current #if if the new one is different from the old
117 if ( $in_file_pound_if
118 && $in_file_pound_if !~ /$new_pound_if/)
123 # Enter new #if, if not already in it.
124 if (! $in_file_pound_if) {
125 foreach my $element (@new_pound_if) {
126 $element = "defined($element)";
128 $in_file_pound_if = join " || ", @new_pound_if;
129 print $out_fh "\n#if $in_file_pound_if\n";
133 sub output_invlist ($$;$) {
135 my $invlist = shift; # Reference to inversion list array
136 my $charset = shift // ""; # name of character set for comment
138 die "No inversion list for $name" unless defined $invlist
139 && ref $invlist eq 'ARRAY';
141 # Output the inversion list $invlist using the name $name for it.
142 # It is output in the exact internal form for inversion lists.
144 # Is the last element of the header 0, or 1 ?
146 if (@$invlist && $invlist->[0] != 0) {
147 unshift @$invlist, 0;
150 my $count = @$invlist;
152 switch_pound_if ($name, 'PERL_IN_PERL_C');
154 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
155 print $out_fh " /* for $charset */" if $charset;
158 print $out_fh "\t$count,\t/* Number of elements */\n";
159 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
160 print $out_fh "\t", $zero_or_one,
161 ",\t/* 0 if the list starts at 0;",
162 "\n\t\t 1 if it starts at the element beyond 0 */\n";
164 # The main body are the UVs passed in to this routine. Do the final
166 for my $i (0 .. @$invlist - 1) {
167 printf $out_fh "\t0x%X", $invlist->[$i];
168 print $out_fh "," if $i < @$invlist - 1;
172 print $out_fh "};\n";
175 sub output_invmap ($$$$$$$) {
177 my $invmap = shift; # Reference to inversion map array
178 my $prop_name = shift;
179 my $input_format = shift; # The inversion map's format
180 my $default = shift; # The property value for code points who
181 # otherwise don't have a value specified.
182 my $extra_enums = shift; # comma-separated list of our additions to the
183 # property's standard possible values
184 my $charset = shift // ""; # name of character set for comment
186 # Output the inversion map $invmap for property $prop_name, but use $name
187 # as the actual data structure's name.
189 my $count = @$invmap;
192 my $declaration_type;
196 if ($input_format =~ / ^ s l? $ /x) {
197 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
198 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
201 # Find all the possible input values. These become the enum names
202 # that comprise the inversion map. For inputs that don't have sub
203 # lists, we can just get the unique values. Otherwise, we have to
204 # expand the sublists first.
205 if ($input_format ne 'sl') {
206 @input_enums = sort(uniques(@$invmap));
209 foreach my $element (@$invmap) {
211 push @input_enums, @$element;
214 push @input_enums, $element;
217 @input_enums = sort(uniques(@input_enums));
220 # The internal enums come last, and in the order specified.
221 my @enums = @input_enums;
223 if ($extra_enums ne "") {
224 @extras = split /,/, $extra_enums;
226 # Don't add if already there.
227 foreach my $this_extra (@extras) {
228 next if grep { $_ eq $this_extra } @enums;
230 push @enums, $this_extra;
234 # Assign a value to each element of the enum type we are creating.
235 # The default value always gets 0; the others are arbitrarily
238 my $canonical_default = prop_value_aliases($prop_name, $default);
239 $default = $canonical_default if defined $canonical_default;
240 $enums{$default} = $enum_val++;
242 for my $enum (@enums) {
243 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
246 # Calculate the data for the special tables output for these properties.
247 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
249 # The data includes the hashes %gcb_enums, %lb_enums, etc.
250 # Similarly we calculate column headings for the tables.
252 # We use string evals to allow the same code to work on
254 my $type = lc $prop_name;
256 my $placeholder = "a";
258 # Skip if we've already done this code, which populated
260 if (eval "! \%${type}_enums") {
262 # For each enum in the type ...
263 foreach my $enum (sort keys %enums) {
264 my $value = $enums{$enum};
266 my $abbreviated_from;
268 # Special case this wb property value to make the
270 if ($enum eq 'Perl_Tailored_HSpace') {
272 $abbreviated_from = $enum;
276 # Use the official short name, if found.
277 ($short) = prop_value_aliases($type, $enum);
279 if (! defined $short) {
281 # But if there is no official name, use the name
282 # that came from the data (if any). Otherwise,
283 # the name had to come from the extras list.
284 # There are two types of values in that list.
286 # First are those enums that are not part of the
287 # property, but are defined by this code. By
288 # convention these have all-caps names of at least
289 # 4 characters. We use the lowercased name for
292 # Second are enums that are needed to get
293 # regexec.c to compile, but don't exist in all
294 # Unicode releases. To get here, we must be
295 # compiling an earlier Unicode release that
296 # doesn't have that enum, so just use a unique
297 # anonymous name for it.
298 if (grep { $_ eq $enum } @input_enums) {
301 elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
302 $short = $placeholder++;
310 # If our short name is too long, or we already
311 # know that the name is an abbreviation, truncate
312 # to make sure it's short enough, and remember
313 # that we did this so we can later add a comment in the
315 if ( $abbreviated_from
316 || length $short > $max_hdr_len)
318 $short = substr($short, 0, $max_hdr_len);
319 $abbreviated_from = $enum
320 unless $abbreviated_from;
321 # If the name we are to display conflicts, try
324 \$${type}_abbreviations{$short}")
328 # The increment operator on strings doesn't work
329 # on those containing an '_', so just use the
331 my @short = split '_', $short;
333 $short = join "_", @short;
336 eval "\$${type}_abbreviations{$short} = '$enum'";
340 # Remember the mapping from the property value
341 # (enum) name to its value.
342 eval "\$${type}_enums{$enum} = $value";
345 # Remember the inverse mapping to the short name
346 # so that we can properly label the generated
347 # table's rows and columns
348 eval "\$${type}_short_enums[$value] = '$short'";
354 # Inversion map stuff is used only by regexec, unless it is in the
355 # enum exception list
356 my $where = (exists $where_to_define_enums{$name})
357 ? $where_to_define_enums{$name}
358 : 'PERL_IN_REGEXEC_C';
359 switch_pound_if($name, $where);
361 # The short names tend to be two lower case letters, but it looks
362 # better for those if they are upper. XXX
363 $short_name = uc($short_name) if length($short_name) < 3
364 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
365 $name_prefix = "${short_name}_";
368 #print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
370 if ($input_format eq 'sl') {
372 "\n/* Negative enum values indicate the need to use an auxiliary"
373 . " table\n * consisting of the list of enums this one expands to."
374 . " The absolute\n * values of the negative enums are indices into"
375 . " a table of the auxiliary\n * tables' addresses */";
378 # Start the enum definition for this map
379 print $out_fh "\ntypedef enum {\n";
381 foreach my $enum (keys %enums) {
382 $enum_list[$enums{$enum}] = $enum;
384 foreach my $i (0 .. @enum_list - 1) {
385 print $out_fh ",\n" if $i > 0;
387 my $name = $enum_list[$i];
388 print $out_fh "\t${name_prefix}$name = $i";
391 # For an 'sl' property, we need extra enums, because some of the
392 # elements are lists. Each such distinct list is placed in its own
393 # auxiliary map table. Here, we go through the inversion map, and for
394 # each distinct list found, create an enum value for it, numbered -1,
397 my $aux_table_prefix = "AUX_TABLE_";
398 if ($input_format eq 'sl') {
399 foreach my $element (@$invmap) {
401 # A regular scalar is not one of the lists we're looking for
403 next unless ref $element;
405 my $joined = join ",", sort @$element;
406 my $already_found = exists $multiples{$joined};
409 if ($already_found) { # Use any existing one
410 $i = $multiples{$joined};
412 else { # Otherwise increment to get a new table number
413 $i = keys(%multiples) + 1;
414 $multiples{$joined} = $i;
417 # This changes the inversion map for this entry to not be the
419 $element = "use_$aux_table_prefix$i";
421 # And add to the enum values
422 if (! $already_found) {
423 print $out_fh ",\n\t${name_prefix}$element = -$i";
429 $declaration_type = "${name_prefix}enum";
430 print $out_fh "} $declaration_type;\n";
431 # Finished with the enum defintion.
433 $output_format = "${name_prefix}%s";
435 # If there are auxiliary tables, output them.
438 print $out_fh "\n#define HAS_${name_prefix}AUX_TABLES\n";
440 # Invert keys and values
442 while (my ($key, $value) = each %multiples) {
443 $inverted_mults{$value} = $key;
446 # Output them in sorted order
447 my @sorted_table_list = sort { $a <=> $b } keys %inverted_mults;
449 # Keep track of how big each aux table is
452 # Output each aux table.
453 foreach my $table_number (@sorted_table_list) {
454 my $table = $inverted_mults{$table_number};
455 print $out_fh "\nstatic const $declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
457 # Earlier, we joined the elements of this table together with a comma
458 my @elements = split ",", $table;
460 $aux_counts[$table_number] = scalar @elements;
461 for my $i (0 .. @elements - 1) {
462 print $out_fh ",\n" if $i > 0;
463 print $out_fh "\t${name_prefix}$elements[$i]";
465 print $out_fh "\n};\n";
468 # Output the table that is indexed by the absolute value of the
469 # aux table enum and contains pointers to the tables output just
471 print $out_fh "\nstatic const $declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
472 print $out_fh "\tNULL,\t/* Placeholder */\n";
473 for my $i (1 .. @sorted_table_list) {
474 print $out_fh ",\n" if $i > 1;
475 print $out_fh "\t$name_prefix$aux_table_prefix$i";
477 print $out_fh "\n};\n";
480 "\n/* Parallel table to the above, giving the number of elements"
481 . " in each table\n * pointed to */\n";
482 print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
483 print $out_fh "\t0,\t/* Placeholder */\n";
484 for my $i (1 .. @sorted_table_list) {
485 print $out_fh ",\n" if $i > 1;
486 print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
488 print $out_fh "\n};\n";
489 } # End of outputting the auxiliary and associated tables
491 # The scx property used in regexec.c needs a specialized table which
492 # is most convenient to output here, while the data structures set up
493 # above are still extant. This table contains the code point that is
494 # the zero digit of each script, indexed by script enum value.
495 if (lc $short_name eq 'scx') {
496 my @decimals_invlist = prop_invlist("Numeric_Type=Decimal");
499 # Find all the decimal digits. The 0 of each range is always the
500 # 0th element, except in some early Unicode releases, so check for
502 for (my $i = 0; $i < @decimals_invlist; $i += 2) {
503 my $code_point = $decimals_invlist[$i];
504 next if chr($code_point) !~ /\p{Nv=0}/;
506 # Turn the scripts this zero is in into a list.
507 my @scripts = split ",",
508 charprop($code_point, "_Perl_SCX", '_perl_core_internal_ok');
509 $code_point = sprintf("0x%x", $code_point);
511 foreach my $script (@scripts) {
512 if (! exists $script_zeros{$script}) {
513 $script_zeros{$script} = $code_point;
515 elsif (ref $script_zeros{$script}) {
516 push $script_zeros{$script}->@*, $code_point;
518 else { # Turn into a list if this is the 2nd zero of the
520 my $existing = $script_zeros{$script};
521 undef $script_zeros{$script};
522 push $script_zeros{$script}->@*, $existing, $code_point;
527 # @script_zeros contains the zero, sorted by the script's enum
530 foreach my $script (keys %script_zeros) {
531 my $enum_value = $enums{$script};
532 $script_zeros[$enum_value] = $script_zeros{$script};
536 "\n/* This table, indexed by the script enum, gives the zero"
537 . " code point for that\n * script; 0 if the script has multiple"
538 . " digit sequences. Scripts without a\n * digit sequence use"
539 . " ASCII [0-9], hence are marked '0' */\n";
540 print $out_fh "static const UV script_zeros[] = {\n";
541 for my $i (0 .. @script_zeros - 1) {
542 my $code_point = $script_zeros[$i];
543 if (defined $code_point) {
544 $code_point = " 0" if ref $code_point;
545 print $out_fh "\t$code_point";
547 elsif (lc $enum_list[$i] eq 'inherited') {
548 print $out_fh "\t 0";
550 else { # The only digits a script without its own set accepts
552 print $out_fh "\t'0'";
554 print $out_fh "," if $i < @script_zeros - 1;
555 print $out_fh "\t/* $enum_list[$i] */";
558 print $out_fh "};\n";
559 } # End of special handling of scx
562 die "'$input_format' invmap() format for '$prop_name' unimplemented";
565 die "No inversion map for $prop_name" unless defined $invmap
566 && ref $invmap eq 'ARRAY'
569 # Now output the inversion map proper
570 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
571 print $out_fh " /* for $charset */" if $charset;
574 # The main body are the scalars passed in to this routine.
575 for my $i (0 .. $count - 1) {
576 my $element = $invmap->[$i];
577 my $full_element_name = prop_value_aliases($prop_name, $element);
578 $element = $full_element_name if defined $full_element_name;
579 $element = $name_prefix . $element;
580 print $out_fh "\t$element";
581 print $out_fh "," if $i < $count - 1;
584 print $out_fh "};\n";
587 sub mk_invlist_from_sorted_cp_list {
589 # Returns an inversion list constructed from the sorted input array of
592 my $list_ref = shift;
594 return unless @$list_ref;
596 # Initialize to just the first element
597 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
599 # For each succeeding element, if it extends the previous range, adjust
600 # up, otherwise add it.
601 for my $i (1 .. @$list_ref - 1) {
602 if ($invlist[-1] == $list_ref->[$i]) {
606 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
612 # Read in the Case Folding rules, and construct arrays of code points for the
613 # properties we need.
614 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
615 die "Could not find inversion map for Case_Folding" unless defined $format;
616 die "Incorrect format '$format' for Case_Folding inversion map"
617 unless $format eq 'al'
619 my @has_multi_char_fold;
620 my @is_non_final_fold;
622 for my $i (0 .. @$folds_ref - 1) {
623 next unless ref $folds_ref->[$i]; # Skip single-char folds
624 push @has_multi_char_fold, $cp_ref->[$i];
626 # Add to the non-finals list each code point that is in a non-final
628 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
629 push @is_non_final_fold, $folds_ref->[$i][$j]
630 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
634 sub _Perl_Non_Final_Folds {
635 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
636 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
639 sub prop_name_for_cmp ($) { # Sort helper
642 # Returns the input lowercased, with non-alphas removed, as well as
643 # everything starting with a comma
646 $name =~ s/[[:^alpha:]]//g;
651 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
654 sub output_table_common {
656 # Common subroutine to actually output the generated rules table.
659 $table_value_defines_ref,
662 $abbreviations_ref) = @_;
663 my $size = @$table_ref;
665 # Output the #define list, sorted by numeric value
666 if ($table_value_defines_ref) {
667 my $max_name_length = 0;
670 # Put in order, and at the same time find the longest name
671 while (my ($enum, $value) = each %$table_value_defines_ref) {
672 $defines[$value] = $enum;
674 my $length = length $enum;
675 $max_name_length = $length if $length > $max_name_length;
680 # Output, so that the values are vertically aligned in a column after
682 foreach my $i (0 .. @defines - 1) {
683 next unless defined $defines[$i];
684 printf $out_fh "#define %-*s %2d\n",
691 my $column_width = 2; # We currently allow 2 digits for the number
693 # If the maximum value in the table is 1, it can be a bool. (Being above
694 # a U8 is not currently handled
696 for my $i (0 .. $size - 1) {
697 for my $j (0 .. $size - 1) {
698 next if $max_element >= $table_ref->[$i][$j];
699 $max_element = $table_ref->[$i][$j];
702 die "Need wider table column width given '$max_element"
703 if length $max_element > $column_width;
705 my $table_type = ($max_element == 1)
709 # If a name is longer than the width set aside for a column, its column
710 # needs to have increased spacing so that the name doesn't get truncated
711 # nor run into an adjacent column
714 # If we are being compiled on a Unicode version earlier than that which
715 # this file was designed for, it may be that some of the property values
716 # aren't in the current release, and so would be undefined if we didn't
717 # define them ourselves. Earlier code has done this, making them
718 # lowercase characters of length one. We look to see if any exist, so
719 # that we can add an annotation to the output table
720 my $has_placeholder = 0;
722 for my $i (0 .. $size - 1) {
723 no warnings 'numeric';
724 $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
725 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
728 print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
730 # Calculate the column heading line
731 my $header_line = "/* "
732 . (" " x $max_hdr_len) # We let the row heading meld to
733 # the '*/' for those that are at
735 . " " x 3; # Space for '*/ '
737 for my $i (0 .. $size - 1) {
738 $header_line .= sprintf "%s%*s",
740 $column_width + 1, # 1 for the ','
743 $header_line .= " */\n";
745 # If we have annotations, output it now.
746 if ($has_placeholder || scalar %$abbreviations_ref) {
748 foreach my $abbr (sort keys %$abbreviations_ref) {
749 $text .= "; " if $text;
750 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
752 if ($has_placeholder) {
753 $text .= "; other " if $text;
754 $text .= "lowercase names are placeholders for"
755 . " property values not defined until a later Unicode"
756 . " release, so are irrelevant in this one, as they are"
757 . " not assigned to any code points";
760 my $indent = " " x 3;
761 $text = $indent . "/* $text */";
763 # Wrap the text so that it is no wider than the table, which the
765 my $output_width = length $header_line;
766 while (length $text > $output_width) {
767 my $cur_line = substr($text, 0, $output_width);
769 # Find the first blank back from the right end to wrap at.
770 for (my $i = $output_width -1; $i > 0; $i--) {
771 if (substr($text, $i, 1) eq " ") {
772 print $out_fh substr($text, 0, $i), "\n";
774 # Set so will look at just the remaining tail (which will
775 # be indented and have a '*' after the indent
776 $text = $indent . " * " . substr($text, $i + 1);
783 print $out_fh $text, "\n" if $text;
786 # We calculated the header line earlier just to get its width so that we
787 # could make sure the annotations fit into that.
788 print $out_fh $header_line;
790 # Now output the bulk of the table.
791 for my $i (0 .. $size - 1) {
793 # First the row heading.
794 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
795 print $out_fh "{"; # Then the brace for this row
798 for my $j (0 .. $size -1) {
799 print $out_fh $spacers[$j];
800 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
801 print $out_fh "," if $j < $size - 1;
804 print $out_fh "," if $i < $size - 1;
808 print $out_fh "};\n";
811 sub output_GCB_table() {
813 # Create and output the pair table for use in determining Grapheme Cluster
814 # Breaks, given in http://www.unicode.org/reports/tr29/.
818 GCB_RI_then_RI => 2, # Rules 12 and 13
819 GCB_EX_then_EM => 3, # Rule 10
822 # The table is constructed in reverse order of the rules, to make the
823 # lower-numbered, higher priority ones override the later ones, as the
824 # algorithm stops at the earliest matching rule
827 my $table_size = @gcb_short_enums;
829 # Otherwise, break everywhere.
831 for my $i (0 .. $table_size - 1) {
832 for my $j (0 .. $table_size - 1) {
833 $gcb_table[$i][$j] = 1;
837 # Do not break within emoji flag sequences. That is, do not break between
838 # regional indicator (RI) symbols if there is an odd number of RI
839 # characters before the break point. Must be resolved in runtime code.
841 # GB12 sot (RI RI)* RI × RI
842 # GB13 [^RI] (RI RI)* RI × RI
843 $gcb_table[$gcb_enums{'Regional_Indicator'}]
844 [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
846 # Do not break within emoji modifier sequences or emoji zwj sequences.
847 # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
848 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
849 $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
851 # GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier
852 $gcb_table[$gcb_enums{'Extend'}][$gcb_enums{'E_Modifier'}]
853 = $gcb_actions{GCB_EX_then_EM};
854 $gcb_table[$gcb_enums{'E_Base'}][$gcb_enums{'E_Modifier'}] = 0;
855 $gcb_table[$gcb_enums{'E_Base_GAZ'}][$gcb_enums{'E_Modifier'}] = 0;
857 # Do not break before extending characters or ZWJ.
858 # Do not break before SpacingMarks, or after Prepend characters.
861 # GB9 × ( Extend | ZWJ )
862 for my $i (0 .. @gcb_table - 1) {
863 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
864 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
865 $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
866 $gcb_table[$i][$gcb_enums{'ZWJ'}] = 0;
869 # Do not break Hangul syllable sequences.
871 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
872 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
874 # GB7 ( LV | V ) × ( V | T )
875 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
876 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
877 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
878 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
880 # GB6 L × ( L | V | LV | LVT )
881 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
882 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
883 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
884 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
886 # Do not break between a CR and LF. Otherwise, break before and after
888 # GB5 ÷ ( Control | CR | LF )
889 # GB4 ( Control | CR | LF ) ÷
890 for my $i (0 .. @gcb_table - 1) {
891 $gcb_table[$i][$gcb_enums{'Control'}] = 1;
892 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
893 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
894 $gcb_table[$gcb_enums{'Control'}][$i] = 1;
895 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
896 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
900 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
902 # Break at the start and end of text, unless the text is empty
905 for my $i (0 .. @gcb_table - 1) {
906 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
907 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
909 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
911 output_table_common('GCB', \%gcb_actions,
912 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
915 sub output_LB_table() {
917 # Create and output the enums, #defines, and pair table for use in
918 # determining Line Breaks. This uses the default line break algorithm,
919 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
920 # in that page, as the Unicode-furnished tests assume that tailoring.
922 # The result is really just true or false. But we follow along with tr14,
923 # creating a rule which is false for something like X SP* X. That gets
924 # encoding 2. The rest of the actions are synthetic ones that indicate
925 # some context handling is required. These each are added to the
926 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
927 # value can be retrieved. Actually only rules from 7 through 18 (which
928 # are the ones where space matter) are possible to have 2 added to them.
929 # The others below add just 0 or 1. It might be possible for one
930 # synthetic rule to be added to another, yielding a larger value. This
931 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
932 # names of the middle grouping below, it is impossible for that to occur
933 # for them because they all start with mutually exclusive classes. That
934 # the final rule can't be added to any of the others isn't obvious from
935 # its name, so it is assigned a power of 2 higher than the others can get
936 # to so any addition would preserve all data. (And the code will reach an
937 # assert(0) on debugging builds should this happen.)
941 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
943 LB_CM_ZWJ_foo => 3, # Rule 9
944 LB_SP_foo => 6, # Rule 18
945 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
946 LB_SY_or_IS_then_various => 11, # Rule 25
947 LB_HY_or_BA_then_foo => 13, # Rule 21
948 LB_RI_then_RI => 15, # Rule 30a
950 LB_various_then_PO_or_PR => (1<<5), # Rule 25
953 # Construct the LB pair table. This is based on the rules in
954 # http://www.unicode.org/reports/tr14/, but modified as those rules are
955 # designed for someone taking a string of text and sequentially going
956 # through it to find the break opportunities, whereas, Perl requires
957 # determining if a given random spot is a break opportunity, without
958 # knowing all the entire string before it.
960 # The table is constructed in reverse order of the rules, to make the
961 # lower-numbered, higher priority ones override the later ones, as the
962 # algorithm stops at the earliest matching rule
965 my $table_size = @lb_short_enums;
967 # LB31. Break everywhere else
968 for my $i (0 .. $table_size - 1) {
969 for my $j (0 .. $table_size - 1) {
970 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
974 # LB30b Do not break between an emoji base and an emoji modifier.
976 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'E_Modifier'}]
977 = $lb_actions{'LB_NOBREAK'};
979 # LB30a Break between two regional indicator symbols if and only if there
980 # are an even number of regional indicators preceding the position of the
982 # sot (RI RI)* RI × RI
983 # [^RI] (RI RI)* RI × RI
984 $lb_table[$lb_enums{'Regional_Indicator'}]
985 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_RI_then_RI'};
987 # LB30 Do not break between letters, numbers, or ordinary symbols and
988 # opening or closing parentheses.
989 # (AL | HL | NU) × OP
990 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
991 = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
993 = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
995 = $lb_actions{'LB_NOBREAK'};
997 # CP × (AL | HL | NU)
998 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
999 = $lb_actions{'LB_NOBREAK'};
1000 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
1001 = $lb_actions{'LB_NOBREAK'};
1002 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
1003 = $lb_actions{'LB_NOBREAK'};
1005 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
1007 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
1008 = $lb_actions{'LB_NOBREAK'};
1009 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1010 = $lb_actions{'LB_NOBREAK'};
1012 # LB28 Do not break between alphabetics (“at”).
1013 # (AL | HL) × (AL | HL)
1014 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
1015 = $lb_actions{'LB_NOBREAK'};
1016 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
1017 = $lb_actions{'LB_NOBREAK'};
1018 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
1019 = $lb_actions{'LB_NOBREAK'};
1020 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
1021 = $lb_actions{'LB_NOBREAK'};
1023 # LB27 Treat a Korean Syllable Block the same as ID.
1024 # (JL | JV | JT | H2 | H3) × IN
1025 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
1026 = $lb_actions{'LB_NOBREAK'};
1027 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
1028 = $lb_actions{'LB_NOBREAK'};
1029 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
1030 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
1032 = $lb_actions{'LB_NOBREAK'};
1033 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
1034 = $lb_actions{'LB_NOBREAK'};
1036 # (JL | JV | JT | H2 | H3) × PO
1037 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
1038 = $lb_actions{'LB_NOBREAK'};
1039 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
1040 = $lb_actions{'LB_NOBREAK'};
1041 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
1042 = $lb_actions{'LB_NOBREAK'};
1043 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
1044 = $lb_actions{'LB_NOBREAK'};
1045 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
1046 = $lb_actions{'LB_NOBREAK'};
1048 # PR × (JL | JV | JT | H2 | H3)
1049 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
1050 = $lb_actions{'LB_NOBREAK'};
1051 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
1052 = $lb_actions{'LB_NOBREAK'};
1053 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
1054 = $lb_actions{'LB_NOBREAK'};
1055 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
1056 = $lb_actions{'LB_NOBREAK'};
1057 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
1058 = $lb_actions{'LB_NOBREAK'};
1060 # LB26 Do not break a Korean syllable.
1061 # JL × (JL | JV | H2 | H3)
1062 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
1063 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1064 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
1065 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
1067 # (JV | H2) × (JV | JT)
1068 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1069 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
1070 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1071 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1074 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1075 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
1077 # LB25 Do not break between the following pairs of classes relevant to
1078 # numbers, as tailored by example 7 in
1079 # http://www.unicode.org/reports/tr14/#Examples
1080 # We follow that tailoring because Unicode's test cases expect it
1081 # (PR | PO) × ( OP | HY )? NU
1082 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
1083 = $lb_actions{'LB_NOBREAK'};
1084 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
1085 = $lb_actions{'LB_NOBREAK'};
1087 # Given that (OP | HY )? is optional, we have to test for it in code.
1088 # We add in the action (instead of overriding) for this, so that in
1089 # the code we can recover the underlying break value.
1090 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1091 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1092 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
1093 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1094 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
1095 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1096 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
1097 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
1100 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
1101 = $lb_actions{'LB_NOBREAK'};
1102 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
1103 = $lb_actions{'LB_NOBREAK'};
1105 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
1106 # which can be rewritten as:
1107 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
1108 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
1109 = $lb_actions{'LB_NOBREAK'};
1110 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
1111 = $lb_actions{'LB_NOBREAK'};
1112 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
1113 = $lb_actions{'LB_NOBREAK'};
1114 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
1115 = $lb_actions{'LB_NOBREAK'};
1116 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
1117 = $lb_actions{'LB_NOBREAK'};
1119 # Like earlier where we have to test in code, we add in the action so
1120 # that we can recover the underlying values. This is done in rules
1121 # below, as well. The code assumes that we haven't added 2 actions.
1122 # Shoul a later Unicode release break that assumption, then tests
1123 # should start failing.
1124 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
1125 += $lb_actions{'LB_SY_or_IS_then_various'};
1126 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
1127 += $lb_actions{'LB_SY_or_IS_then_various'};
1128 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
1129 += $lb_actions{'LB_SY_or_IS_then_various'};
1130 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
1131 += $lb_actions{'LB_SY_or_IS_then_various'};
1132 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
1133 += $lb_actions{'LB_SY_or_IS_then_various'};
1134 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
1135 += $lb_actions{'LB_SY_or_IS_then_various'};
1136 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
1137 += $lb_actions{'LB_SY_or_IS_then_various'};
1138 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
1139 += $lb_actions{'LB_SY_or_IS_then_various'};
1140 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
1141 += $lb_actions{'LB_SY_or_IS_then_various'};
1142 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
1143 += $lb_actions{'LB_SY_or_IS_then_various'};
1145 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1146 # which can be rewritten as:
1147 # NU (SY | IS)* (CL | CP)? × (PO | PR)
1148 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1149 = $lb_actions{'LB_NOBREAK'};
1150 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1151 = $lb_actions{'LB_NOBREAK'};
1153 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
1154 += $lb_actions{'LB_various_then_PO_or_PR'};
1155 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
1156 += $lb_actions{'LB_various_then_PO_or_PR'};
1157 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
1158 += $lb_actions{'LB_various_then_PO_or_PR'};
1159 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
1160 += $lb_actions{'LB_various_then_PO_or_PR'};
1162 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
1163 += $lb_actions{'LB_various_then_PO_or_PR'};
1164 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
1165 += $lb_actions{'LB_various_then_PO_or_PR'};
1166 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
1167 += $lb_actions{'LB_various_then_PO_or_PR'};
1168 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
1169 += $lb_actions{'LB_various_then_PO_or_PR'};
1171 # LB24 Do not break between numeric prefix/postfix and letters, or between
1172 # letters and prefix/postfix.
1173 # (PR | PO) × (AL | HL)
1174 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1175 = $lb_actions{'LB_NOBREAK'};
1176 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1177 = $lb_actions{'LB_NOBREAK'};
1178 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1179 = $lb_actions{'LB_NOBREAK'};
1180 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1181 = $lb_actions{'LB_NOBREAK'};
1183 # (AL | HL) × (PR | PO)
1184 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Prefix_Numeric'}]
1185 = $lb_actions{'LB_NOBREAK'};
1186 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Prefix_Numeric'}]
1187 = $lb_actions{'LB_NOBREAK'};
1188 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Postfix_Numeric'}]
1189 = $lb_actions{'LB_NOBREAK'};
1190 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Postfix_Numeric'}]
1191 = $lb_actions{'LB_NOBREAK'};
1193 # LB23a Do not break between numeric prefixes and ideographs, or between
1194 # ideographs and numeric postfixes.
1195 # PR × (ID | EB | EM)
1196 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1197 = $lb_actions{'LB_NOBREAK'};
1198 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Base'}]
1199 = $lb_actions{'LB_NOBREAK'};
1200 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'E_Modifier'}]
1201 = $lb_actions{'LB_NOBREAK'};
1203 # (ID | EB | EM) × PO
1204 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1205 = $lb_actions{'LB_NOBREAK'};
1206 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Postfix_Numeric'}]
1207 = $lb_actions{'LB_NOBREAK'};
1208 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Postfix_Numeric'}]
1209 = $lb_actions{'LB_NOBREAK'};
1211 # LB23 Do not break between digits and letters
1213 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1214 = $lb_actions{'LB_NOBREAK'};
1215 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1216 = $lb_actions{'LB_NOBREAK'};
1219 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1220 = $lb_actions{'LB_NOBREAK'};
1221 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1222 = $lb_actions{'LB_NOBREAK'};
1224 # LB22 Do not break between two ellipses, or between letters, numbers or
1225 # exclamations and ellipsis.
1227 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1228 = $lb_actions{'LB_NOBREAK'};
1229 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1230 = $lb_actions{'LB_NOBREAK'};
1233 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1234 = $lb_actions{'LB_NOBREAK'};
1236 # (ID | EB | EM) × IN
1237 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1238 = $lb_actions{'LB_NOBREAK'};
1239 $lb_table[$lb_enums{'E_Base'}][$lb_enums{'Inseparable'}]
1240 = $lb_actions{'LB_NOBREAK'};
1241 $lb_table[$lb_enums{'E_Modifier'}][$lb_enums{'Inseparable'}]
1242 = $lb_actions{'LB_NOBREAK'};
1245 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1246 = $lb_actions{'LB_NOBREAK'};
1249 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1250 = $lb_actions{'LB_NOBREAK'};
1252 # LB21b Don’t break between Solidus and Hebrew letters.
1254 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1255 = $lb_actions{'LB_NOBREAK'};
1257 # LB21a Don't break after Hebrew + Hyphen.
1259 for my $i (0 .. @lb_table - 1) {
1260 $lb_table[$lb_enums{'Hyphen'}][$i]
1261 += $lb_actions{'LB_HY_or_BA_then_foo'};
1262 $lb_table[$lb_enums{'Break_After'}][$i]
1263 += $lb_actions{'LB_HY_or_BA_then_foo'};
1266 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1267 # spaces, small kana, and other non-starters, or after acute accents.
1272 for my $i (0 .. @lb_table - 1) {
1273 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1274 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1275 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1276 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
1279 # LB20 Break before and after unresolved CB.
1282 # Conditional breaks should be resolved external to the line breaking
1283 # rules. However, the default action is to treat unresolved CB as breaking
1285 for my $i (0 .. @lb_table - 1) {
1286 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1287 = $lb_actions{'LB_BREAKABLE'};
1288 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1289 = $lb_actions{'LB_BREAKABLE'};
1292 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1295 for my $i (0 .. @lb_table - 1) {
1296 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1297 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
1300 # LB18 Break after spaces
1302 for my $i (0 .. @lb_table - 1) {
1303 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
1306 # LB17 Do not break within ‘——’, even with intervening spaces.
1308 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
1309 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1311 # LB16 Do not break between closing punctuation and a nonstarter even with
1312 # intervening spaces.
1313 # (CL | CP) SP* × NS
1314 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
1315 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1316 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
1317 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1320 # LB15 Do not break within ‘”[’, even with intervening spaces.
1322 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
1323 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1325 # LB14 Do not break after ‘[’, even after spaces.
1327 for my $i (0 .. @lb_table - 1) {
1328 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
1329 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1332 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1333 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1339 for my $i (0 .. @lb_table - 1) {
1340 $lb_table[$i][$lb_enums{'Exclamation'}]
1341 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1343 next if $i == $lb_enums{'Numeric'};
1345 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
1346 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1347 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
1348 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1349 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
1350 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1351 $lb_table[$i][$lb_enums{'Break_Symbols'}]
1352 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1355 # LB12a Do not break before NBSP and related characters, except after
1356 # spaces and hyphens.
1358 for my $i (0 .. @lb_table - 1) {
1359 next if $i == $lb_enums{'Space'}
1360 || $i == $lb_enums{'Break_After'}
1361 || $i == $lb_enums{'Hyphen'};
1363 # We don't break, but if a property above has said don't break even
1364 # with space between, don't override that (also in the next few rules)
1365 next if $lb_table[$i][$lb_enums{'Glue'}]
1366 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1367 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
1370 # LB12 Do not break after NBSP and related characters.
1372 for my $i (0 .. @lb_table - 1) {
1373 next if $lb_table[$lb_enums{'Glue'}][$i]
1374 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1375 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
1378 # LB11 Do not break before or after Word joiner and related characters.
1381 for my $i (0 .. @lb_table - 1) {
1382 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
1383 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1385 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
1387 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
1388 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1390 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
1394 # Special case this here to avoid having to do a special case in the code,
1395 # by making this the same as other things with a SP in front of them that
1396 # don't break, we avoid an extra test
1397 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
1398 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1400 # LB9 and LB10 are done in the same loop
1402 # LB9 Do not break a combining character sequence; treat it as if it has
1403 # the line breaking class of the base character in all of the
1404 # higher-numbered rules. Treat ZWJ as if it were CM
1405 # Treat X (CM|ZWJ)* as if it were X.
1406 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1408 # LB10 Treat any remaining combining mark or ZWJ as AL. This catches the
1409 # case where a CM or ZWJ is the first character on the line or follows SP,
1410 # BK, CR, LF, NL, or ZW.
1411 for my $i (0 .. @lb_table - 1) {
1413 # When the CM or ZWJ is the first in the pair, we don't know without
1414 # looking behind whether the CM or ZWJ is going to attach to an
1415 # earlier character, or not. So have to figure this out at runtime in
1417 $lb_table[$lb_enums{'Combining_Mark'}][$i]
1418 = $lb_actions{'LB_CM_ZWJ_foo'};
1419 $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_CM_ZWJ_foo'};
1421 if ( $i == $lb_enums{'Mandatory_Break'}
1422 || $i == $lb_enums{'EDGE'}
1423 || $i == $lb_enums{'Carriage_Return'}
1424 || $i == $lb_enums{'Line_Feed'}
1425 || $i == $lb_enums{'Next_Line'}
1426 || $i == $lb_enums{'Space'}
1427 || $i == $lb_enums{'ZWSpace'})
1429 # For these classes, a following CM doesn't combine, and should do
1430 # whatever 'Alphabetic' would do.
1431 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1432 = $lb_table[$i][$lb_enums{'Alphabetic'}];
1433 $lb_table[$i][$lb_enums{'ZWJ'}]
1434 = $lb_table[$i][$lb_enums{'Alphabetic'}];
1437 # For these classes, the CM or ZWJ combines, so doesn't break,
1438 # inheriting the type of nobreak from the master character.
1439 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
1440 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1442 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1443 = $lb_actions{'LB_NOBREAK'};
1445 if ($lb_table[$i][$lb_enums{'ZWJ'}]
1446 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1448 $lb_table[$i][$lb_enums{'ZWJ'}]
1449 = $lb_actions{'LB_NOBREAK'};
1454 # LB8a Do not break between a zero width joiner and an ideograph, emoji
1455 # base or emoji modifier. This rule prevents breaks within emoji joiner
1457 # ZWJ × (ID | EB | EM)
1458 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
1459 = $lb_actions{'LB_NOBREAK'};
1460 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
1461 = $lb_actions{'LB_NOBREAK'};
1462 $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
1463 = $lb_actions{'LB_NOBREAK'};
1465 # LB8 Break before any character following a zero-width space, even if one
1466 # or more spaces intervene.
1468 for my $i (0 .. @lb_table - 1) {
1469 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
1472 # Because of LB8-10, we need to look at context for "SP x", and this must
1473 # be done in the code. So override the existing rules for that, by adding
1474 # a constant to get new rules that tell the code it needs to look at
1475 # context. By adding this action instead of replacing the existing one,
1476 # we can get back to the original rule if necessary.
1477 for my $i (0 .. @lb_table - 1) {
1478 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
1481 # LB7 Do not break before spaces or zero width space.
1484 for my $i (0 .. @lb_table - 1) {
1485 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1486 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
1489 # LB6 Do not break before hard line breaks.
1490 # × ( BK | CR | LF | NL )
1491 for my $i (0 .. @lb_table - 1) {
1492 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1493 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1494 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1495 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
1498 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1503 for my $i (0 .. @lb_table - 1) {
1504 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1505 = $lb_actions{'LB_BREAKABLE'};
1506 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1507 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
1509 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1510 = $lb_actions{'LB_NOBREAK'};
1512 # LB4 Always break after hard line breaks.
1514 for my $i (0 .. @lb_table - 1) {
1515 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1516 = $lb_actions{'LB_BREAKABLE'};
1519 # LB3 Always break at the end of text.
1521 # LB2 Never break at the start of text.
1523 for my $i (0 .. @lb_table - 1) {
1524 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1525 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
1528 # LB1 Assign a line breaking class to each code point of the input.
1529 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1530 # depending on criteria outside the scope of this algorithm.
1532 # In the absence of such criteria all characters with a specific
1533 # combination of original class and General_Category property value are
1534 # resolved as follows:
1535 # Original Resolved General_Category
1537 # SA CM Only Mn or Mc
1538 # SA AL Any except Mn and Mc
1541 # This is done in mktables, so we never see any of the remapped-from
1544 output_table_common('LB', \%lb_actions,
1545 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
1548 sub output_WB_table() {
1550 # Create and output the enums, #defines, and pair table for use in
1551 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
1553 # This uses the same mechanism in the other bounds tables generated by
1554 # this file. The actions that could override a 0 or 1 are added to those
1555 # numbers; the actions that clearly don't depend on the underlying rule
1561 WB_Ex_or_FO_or_ZWJ_then_foo => 3,
1564 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1565 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1566 WB_MB_or_MN_or_SQ_then_NU => 12,
1567 WB_NU_then_MB_or_MN_or_SQ => 14,
1568 WB_RI_then_RI => 16,
1571 # Construct the WB pair table.
1572 # The table is constructed in reverse order of the rules, to make the
1573 # lower-numbered, higher priority ones override the later ones, as the
1574 # algorithm stops at the earliest matching rule
1577 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1578 die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
1580 # Otherwise, break everywhere (including around ideographs).
1582 for my $i (0 .. $table_size - 1) {
1583 for my $j (0 .. $table_size - 1) {
1584 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1588 # Do not break within emoji flag sequences. That is, do not break between
1589 # regional indicator (RI) symbols if there is an odd number of RI
1590 # characters before the break point.
1591 # WB16 [^RI] (RI RI)* RI × RI
1592 # WB15 sot (RI RI)* RI × RI
1593 $wb_table[$wb_enums{'Regional_Indicator'}]
1594 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_RI_then_RI'};
1596 # Do not break within emoji modifier sequences.
1597 # WB14 ( E_Base | EBG ) × E_Modifier
1598 $wb_table[$wb_enums{'E_Base'}][$wb_enums{'E_Modifier'}]
1599 = $wb_actions{'WB_NOBREAK'};
1600 $wb_table[$wb_enums{'E_Base_GAZ'}][$wb_enums{'E_Modifier'}]
1601 = $wb_actions{'WB_NOBREAK'};
1603 # Do not break from extenders.
1604 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
1605 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1606 = $wb_actions{'WB_NOBREAK'};
1607 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1608 = $wb_actions{'WB_NOBREAK'};
1609 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1610 = $wb_actions{'WB_NOBREAK'};
1611 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1612 = $wb_actions{'WB_NOBREAK'};
1614 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
1616 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1617 = $wb_actions{'WB_NOBREAK'};
1618 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1619 = $wb_actions{'WB_NOBREAK'};
1620 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1621 = $wb_actions{'WB_NOBREAK'};
1622 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1623 = $wb_actions{'WB_NOBREAK'};
1624 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1625 = $wb_actions{'WB_NOBREAK'};
1627 # Do not break between Katakana.
1628 # WB13 Katakana × Katakana
1629 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1630 = $wb_actions{'WB_NOBREAK'};
1632 # Do not break within sequences, such as “3.2” or “3,456.789”.
1633 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
1634 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
1635 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1636 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
1637 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1638 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
1639 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1641 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
1642 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
1643 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1644 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
1645 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1646 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
1647 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1649 # Do not break within sequences of digits, or digits adjacent to letters
1651 # WB10 Numeric × (ALetter | Hebrew_Letter)
1652 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1653 = $wb_actions{'WB_NOBREAK'};
1654 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1655 = $wb_actions{'WB_NOBREAK'};
1657 # WB9 (ALetter | Hebrew_Letter) × Numeric
1658 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1659 = $wb_actions{'WB_NOBREAK'};
1660 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1661 = $wb_actions{'WB_NOBREAK'};
1663 # WB8 Numeric × Numeric
1664 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1665 = $wb_actions{'WB_NOBREAK'};
1667 # Do not break letters across certain punctuation.
1668 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
1669 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1670 += $wb_actions{'WB_DQ_then_HL'};
1672 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
1673 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1674 += $wb_actions{'WB_HL_then_DQ'};
1676 # WB7a Hebrew_Letter × Single_Quote
1677 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1678 = $wb_actions{'WB_NOBREAK'};
1680 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1681 # × (ALetter | Hebrew_Letter)
1682 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
1683 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1684 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
1685 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1686 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
1687 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1688 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
1689 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1690 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
1691 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1692 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
1693 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1695 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1696 # | Single_Quote) (ALetter | Hebrew_Letter)
1697 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
1698 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1699 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
1700 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1701 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
1702 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1703 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
1704 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1705 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
1706 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1707 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1708 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1710 # Do not break between most letters.
1711 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
1712 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1713 = $wb_actions{'WB_NOBREAK'};
1714 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1715 = $wb_actions{'WB_NOBREAK'};
1716 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1717 = $wb_actions{'WB_NOBREAK'};
1718 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1719 = $wb_actions{'WB_NOBREAK'};
1721 # Ignore Format and Extend characters, except after sot, CR, LF, and
1722 # Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
1723 # WB4 X (Extend | Format | ZWJ)* → X
1724 for my $i (0 .. @wb_table - 1) {
1725 $wb_table[$wb_enums{'Extend'}][$i]
1726 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1727 $wb_table[$wb_enums{'Format'}][$i]
1728 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1729 $wb_table[$wb_enums{'ZWJ'}][$i]
1730 = $wb_actions{'WB_Ex_or_FO_or_ZWJ_then_foo'};
1732 for my $i (0 .. @wb_table - 1) {
1733 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1734 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1735 $wb_table[$i][$wb_enums{'ZWJ'}] = $wb_actions{'WB_NOBREAK'};
1738 # Implied is that these attach to the character before them, except for
1739 # the characters that mark the end of a region of text. The rules below
1740 # override the ones set up here, for all the characters that need
1742 for my $i (0 .. @wb_table - 1) {
1743 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1744 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1747 # Do not break within emoji zwj sequences.
1748 # WB3c ZWJ × ( Glue_After_Zwj | EBG )
1749 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
1750 = $wb_actions{'WB_NOBREAK'};
1751 $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
1752 = $wb_actions{'WB_NOBREAK'};
1754 # Break before and after white space
1755 # WB3b ÷ (Newline | CR | LF)
1756 # WB3a (Newline | CR | LF) ÷
1758 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1759 for my $j (0 .. @wb_table - 1) {
1760 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1761 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1765 # But do not break within white space.
1768 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1769 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1770 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1774 # And do not break horizontal space followed by Extend or Format or ZWJ
1775 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1776 = $wb_actions{'WB_NOBREAK'};
1777 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1778 = $wb_actions{'WB_NOBREAK'};
1779 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'ZWJ'}]
1780 = $wb_actions{'WB_NOBREAK'};
1781 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1782 [$wb_enums{'Perl_Tailored_HSpace'}]
1783 = $wb_actions{'WB_hs_then_hs'};
1785 # Break at the start and end of text, unless the text is empty
1788 for my $i (0 .. @wb_table - 1) {
1789 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1790 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
1792 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
1794 output_table_common('WB', \%wb_actions,
1795 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
1798 output_invlist("Latin1", [ 0, 256 ]);
1799 output_invlist("AboveLatin1", [ 256 ]);
1803 # We construct lists for all the POSIX and backslash sequence character
1804 # classes in two forms:
1805 # 1) ones which match only in the ASCII range
1806 # 2) ones which match either in the Latin1 range, or the entire Unicode range
1808 # These get compiled in, and hence affect the memory footprint of every Perl
1809 # program, even those not using Unicode. To minimize the size, currently
1810 # the Latin1 version is generated for the beyond ASCII range except for those
1811 # lists that are quite small for the entire range, such as for \s, which is 22
1812 # UVs long plus 4 UVs (currently) for the header.
1814 # To save even more memory, the ASCII versions could be derived from the
1815 # larger ones at runtime, saving some memory (minus the expense of the machine
1816 # instructions to do so), but these are all small anyway, so their total is
1819 # In the list of properties below that get generated, the L1 prefix is a fake
1820 # property that means just the Latin1 range of the full property (whose name
1821 # has an X prefix instead of L1).
1823 # An initial & means to use the subroutine from this file instead of an
1824 # official inversion list.
1826 for my $charset (get_supported_code_pages()) {
1827 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1829 @a2n = @{get_a2n($charset)};
1830 # Below is the list of property names to generate. '&' means to use the
1831 # subroutine to generate the inversion list instead of the generic code
1832 # below. Some properties have a comma-separated list after the name,
1833 # These are extra enums to add to those found in the Unicode tables.
1835 # Ignore non-alpha in sort
1836 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1857 &NonL1_Perl_Non_Final_Folds
1858 _Perl_Folds_To_Multi_Char
1862 _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
1863 _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
1864 _Perl_SB,SContinue,CR,Extend,LF,EDGE
1865 _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
1866 _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
1868 # NOTE that the convention is that extra enum
1869 # values come after the property name, separated by
1870 # commas, with the enums that aren't ever defined
1871 # by Unicode coming last, at least 4 all-uppercase
1872 # characters. The others are enum names that are
1873 # needed by perl, but aren't in all Unicode
1877 # For the Latin1 properties, we change to use the eXtended version of the
1878 # base property, then go through the result and get rid of everything not
1879 # in Latin1 (above 255). Actually, we retain the element for the range
1880 # that crosses the 255/256 boundary if it is one that matches the
1881 # property. For example, in the Word property, there is a range of code
1882 # points that start at U+00F8 and goes through U+02C1. Instead of
1883 # artificially cutting that off at 256 because 256 is the first code point
1884 # above Latin1, we let the range go to its natural ending. That gives us
1885 # extra information with no added space taken. But if the range that
1886 # crosses the boundary is one that doesn't match the property, we don't
1887 # start a new range above 255, as that could be construed as going to
1888 # infinity. For example, the Upper property doesn't include the character
1889 # at 255, but does include the one at 256. We don't include the 256 one.
1890 my $prop_name = $prop;
1891 my $is_local_sub = $prop_name =~ s/^&//;
1892 my $extra_enums = "";
1893 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
1894 my $lookup_prop = $prop_name;
1895 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1896 or $lookup_prop =~ s/^L1//);
1898 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
1899 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
1905 my $maps_to_code_point;
1907 if ($is_local_sub) {
1908 @invlist = eval $lookup_prop;
1912 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
1915 # If couldn't find a non-empty inversion list, see if it is
1916 # instead an inversion map
1917 my ($list_ref, $map_ref, $format, $default)
1918 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
1920 # An empty return here could mean an unknown property, or
1921 # merely that the original inversion list is empty. Call
1922 # in scalar context to differentiate
1923 my $count = prop_invlist($lookup_prop,
1924 '_perl_core_internal_ok');
1925 die "Could not find inversion list for '$lookup_prop'"
1926 unless defined $count;
1929 @invlist = @$list_ref;
1930 @invmap = @$map_ref;
1931 $map_format = $format;
1932 $map_default = $default;
1933 $maps_to_code_point = $map_format =~ /x/;
1934 $to_adjust = $map_format =~ /a/;
1940 # Short-circuit an empty inversion list.
1942 output_invlist($prop_name, \@invlist, $charset);
1946 # Re-order the Unicode code points to native ones for this platform.
1947 # This is only needed for code points below 256, because native code
1948 # points are only in that range. For inversion maps of properties
1949 # where the mappings are adjusted (format =~ /a/), this reordering
1950 # could mess up the adjustment pattern that was in the input, so that
1951 # has to be dealt with.
1953 # And inversion maps that map to code points need to eventually have
1954 # all those code points remapped to native, and it's better to do that
1955 # here, going through the whole list not just those below 256. This
1956 # is because some inversion maps have adjustments (format =~ /a/)
1957 # which may be affected by the reordering. This code needs to be done
1958 # both for when we are translating the inversion lists for < 256, and
1959 # for the inversion maps for everything. By doing both in this loop,
1960 # we can share that code.
1962 # So, we go through everything for an inversion map to code points;
1963 # otherwise, we can skip any remapping at all if we are going to
1964 # output only the above-Latin1 values, or if the range spans the whole
1965 # of 0..256, as the remap will also include all of 0..256 (256 not
1966 # 255 because a re-ordering could cause 256 to need to be in the same
1968 if ((@invmap && $maps_to_code_point)
1969 || (! $nonl1_only || ($invlist[0] < 256
1970 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
1973 if (! @invmap) { # Straight inversion list
1974 # Look at all the ranges that start before 257.
1977 last if $invlist[0] > 256;
1978 my $upper = @invlist > 1
1979 ? $invlist[1] - 1 # In range
1981 # To infinity. You may want to stop much much
1982 # earlier; going this high may expose perl
1983 # deficiencies with very large numbers.
1984 : $Unicode::UCD::MAX_CP;
1985 for my $j ($invlist[0] .. $upper) {
1986 push @latin1_list, a2n($j);
1989 shift @invlist; # Shift off the range that's in the list
1990 shift @invlist; # Shift off the range not in the list
1993 # Here @invlist contains all the ranges in the original that start
1994 # at code points above 256, and @latin1_list contains all the
1995 # native code points for ranges that start with a Unicode code
1996 # point below 257. We sort the latter and convert it to inversion
1997 # list format. Then simply prepend it to the list of the higher
1999 @latin1_list = sort { $a <=> $b } @latin1_list;
2000 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
2001 unshift @invlist, @latin1_list;
2003 else { # Is an inversion map
2005 # This is a similar procedure as plain inversion list, but has
2006 # multiple buckets. A plain inversion list just has two
2007 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
2008 # pretty much can ignore the 2nd bucket, as it is completely
2009 # defined by the 1st. But here, what we do is create buckets
2010 # which contain the code points that map to each, translated
2011 # to native and turned into an inversion list. Thus each
2012 # bucket is an inversion list of native code points that map
2013 # to it or don't map to it. We use these to create an
2014 # inversion map for the whole property.
2016 # As mentioned earlier, we use this procedure to not just
2017 # remap the inversion list to native values, but also the maps
2018 # of code points to native ones. In the latter case we have
2019 # to look at the whole of the inversion map (or at least to
2020 # above Unicode; as the maps of code points above that should
2021 # all be to the default).
2022 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
2024 my %mapped_lists; # A hash whose keys are the buckets.
2026 last if $invlist[0] > $upper_limit;
2028 # This shouldn't actually happen, as prop_invmap() returns
2029 # an extra element at the end that is beyond $upper_limit
2030 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
2034 # A hash key can't be a ref (we are only expecting arrays
2035 # of scalars here), so convert any such to a string that
2036 # will be converted back later (using a vertical tab as
2037 # the separator). Even if the mapping is to code points,
2038 # we don't translate to native here because the code
2039 # output_invmap() calls to output these arrays assumes the
2040 # input is Unicode, not native.
2041 if (ref $invmap[0]) {
2042 $bucket = join "\cK", @{$invmap[0]};
2044 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
2046 # Do convert to native for maps to single code points.
2047 # There are some properties that have a few outlier
2048 # maps that aren't code points, so the above test
2050 $bucket = a2n($invmap[0]);
2052 $bucket = $invmap[0];
2055 # We now have the bucket that all code points in the range
2056 # map to, though possibly they need to be adjusted. Go
2057 # through the range and put each translated code point in
2058 # it into its bucket.
2059 my $base_map = $invmap[0];
2060 for my $j ($invlist[0] .. $invlist[1] - 1) {
2062 # The 1st code point doesn't need adjusting
2065 # Skip any non-numeric maps: these are outliers
2066 # that aren't code points.
2067 && $base_map =~ $numeric_re
2069 # 'ne' because the default can be a string
2070 && $base_map ne $map_default)
2072 # We adjust, by incrementing each the bucket and
2073 # the map. For code point maps, translate to
2076 $bucket = ($maps_to_code_point)
2081 # Add the native code point to the bucket for the
2083 push @{$mapped_lists{$bucket}}, a2n($j);
2084 } # End of loop through all code points in the range
2086 # Get ready for the next range
2089 } # End of loop through all ranges in the map.
2091 # Here, @invlist and @invmap retain all the ranges from the
2092 # originals that start with code points above $upper_limit.
2093 # Each bucket in %mapped_lists contains all the code points
2094 # that map to that bucket. If the bucket is for a map to a
2095 # single code point is a single code point, the bucket has
2096 # been converted to native. If something else (including
2097 # multiple code points), no conversion is done.
2099 # Now we recreate the inversion map into %xlated, but this
2100 # time for the native character set.
2102 foreach my $bucket (keys %mapped_lists) {
2104 # Sort and convert this bucket to an inversion list. The
2105 # result will be that ranges that start with even-numbered
2106 # indexes will be for code points that map to this bucket;
2107 # odd ones map to some other bucket, and are discarded
2109 @{$mapped_lists{$bucket}}
2110 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
2111 @{$mapped_lists{$bucket}}
2112 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
2114 # Add each even-numbered range in the bucket to %xlated;
2115 # so that the keys of %xlated become the range start code
2116 # points, and the values are their corresponding maps.
2117 while (@{$mapped_lists{$bucket}}) {
2118 my $range_start = $mapped_lists{$bucket}->[0];
2119 if ($bucket =~ /\cK/) {
2120 @{$xlated{$range_start}} = split /\cK/, $bucket;
2123 $xlated{$range_start} = $bucket;
2125 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
2126 shift @{$mapped_lists{$bucket}}; # Get ready for next
2129 } # End of loop through all the buckets.
2131 # Here %xlated's keys are the range starts of all the code
2132 # points in the inversion map. Construct an inversion list
2134 my @new_invlist = sort { $a <=> $b } keys %xlated;
2136 # If the list is adjusted, we want to munge this list so that
2137 # we only have one entry for where consecutive code points map
2138 # to consecutive values. We just skip the subsequent entries
2139 # where this is the case.
2142 for my $i (0 .. @new_invlist - 1) {
2144 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
2145 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
2146 && $xlated{$new_invlist[$i]} =~ $numeric_re
2147 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
2148 push @temp, $new_invlist[$i];
2150 @new_invlist = @temp;
2153 # The inversion map comes from %xlated's values. We can
2154 # unshift each onto the front of the untouched portion, in
2155 # reverse order of the portion we did process.
2156 foreach my $start (reverse @new_invlist) {
2157 unshift @invmap, $xlated{$start};
2160 # Finally prepend the inversion list we have just constructed to the
2161 # one that contains anything we didn't process.
2162 unshift @invlist, @new_invlist;
2166 # prop_invmap() returns an extra final entry, which we can now
2174 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
2175 for my $i (0 .. @invlist - 1 - 1) {
2176 if ($invlist[$i] > 255) {
2178 # In an inversion list, even-numbered elements give the code
2179 # points that begin ranges that match the property;
2180 # odd-numbered give ones that begin ranges that don't match.
2181 # If $i is odd, we are at the first code point above 255 that
2182 # doesn't match, which means the range it is ending does
2183 # match, and crosses the 255/256 boundary. We want to include
2184 # this ending point, so increment $i, so the splice below
2185 # includes it. Conversely, if $i is even, it is the first
2186 # code point above 255 that matches, which means there was no
2187 # matching range that crossed the boundary, and we don't want
2188 # to include this code point, so splice before it.
2189 $i++ if $i % 2 != 0;
2191 # Remove everything past this.
2192 splice @invlist, $i;
2193 splice @invmap, $i if @invmap;
2198 elsif ($nonl1_only) {
2199 my $found_nonl1 = 0;
2200 for my $i (0 .. @invlist - 1 - 1) {
2201 next if $invlist[$i] < 256;
2203 # Here, we have the first element in the array that indicates an
2204 # element above Latin1. Get rid of all previous ones.
2205 splice @invlist, 0, $i;
2206 splice @invmap, 0, $i if @invmap;
2208 # If this one's index is not divisible by 2, it means that this
2209 # element is inverting away from being in the list, which means
2210 # all code points from 256 to this one are in this list (or
2211 # map to the default for inversion maps)
2213 unshift @invlist, 256;
2214 unshift @invmap, $map_default if @invmap;
2219 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
2222 output_invlist($prop_name, \@invlist, $charset);
2223 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
2226 print $out_fh "\n" . get_conditional_compile_line_end();
2229 switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
2237 my $sources_list = "lib/unicore/mktables.lst";
2238 my @sources = ($0, qw(lib/unicore/mktables
2240 regen/charset_translations.pl
2243 # Depend on mktables’ own sources. It’s a shorter list of files than
2244 # those that Unicode::UCD uses.
2245 if (! open my $mktables_list, '<', $sources_list) {
2247 # This should force a rebuild once $sources_list exists
2248 push @sources, $sources_list;
2251 while(<$mktables_list>) {
2254 push @sources, "lib/unicore/$_" if /^[^#]/;
2259 read_only_bottom_close_and_rename($out_fh, \@sources);