5 use Unicode::UCD qw(prop_aliases
9 prop_invmap search_invlist
11 require 'regen/regen_lib.pl';
12 require 'regen/charset_translations.pl';
14 # This program outputs charclass_invlists.h, which contains various inversion
15 # lists in the form of C arrays that are to be used as-is for inversion lists.
16 # Thus, the lists it contains are essentially pre-compiled, and need only a
17 # light-weight fast wrapper to make them usable at run-time.
19 # As such, this code knows about the internal structure of these lists, and
20 # any change made to that has to be done here as well. A random number stored
21 # in the headers is used to minimize the possibility of things getting
22 # out-of-sync, or the wrong data structure being passed. Currently that
25 # charclass_invlists.h now also has a partial implementation of inversion
26 # maps; enough to generate tables for the line break properties, such as GCB
28 my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
# Matches a plain decimal number: an optional leading minus sign, one or more
# digits, and an optional fractional part.  /a restricts \d to ASCII digits and
# /x permits the embedded white space.  The fractional part is a non-capturing
# group, '(?: ... )'; the earlier spelling '(:? ... )' was a capturing group
# that began with an optional literal colon, so strings such as "1:.5" were
# wrongly accepted as numeric.
my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /ax;
33 # Matches valid C language enum names: begins with ASCII alphabetic, then any
35 my $enum_name_re = qr / ^ [[:alpha:]] \w* $ /ax;
37 my $out_fh = open_new('charclass_invlists.h', '>',
38 {style => '*', by => $0,
39 from => "Unicode::UCD"});
41 my $in_file_pound_if = 0;
43 my $max_hdr_len = 3; # In headings, how wide a name is allowed?
45 print $out_fh "/* See the generating file for comments */\n\n";
47 # The symbols generated by this program are all currently defined only in a
48 # single dot c each. The code knows where most of them go, but this hash
49 # gives overrides for the exceptions to the typical place
50 my %exceptions_to_where_to_define =
51 ( NonL1_Perl_Non_Final_Folds => 'PERL_IN_REGCOMP_C',
52 AboveLatin1 => 'PERL_IN_REGCOMP_C',
53 Latin1 => 'PERL_IN_REGCOMP_C',
54 UpperLatin1 => 'PERL_IN_REGCOMP_C',
55 _Perl_Any_Folds => 'PERL_IN_REGCOMP_C',
56 _Perl_Folds_To_Multi_Char => 'PERL_IN_REGCOMP_C',
57 _Perl_IDCont => 'PERL_IN_UTF8_C',
58 _Perl_IDStart => 'PERL_IN_UTF8_C',
61 # This hash contains the properties with enums that have hard-coded references
62 # to them in C code. It is needed to make sure that if perl is compiled
63 # with an older Unicode data set, that all the enum values the code is
64 # expecting will still be in the enum typedef. Thus the code doesn't have to
65 # change. The Unicode version won't have any code points that have the enum
66 # values not in that version, so the code that handles them will not get
67 # exercised. This is far better than having to #ifdef things. The names here
68 # should be the long names of the respective property values. The reason for
69 # this is because regexec.c uses them as case labels, and the long name is
70 # generally more understandable than the short.
71 my %hard_coded_enums =
119 'Regional_Indicator',
157 'Perl_Tailored_HSpace',
158 'Regional_Indicator',
165 my %gcb_abbreviations;
168 my %lb_abbreviations;
171 my %wb_abbreviations;
176 # Returns non-duplicated input values. From "Perl Best Practices:
177 # Encapsulated Cleverness". p. 455 in first edition.
180 return grep { ! $seen{$_}++ } @_;
186 # Returns the input Unicode code point translated to native.
188 return $cp if $cp !~ $numeric_re || $cp > 255;
# Closes the preprocessor conditional that is currently open in the generated
# file, if there is one.  The matching '#endif' is written to $out_fh with the
# condition text repeated in a C comment, and $in_file_pound_if is reset to 0
# so that later code can tell that no '#if' is open.  Does nothing when no
# conditional is open.
192 sub end_file_pound_if {
193 if ($in_file_pound_if) {
194 print $out_fh "\n#endif\t/* $in_file_pound_if */\n";
195 $in_file_pound_if = 0;
# Makes sure the generated file is inside the '#if defined(...)' section that
# the named symbol belongs in.  Takes two arguments (prototype '$$'): the
# symbol's name and the default preprocessor symbol to guard it with.  An
# entry for the name in %exceptions_to_where_to_define overrides the default.
# The currently open condition is tracked in $in_file_pound_if.
199 sub switch_pound_if ($$) {
201 my $new_pound_if = shift;
203 # Switch to new #if given by the 2nd argument. If there is an override
204 # for this, it instead switches to that. The 1st argument is the
205 # static's name, used to look up the overrides
207 if (exists $exceptions_to_where_to_define{$name}) {
208 $new_pound_if = $exceptions_to_where_to_define{$name};
211 # Exit current #if if the new one is different from the old
# NOTE(review): $new_pound_if is interpolated into the pattern unquoted, and
# the match is unanchored against the stored "defined(...)" text, so this is a
# substring/regex test rather than an exact comparison.
212 if ($in_file_pound_if
213 && $in_file_pound_if !~ /$new_pound_if/)
218 # Enter new #if, if not already in it.
219 if (! $in_file_pound_if) {
# The stored value is the full condition text; it is reused verbatim in the
# '#if' line here.
220 $in_file_pound_if = "defined($new_pound_if)";
221 print $out_fh "\n#if $in_file_pound_if\n";
# Writes one inversion list to $out_fh as a C array named "${name}_invlist".
# The prototype is '$$;$': a name, a reference to the inversion list array, and
# an optional character set name that is only used in a comment.  Dies if the
# second argument is not an array reference.  The array is emitted as a
# three-element header (element count, $VERSION_DATA_STRUCTURE_TYPE, and the
# 0-or-1 offset flag) followed by the list elements in hexadecimal.
225 sub output_invlist ($$;$) {
227 my $invlist = shift; # Reference to inversion list array
228 my $charset = shift // ""; # name of character set for comment
230 die "No inversion list for $name" unless defined $invlist
231 && ref $invlist eq 'ARRAY';
233 # Output the inversion list $invlist using the name $name for it.
234 # It is output in the exact internal form for inversion lists.
236 # Is the last element of the header 0, or 1 ?
# A non-empty list that does not begin at code point 0 gets a 0 prepended.
# Note that this modifies the caller's array in place.
238 if (@$invlist && $invlist->[0] != 0) {
239 unshift @$invlist, 0;
# The count is taken after any 0 has been prepended, so it includes it.
242 my $count = @$invlist;
# Inversion lists default to the PERL_IN_PERL_C section unless overridden.
244 switch_pound_if ($name, 'PERL_IN_PERL_C');
246 print $out_fh "\nstatic const UV ${name}_invlist[] = {";
247 print $out_fh " /* for $charset */" if $charset;
250 print $out_fh "\t$count,\t/* Number of elements */\n";
251 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
252 print $out_fh "\t", $zero_or_one,
253 ",\t/* 0 if the list starts at 0;",
254 "\n\t\t 1 if it starts at the element beyond 0 */\n";
256 # The main body are the UVs passed in to this routine. Do the final
# Every element except the last is followed by a comma.
258 for my $i (0 .. @$invlist - 1) {
259 printf $out_fh "\t0x%X", $invlist->[$i];
260 print $out_fh "," if $i < @$invlist - 1;
264 print $out_fh "};\n";
# Writes an inversion map to $out_fh.  For the property named by $prop_name
# this emits a "<prefix>ENUM_COUNT" #define, a C 'typedef enum' listing the
# property's values, and a static array "${name}_invmap" whose elements are
# those enum names.  Only the 's' input format is handled; any other format
# dies.  The prototype takes seven scalars; the ones unpacked in the visible
# code are the map reference, the property name, the map's format, the default
# property value, a comma-separated list of extra enum names, and an optional
# character set name used only in a comment.
267 sub output_invmap ($$$$$$$) {
269 my $invmap = shift; # Reference to inversion map array
270 my $prop_name = shift;
271 my $input_format = shift; # The inversion map's format
272 my $default = shift; # The property value for code points that
273 # otherwise don't have a value specified.
274 my $extra_enums = shift; # comma-separated list of our additions to the
275 # property's standard possible values
276 my $charset = shift // ""; # name of character set for comment
278 # Output the inversion map $invmap for property $prop_name, but use $name
279 # as the actual data structure's name.
281 my $count = @$invmap;
284 my $declaration_type;
288 if ($input_format eq 's') {
# Resolve the property to its full and short names.  If Unicode::UCD does not
# know the property, fall back to the given name (with any '_Perl_' prefix
# removed for the full name).
289 my $orig_prop_name = $prop_name;
290 $prop_name = (prop_aliases($prop_name))[1] // $prop_name =~ s/^_Perl_//r; # Get full name
291 my $short_name = (prop_aliases($prop_name))[0] // $prop_name;
293 if ($orig_prop_name eq $prop_name) {
294 @enums = prop_values($prop_name);
297 @enums = uniques(@$invmap);
301 die "Only enum properties are currently handled; '$prop_name' isn't one";
# NOTE(review): this dereferences the %hard_coded_enums entry directly, so it
# presumably relies on every property handled here having an entry -- confirm.
304 my @expected_enums = @{$hard_coded_enums{lc $short_name}};
305 my @canonical_input_enums;
306 if (@expected_enums) {
# Finding more values than are hard-coded is treated as a fatal error.
307 if (@expected_enums < @enums) {
308 die 'You need to update %hard_coded_enums to reflect new'
309 . " entries in this Unicode version\n"
310 . "Expected: " . join(", ", sort @expected_enums) . "\n"
311 . " Got: " . join(", ", sort @enums);
314 if (! defined prop_aliases($prop_name)) {
316 # Convert the input enums into canonical form and
318 @canonical_input_enums = map { lc ($_ =~ s/_//gr) }
321 @enums = sort @expected_enums;
324 # The internal enums come last, and in the order specified
326 if ($extra_enums ne "") {
327 @extras = split /,/, $extra_enums;
328 push @enums, @extras;
331 # Assign a value to each element of the enum. The default
332 # value always gets 0; the others are arbitrarily assigned.
334 my $canonical_default = prop_value_aliases($prop_name, $default);
335 $default = $canonical_default if defined $canonical_default;
336 $enums{$default} = $enum_val++;
337 for my $enum (@enums) {
338 $enums{$enum} = $enum_val++ unless exists $enums{$enum};
341 # Calculate the enum values for certain properties like
342 # _Perl_GCB and _Perl_LB, because we output special tables for
344 if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) {
346 # We use string evals to allow the same code to work on
347 # all tables we're doing.
348 my $type = lc $prop_name;
350 # We use lowercase single letter names for any property
351 # values not in the release of Unicode being compiled now.
352 my $placeholder = "a";
354 # Skip if we've already done this code, which populated
356 if (eval "! \%${type}_enums") {
359 foreach my $enum (sort keys %enums) {
360 my $value = $enums{$enum};
362 my $abbreviated_from;
364 # Special case this wb property value to make the
366 if ($enum eq 'Perl_Tailored_HSpace') {
368 $abbreviated_from = $enum;
370 elsif (grep { $_ eq $enum } @extras) {
372 # The 'short' name for one of the property
373 # values added by this file is just the
377 elsif (grep {$_ eq lc ( $enum =~ s/_//gr) }
378 @canonical_input_enums)
379 { # On Unicode versions that predate the
380 # official property, we have set up this array
381 # to be the canonical form of each enum in the
382 # substitute property. If the enum we're
383 # looking at is canonically the same as one of
384 # these, use its name instead of generating a
385 # placeholder one in the next clause (which
386 # will happen because prop_value_aliases()
387 # will fail because it only works on official
392 # Use the official short name for the other
393 # property values, which should all be
395 ($short) = prop_value_aliases($type, $enum);
397 # But create a placeholder for ones not in
398 # this Unicode version.
399 $short = $placeholder++ unless defined $short;
402 # If our short name is too long, or we already
403 # know that the name is an abbreviation, truncate
404 # to make sure it's short enough, and remember
405 # that we did this so we can later place in a
406 # comment in the generated file
407 if ( $abbreviated_from
408 || length $short > $max_hdr_len)
410 $short = substr($short, 0, $max_hdr_len);
411 $abbreviated_from = $enum
412 unless $abbreviated_from;
413 # If the name we are to display conflicts, try
416 \$${type}_abbreviations{$short}")
422 eval "\$${type}_abbreviations{$short} = '$enum'";
426 # Remember the mapping from the property value
427 # (enum) name to its value.
428 eval "\$${type}_enums{$enum} = $value";
431 # Remember the inverse mapping to the short name
432 # so that we can properly label the generated
433 # table's rows and columns
434 eval "\$${type}_short_enums[$value] = '$short'";
441 # Inversion map stuff is currently used only by regexec
442 switch_pound_if($name, 'PERL_IN_REGEXEC_C');
445 # The short names tend to be two lower case letters, but it looks
446 # better for those if they are upper. XXX
447 $short_name = uc($short_name) if length($short_name) < 3
448 || substr($short_name, 0, 1) =~ /[[:lower:]]/;
# Every generated C identifier for this property starts with this prefix.
449 $name_prefix = "${short_name}_";
450 my $enum_count = keys %enums;
451 print $out_fh "\n#define ${name_prefix}ENUM_COUNT ", scalar keys %enums, "\n";
453 print $out_fh "\ntypedef enum {\n";
# Invert %enums into an array indexed by enum value so the typedef is emitted
# in numeric order.
455 foreach my $enum (keys %enums) {
456 $enum_list[$enums{$enum}] = $enum;
458 foreach my $i (0 .. @enum_list - 1) {
459 my $name = $enum_list[$i];
460 print $out_fh "\t${name_prefix}$name = $i";
461 print $out_fh "," if $i < $enum_count - 1;
464 $declaration_type = "${name_prefix}enum";
465 print $out_fh "} $declaration_type;\n";
467 $output_format = "${name_prefix}%s";
471 die "'$input_format' invmap() format for '$prop_name' unimplemented";
474 die "No inversion map for $prop_name" unless defined $invmap
475 && ref $invmap eq 'ARRAY'
478 print $out_fh "\nstatic const $declaration_type ${name}_invmap[] = {";
479 print $out_fh " /* for $charset */" if $charset;
482 # The main body are the scalars passed in to this routine.
# Each element is replaced by the value prop_value_aliases() returns for it,
# when that is defined, and then given the enum prefix.
483 for my $i (0 .. $count - 1) {
484 my $element = $invmap->[$i];
485 my $full_element_name = prop_value_aliases($prop_name, $element);
486 $element = $full_element_name if defined $full_element_name;
487 $element = $name_prefix . $element;
488 print $out_fh "\t$element";
489 print $out_fh "," if $i < $count - 1;
492 print $out_fh "};\n";
# Builds an inversion list from a reference to an array of code points that is
# already sorted in ascending order.  Returns nothing when the input array is
# empty.  Each range is stored as a pair: its first code point and the code
# point one past its last.
495 sub mk_invlist_from_sorted_cp_list {
497 # Returns an inversion list constructed from the sorted input array of
500 my $list_ref = shift;
502 return unless @$list_ref;
504 # Initialize to just the first element
505 my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
507 # For each succeeding element, if it extends the previous range, adjust
508 # up, otherwise add it.
509 for my $i (1 .. @$list_ref - 1) {
# $invlist[-1] is one past the end of the current range, so equality means the
# next code point is adjacent to that range.
510 if ($invlist[-1] == $list_ref->[$i]) {
# Otherwise start a new single-code-point range.
514 push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
# Top-level scan of the Case_Folding inversion map.  It fills two file-scoped
# arrays: @has_multi_char_fold gets the $cp_ref entry of every map element
# whose fold is an array reference, and @is_non_final_fold gets, without
# duplicates, every code point of such a fold except its last one.
520 # Read in the Case Folding rules, and construct arrays of code points for the
521 # properties we need.
522 my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
523 die "Could not find inversion map for Case_Folding" unless defined $format;
524 die "Incorrect format '$format' for Case_Folding inversion map"
525 unless $format eq 'al'
527 my @has_multi_char_fold;
528 my @is_non_final_fold;
530 for my $i (0 .. @$folds_ref - 1) {
# A fold to several characters is stored as an array reference.
531 next unless ref $folds_ref->[$i]; # Skip single-char folds
532 push @has_multi_char_fold, $cp_ref->[$i];
534 # Add to the non-finals list each code point that is in a non-final
# The upper bound of '- 2' leaves out the final element of the fold.
536 for my $j (0 .. @{$folds_ref->[$i]} - 2) {
# The grep is a linear scan that keeps the list free of duplicates.
537 push @is_non_final_fold, $folds_ref->[$i][$j]
538 unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
# Returns an inversion list of the code points collected in
# @is_non_final_fold.  The array is sorted numerically in place first, because
# mk_invlist_from_sorted_cp_list() requires sorted input.
542 sub _Perl_Non_Final_Folds {
543 @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
544 return mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
# Sort helper taking one scalar (prototype '$').  It reduces a property name
# to a form suitable for comparison; the visible step deletes every character
# that is not alphabetic.
547 sub prop_name_for_cmp ($) { # Sort helper
550 # Returns the input lowercased, with non-alphas removed, as well as
551 # everything starting with a comma
554 $name =~ s/[[:^alpha:]]//g;
559 return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]);
# Writes a square pair table to $out_fh as a static C array named
# "${property}_table".  It is preceded by an optional list of #defines for the
# table's values, an optional wrapped comment explaining abbreviated and
# placeholder names, and a column-heading comment.  Each row is labelled with
# its name from $names_ref.  Dies if any table value needs more digits than
# $column_width allows.
562 sub output_table_common {
564 # Common subroutine to actually output the generated rules table.
567 $table_value_defines_ref,
570 $abbreviations_ref) = @_;
# The table is square, so this one size serves for both rows and columns.
571 my $size = @$table_ref;
573 # Output the #define list, sorted by numeric value
574 if ($table_value_defines_ref) {
575 my $max_name_length = 0;
578 # Put in order, and at the same time find the longest name
579 while (my ($enum, $value) = each %$table_value_defines_ref) {
580 $defines[$value] = $enum;
582 my $length = length $enum;
583 $max_name_length = $length if $length > $max_name_length;
588 # Output, so that the values are vertically aligned in a column after
590 foreach my $i (0 .. @defines - 1) {
# Values that no name maps to leave undefined slots, which are skipped.
591 next unless defined $defines[$i];
592 printf $out_fh "#define %-*s %2d\n",
599 my $column_width = 2; # We currently allow 2 digits for the number
601 # If the maximum value in the table is 1, it can be a bool. (Being above
602 # a U8 is not currently handled
604 for my $i (0 .. $size - 1) {
605 for my $j (0 .. $size - 1) {
606 next if $max_element >= $table_ref->[$i][$j];
607 $max_element = $table_ref->[$i][$j];
# NOTE(review): the message below opens a single quote before $max_element but
# never closes it.
610 die "Need wider table column width given '$max_element"
611 if length $max_element > $column_width;
613 my $table_type = ($max_element == 1)
617 # If a name is longer than the width set aside for a column, its column
618 # needs to have increased spacing so that the name doesn't get truncated
619 # nor run into an adjacent column
622 # If we are being compiled on a Unicode version earlier than that which
623 # this file was designed for, it may be that some of the property values
624 # aren't in the current release, and so would be undefined if we didn't
625 # define them ourselves. Earlier code has done this, making them
626 # lowercase characters of length one. We look to see if any exist, so
627 # that we can add an annotation to the output table
628 my $has_placeholder = 0;
630 for my $i (0 .. $size - 1) {
631 no warnings 'numeric';
632 $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
# Padding for this column: one blank per character by which the name exceeds
# $column_width.  A negative repeat count gives an empty string.
633 $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
636 print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
638 # Calculate the column heading line
639 my $header_line = "/* "
640 . (" " x $max_hdr_len) # We let the row heading meld to
641 # the '*/' for those that are at
643 . " " x 3; # Space for '*/ '
645 for my $i (0 .. $size - 1) {
646 $header_line .= sprintf "%s%*s",
648 $column_width + 1, # 1 for the ','
651 $header_line .= " */\n";
653 # If we have annotations, output it now.
654 if ($has_placeholder || scalar %$abbreviations_ref) {
656 foreach my $abbr (sort keys %$abbreviations_ref) {
657 $text .= "; " if $text;
658 $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
660 if ($has_placeholder) {
661 $text .= "; other " if $text;
662 $text .= "lowercase names are placeholders for"
663 . " property values not defined until a later Unicode"
664 . " release, so are irrelevant in this one, as they are"
665 . " not assigned to any code points";
668 my $indent = " " x 3;
669 $text = $indent . "/* $text */";
671 # Wrap the text so that it is no wider than the table, which the
673 my $output_width = length $header_line;
674 while (length $text > $output_width) {
675 my $cur_line = substr($text, 0, $output_width);
677 # Find the first blank back from the right end to wrap at.
678 for (my $i = $output_width -1; $i > 0; $i--) {
679 if (substr($text, $i, 1) eq " ") {
680 print $out_fh substr($text, 0, $i), "\n";
682 # Set so will look at just the remaining tail (which will
683 # be indented and have a '*' after the indent
684 $text = $indent . " * " . substr($text, $i + 1);
691 print $out_fh $text, "\n" if $text;
694 # We calculated the header line earlier just to get its width so that we
695 # could make sure the annotations fit into that.
696 print $out_fh $header_line;
698 # Now output the bulk of the table.
699 for my $i (0 .. $size - 1) {
701 # First the row heading.
702 printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i];
703 print $out_fh "{"; # Then the brace for this row
706 for my $j (0 .. $size -1) {
707 print $out_fh $spacers[$j];
708 printf $out_fh "%*d", $column_width, $table_ref->[$i][$j];
709 print $out_fh "," if $j < $size - 1;
712 print $out_fh "," if $i < $size - 1;
716 print $out_fh "};\n";
# Builds the Grapheme Cluster Break pair table and hands it to
# output_table_common().  Takes no arguments.  The first index of @gcb_table
# is the class before the position being tested and the second is the class
# after it; a value of 1 means a break is allowed there and 0 means it is not.
# Classes are looked up by name in %gcb_enums, and the table size comes from
# @gcb_short_enums.
719 sub output_GCB_table() {
721 # Create and output the pair table for use in determining Grapheme Cluster
722 # Breaks, given in http://www.unicode.org/reports/tr29/.
724 # The table is constructed in reverse order of the rules, to make the
725 # lower-numbered, higher priority ones override the later ones, as the
726 # algorithm stops at the earliest matching rule
729 my $table_size = @gcb_short_enums;
731 # Otherwise, break everywhere.
733 for my $i (0 .. $table_size - 1) {
734 for my $j (0 .. $table_size - 1) {
735 $gcb_table[$i][$j] = 1;
739 # Do not break before extending characters.
740 # Do not break before SpacingMarks, or after Prepend characters.
744 for my $i (0 .. @gcb_table - 1) {
745 $gcb_table[$i][$gcb_enums{'Extend'}] = 0;
746 $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0;
747 $gcb_table[$gcb_enums{'Prepend'}][$i] = 0;
750 # Do not break between regional indicator symbols.
751 # GB8a Regional_Indicator × Regional_Indicator
752 $gcb_table[$gcb_enums{'Regional_Indicator'}]
753 [$gcb_enums{'Regional_Indicator'}] = 0;
755 # Do not break Hangul syllable sequences.
757 $gcb_table[$gcb_enums{'LVT'}][$gcb_enums{'T'}] = 0;
758 $gcb_table[$gcb_enums{'T'}][$gcb_enums{'T'}] = 0;
760 # GB7 ( LV | V ) × ( V | T )
761 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'V'}] = 0;
762 $gcb_table[$gcb_enums{'LV'}][$gcb_enums{'T'}] = 0;
763 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'V'}] = 0;
764 $gcb_table[$gcb_enums{'V'}][$gcb_enums{'T'}] = 0;
766 # GB6 L × ( L | V | LV | LVT )
767 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'L'}] = 0;
768 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'V'}] = 0;
769 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0;
770 $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0;
772 # Do not break between a CR and LF. Otherwise, break before and after
774 # GB5 ÷ ( Control | CR | LF )
775 # GB4 ( Control | CR | LF ) ÷
# These assignments run after the rules above, so they reset to 1 any 0 that an
# earlier rule put in a Control, CR or LF row or column.
776 for my $i (0 .. @gcb_table - 1) {
777 $gcb_table[$i][$gcb_enums{'Control'}] = 1;
778 $gcb_table[$i][$gcb_enums{'CR'}] = 1;
779 $gcb_table[$i][$gcb_enums{'LF'}] = 1;
780 $gcb_table[$gcb_enums{'Control'}][$i] = 1;
781 $gcb_table[$gcb_enums{'CR'}][$i] = 1;
782 $gcb_table[$gcb_enums{'LF'}][$i] = 1;
# The single exception: CR followed by LF is not broken.
786 $gcb_table[$gcb_enums{'CR'}][$gcb_enums{'LF'}] = 0;
788 # Break at the start and end of text.
791 for my $i (0 .. @gcb_table - 1) {
792 $gcb_table[$i][$gcb_enums{'EDGE'}] = 1;
793 $gcb_table[$gcb_enums{'EDGE'}][$i] = 1;
796 # But, unspecified by Unicode, we shouldn't break on an empty string.
797 $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0;
# No #define list is passed for GCB, hence the undef second argument.
799 output_table_common('GCB', undef,
800 \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations);
803 sub output_LB_table() {
805 # Create and output the enums, #defines, and pair table for use in
806 # determining Line Breaks. This uses the default line break algorithm,
807 # given in http://www.unicode.org/reports/tr14/, but tailored by example 7
808 # in that page, as the Unicode-furnished tests assume that tailoring.
810 # The result is really just true or false. But we follow along with tr14,
811 # creating a rule which is false for something like X SP* X. That gets
812 # encoding 2. The rest of the actions are synthetic ones that indicate
813 # some context handling is required. These each are added to the
814 # underlying 0, 1, or 2, instead of replacing them, so that the underlying
815 # value can be retrieved. Actually only rules from 7 through 18 (which
816 # are the ones where spaces matter) are possible to have 2 added to them.
817 # The others below add just 0 or 1. It might be possible for one
818 # synthetic rule to be added to another, yielding a larger value. This
819 # doesn't happen in the Unicode 8.0 rule set, and as you can see from the
820 # names of the middle grouping below, it is impossible for that to occur
821 # for them because they all start with mutually exclusive classes. That
822 # the final rule can't be added to any of the others isn't obvious from
823 # its name, so it is assigned a power of 2 higher than the others can get
824 # to so any addition would preserve all data. (And the code will reach an
825 # assert(0) on debugging builds should this happen.)
829 LB_NOBREAK_EVEN_WITH_SP_BETWEEN => 2,
831 LB_CM_foo => 3, # Rule 9
832 LB_SP_foo => 6, # Rule 18
833 LB_PR_or_PO_then_OP_or_HY => 9, # Rule 25
834 LB_SY_or_IS_then_various => 11, # Rule 25
835 LB_HY_or_BA_then_foo => 13, # Rule 21
837 LB_various_then_PO_or_PR => (1<<4), # Rule 25
840 # Construct the LB pair table. This is based on the rules in
841 # http://www.unicode.org/reports/tr14/, but modified as those rules are
842 # designed for someone taking a string of text and sequentially going
843 # through it to find the break opportunities, whereas, Perl requires
844 # determining if a given random spot is a break opportunity, without
845 # knowing all the entire string before it.
847 # The table is constructed in reverse order of the rules, to make the
848 # lower-numbered, higher priority ones override the later ones, as the
849 # algorithm stops at the earliest matching rule
852 my $table_size = @lb_short_enums;
854 # LB31. Break everywhere else
855 for my $i (0 .. $table_size - 1) {
856 for my $j (0 .. $table_size - 1) {
857 $lb_table[$i][$j] = $lb_actions{'LB_BREAKABLE'};
861 # LB30a. Don't break between Regional Indicators
862 $lb_table[$lb_enums{'Regional_Indicator'}]
863 [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_NOBREAK'};
865 # LB30 Do not break between letters, numbers, or ordinary symbols and
866 # opening or closing parentheses.
867 # (AL | HL | NU) × OP
868 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}]
869 = $lb_actions{'LB_NOBREAK'};
870 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}]
871 = $lb_actions{'LB_NOBREAK'};
872 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}]
873 = $lb_actions{'LB_NOBREAK'};
875 # CP × (AL | HL | NU)
876 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}]
877 = $lb_actions{'LB_NOBREAK'};
878 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}]
879 = $lb_actions{'LB_NOBREAK'};
880 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}]
881 = $lb_actions{'LB_NOBREAK'};
883 # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
885 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}]
886 = $lb_actions{'LB_NOBREAK'};
887 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
888 = $lb_actions{'LB_NOBREAK'};
890 # LB28 Do not break between alphabetics (“at”).
891 # (AL | HL) × (AL | HL)
892 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}]
893 = $lb_actions{'LB_NOBREAK'};
894 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}]
895 = $lb_actions{'LB_NOBREAK'};
896 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}]
897 = $lb_actions{'LB_NOBREAK'};
898 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}]
899 = $lb_actions{'LB_NOBREAK'};
901 # LB27 Treat a Korean Syllable Block the same as ID.
902 # (JL | JV | JT | H2 | H3) × IN
903 $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}]
904 = $lb_actions{'LB_NOBREAK'};
905 $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}]
906 = $lb_actions{'LB_NOBREAK'};
907 $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}]
908 = $lb_actions{'LB_NOBREAK'};
909 $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}]
910 = $lb_actions{'LB_NOBREAK'};
911 $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}]
912 = $lb_actions{'LB_NOBREAK'};
914 # (JL | JV | JT | H2 | H3) × PO
915 $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}]
916 = $lb_actions{'LB_NOBREAK'};
917 $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}]
918 = $lb_actions{'LB_NOBREAK'};
919 $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}]
920 = $lb_actions{'LB_NOBREAK'};
921 $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}]
922 = $lb_actions{'LB_NOBREAK'};
923 $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}]
924 = $lb_actions{'LB_NOBREAK'};
926 # PR × (JL | JV | JT | H2 | H3)
927 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}]
928 = $lb_actions{'LB_NOBREAK'};
929 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}]
930 = $lb_actions{'LB_NOBREAK'};
931 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}]
932 = $lb_actions{'LB_NOBREAK'};
933 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}]
934 = $lb_actions{'LB_NOBREAK'};
935 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}]
936 = $lb_actions{'LB_NOBREAK'};
938 # LB26 Do not break a Korean syllable.
939 # JL × (JL | JV | H2 | H3)
940 $lb_table[$lb_enums{'JL'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'};
941 $lb_table[$lb_enums{'JL'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
942 $lb_table[$lb_enums{'JL'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'};
943 $lb_table[$lb_enums{'JL'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'};
945 # (JV | H2) × (JV | JT)
946 $lb_table[$lb_enums{'JV'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
947 $lb_table[$lb_enums{'H2'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'};
948 $lb_table[$lb_enums{'JV'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
949 $lb_table[$lb_enums{'H2'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
952 $lb_table[$lb_enums{'JT'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
953 $lb_table[$lb_enums{'H3'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'};
955 # LB25 Do not break between the following pairs of classes relevant to
956 # numbers, as tailored by example 7 in
957 # http://www.unicode.org/reports/tr14/#Examples
958 # We follow that tailoring because Unicode's test cases expect it
959 # (PR | PO) × ( OP | HY )? NU
960 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}]
961 = $lb_actions{'LB_NOBREAK'};
962 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}]
963 = $lb_actions{'LB_NOBREAK'};
965 # Given that (OP | HY )? is optional, we have to test for it in code.
966 # We add in the action (instead of overriding) for this, so that in
967 # the code we can recover the underlying break value.
968 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}]
969 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
970 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}]
971 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
972 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}]
973 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
974 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}]
975 += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'};
978 $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}]
979 = $lb_actions{'LB_NOBREAK'};
980 $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}]
981 = $lb_actions{'LB_NOBREAK'};
983 # NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
984 # which can be rewritten as:
985 # NU (SY | IS)* × (NU | SY | IS | CL | CP )
986 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}]
987 = $lb_actions{'LB_NOBREAK'};
988 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}]
989 = $lb_actions{'LB_NOBREAK'};
990 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}]
991 = $lb_actions{'LB_NOBREAK'};
992 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}]
993 = $lb_actions{'LB_NOBREAK'};
994 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}]
995 = $lb_actions{'LB_NOBREAK'};
997 # Like earlier where we have to test in code, we add in the action so
998 # that we can recover the underlying values. This is done in rules
999 # below, as well. The code assumes that we haven't added 2 actions.
1000 # Should a later Unicode release break that assumption, then tests
1001 # should start failing.
1002 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}]
1003 += $lb_actions{'LB_SY_or_IS_then_various'};
1004 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}]
1005 += $lb_actions{'LB_SY_or_IS_then_various'};
1006 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}]
1007 += $lb_actions{'LB_SY_or_IS_then_various'};
1008 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}]
1009 += $lb_actions{'LB_SY_or_IS_then_various'};
1010 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}]
1011 += $lb_actions{'LB_SY_or_IS_then_various'};
1012 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}]
1013 += $lb_actions{'LB_SY_or_IS_then_various'};
1014 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}]
1015 += $lb_actions{'LB_SY_or_IS_then_various'};
1016 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}]
1017 += $lb_actions{'LB_SY_or_IS_then_various'};
1018 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}]
1019 += $lb_actions{'LB_SY_or_IS_then_various'};
1020 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}]
1021 += $lb_actions{'LB_SY_or_IS_then_various'};
1023 # NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
1024 # which can be rewritten as:
1025 # NU (SY | IS)* (CL | CP)? × (PO | PR)
1026 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}]
1027 = $lb_actions{'LB_NOBREAK'};
1028 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}]
1029 = $lb_actions{'LB_NOBREAK'};
1031 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}]
1032 += $lb_actions{'LB_various_then_PO_or_PR'};
1033 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}]
1034 += $lb_actions{'LB_various_then_PO_or_PR'};
1035 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}]
1036 += $lb_actions{'LB_various_then_PO_or_PR'};
1037 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}]
1038 += $lb_actions{'LB_various_then_PO_or_PR'};
1040 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}]
1041 += $lb_actions{'LB_various_then_PO_or_PR'};
1042 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}]
1043 += $lb_actions{'LB_various_then_PO_or_PR'};
1044 $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}]
1045 += $lb_actions{'LB_various_then_PO_or_PR'};
1046 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}]
1047 += $lb_actions{'LB_various_then_PO_or_PR'};
1049 # LB24 Do not break between prefix and letters or ideographs.
1051 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}]
1052 = $lb_actions{'LB_NOBREAK'};
1055 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}]
1056 = $lb_actions{'LB_NOBREAK'};
1057 $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1058 = $lb_actions{'LB_NOBREAK'};
1061 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}]
1062 = $lb_actions{'LB_NOBREAK'};
1063 $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}]
1064 = $lb_actions{'LB_NOBREAK'};
1066 # LB23 Do not break within ‘a9’, ‘3a’, or ‘H%’.
1068 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}]
1069 = $lb_actions{'LB_NOBREAK'};
1072 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}]
1073 = $lb_actions{'LB_NOBREAK'};
1074 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}]
1075 = $lb_actions{'LB_NOBREAK'};
1078 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}]
1079 = $lb_actions{'LB_NOBREAK'};
1080 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}]
1081 = $lb_actions{'LB_NOBREAK'};
1083 # LB22 Do not break between two ellipses, or between letters, numbers or
1084 # exclamations and ellipsis.
1086 $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}]
1087 = $lb_actions{'LB_NOBREAK'};
1088 $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}]
1089 = $lb_actions{'LB_NOBREAK'};
1092 $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}]
1093 = $lb_actions{'LB_NOBREAK'};
1096 $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}]
1097 = $lb_actions{'LB_NOBREAK'};
1100 $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}]
1101 = $lb_actions{'LB_NOBREAK'};
1104 $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}]
1105 = $lb_actions{'LB_NOBREAK'};
1107 # LB21b Don’t break between Solidus and Hebrew letters.
1109 $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}]
1110 = $lb_actions{'LB_NOBREAK'};
1112 # LB21a Don't break after Hebrew + Hyphen.
1114 for my $i (0 .. @lb_table - 1) {
1115 $lb_table[$lb_enums{'Hyphen'}][$i]
1116 += $lb_actions{'LB_HY_or_BA_then_foo'};
1117 $lb_table[$lb_enums{'Break_After'}][$i]
1118 += $lb_actions{'LB_HY_or_BA_then_foo'};
1121 # LB21 Do not break before hyphen-minus, other hyphens, fixed-width
1122 # spaces, small kana, and other non-starters, or after acute accents.
1127 for my $i (0 .. @lb_table - 1) {
1128 $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'};
1129 $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'};
1130 $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'};
1131 $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'};
1134 # LB20 Break before and after unresolved CB.
1137 # Conditional breaks should be resolved external to the line breaking
1138 # rules. However, the default action is to treat unresolved CB as breaking
1140 for my $i (0 .. @lb_table - 1) {
1141 $lb_table[$i][$lb_enums{'Contingent_Break'}]
1142 = $lb_actions{'LB_BREAKABLE'};
1143 $lb_table[$lb_enums{'Contingent_Break'}][$i]
1144 = $lb_actions{'LB_BREAKABLE'};
1147 # LB19 Do not break before or after quotation marks, such as ‘ ” ’.
1150 for my $i (0 .. @lb_table - 1) {
1151 $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'};
1152 $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'};
1155 # LB18 Break after spaces
1157 for my $i (0 .. @lb_table - 1) {
1158 $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'};
1161 # LB17 Do not break within ‘——’, even with intervening spaces.
1163 $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}]
1164 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1166 # LB16 Do not break between closing punctuation and a nonstarter even with
1167 # intervening spaces.
1168 # (CL | CP) SP* × NS
1169 $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}]
1170 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1171 $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}]
1172 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1175 # LB15 Do not break within ‘”[’, even with intervening spaces.
1177 $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}]
1178 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1180 # LB14 Do not break after ‘[’, even after spaces.
1182 for my $i (0 .. @lb_table - 1) {
1183 $lb_table[$lb_enums{'Open_Punctuation'}][$i]
1184 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1187 # LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
1188 # tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
1194 for my $i (0 .. @lb_table - 1) {
1195 $lb_table[$i][$lb_enums{'Exclamation'}]
1196 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1198 next if $i == $lb_enums{'Numeric'};
1200 $lb_table[$i][$lb_enums{'Close_Punctuation'}]
1201 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1202 $lb_table[$i][$lb_enums{'Close_Parenthesis'}]
1203 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1204 $lb_table[$i][$lb_enums{'Infix_Numeric'}]
1205 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1206 $lb_table[$i][$lb_enums{'Break_Symbols'}]
1207 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1210 # LB12a Do not break before NBSP and related characters, except after
1211 # spaces and hyphens.
1213 for my $i (0 .. @lb_table - 1) {
1214 next if $i == $lb_enums{'Space'}
1215 || $i == $lb_enums{'Break_After'}
1216 || $i == $lb_enums{'Hyphen'};
1218 # We don't break, but if a property above has said don't break even
1219 # with space between, don't override that (also in the next few rules)
1220 next if $lb_table[$i][$lb_enums{'Glue'}]
1221 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1222 $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'};
1225 # LB12 Do not break after NBSP and related characters.
1227 for my $i (0 .. @lb_table - 1) {
1228 next if $lb_table[$lb_enums{'Glue'}][$i]
1229 == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1230 $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'};
1233 # LB11 Do not break before or after Word joiner and related characters.
1236 for my $i (0 .. @lb_table - 1) {
1237 if ($lb_table[$i][$lb_enums{'Word_Joiner'}]
1238 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1240 $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'};
1242 if ($lb_table[$lb_enums{'Word_Joiner'}][$i]
1243 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1245 $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'};
1249 # Special case this here to avoid having to do a special case in the code.
1250 # By making this the same as other things with a SP in front of them that
1251 # don't break, we avoid an extra test.
1252 $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}]
1253 = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'};
1255 # LB9 and LB10 are done in the same loop
1257 # LB9 Do not break a combining character sequence; treat it as if it has
1258 # the line breaking class of the base character in all of the
1259 # higher-numbered rules.
1260 # Treat X CM* as if it were X.
1261 # where X is any line break class except BK, CR, LF, NL, SP, or ZW.
1263 # LB10 Treat any remaining combining mark as AL. This catches the case
1264 # where a CM is the first character on the line or follows SP, BK, CR, LF,
1266 for my $i (0 .. @lb_table - 1) {
1268 # When the CM is the first in the pair, we don't know without looking
1269 # behind whether the CM is going to inherit from an earlier character,
1270 # or not. So have to figure this out in the code
1271 $lb_table[$lb_enums{'Combining_Mark'}][$i] = $lb_actions{'LB_CM_foo'};
1273 if ( $i == $lb_enums{'Mandatory_Break'}
1274 || $i == $lb_enums{'EDGE'}
1275 || $i == $lb_enums{'Carriage_Return'}
1276 || $i == $lb_enums{'Line_Feed'}
1277 || $i == $lb_enums{'Next_Line'}
1278 || $i == $lb_enums{'Space'}
1279 || $i == $lb_enums{'ZWSpace'})
1281 # For these classes, a following CM doesn't combine, and should do
1282 # whatever 'Alphabetic' would do.
1283 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1284 = $lb_table[$i][$lb_enums{'Alphabetic'}];
1287 # For these classes, the CM combines, so doesn't break, inheriting
1288 # the type of nobreak from the master character.
1289 if ($lb_table[$i][$lb_enums{'Combining_Mark'}]
1290 != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'})
1292 $lb_table[$i][$lb_enums{'Combining_Mark'}]
1293 = $lb_actions{'LB_NOBREAK'};
1298 # LB8 Break before any character following a zero-width space, even if one
1299 # or more spaces intervene.
1301 for my $i (0 .. @lb_table - 1) {
1302 $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'};
1305 # Because of LB8-10, we need to look at context for "SP x", and this must
1306 # be done in the code. So override the existing rules for that, by adding
1307 # a constant to get new rules that tell the code it needs to look at
1308 # context. By adding this action instead of replacing the existing one,
1309 # we can get back to the original rule if necessary.
1310 for my $i (0 .. @lb_table - 1) {
1311 $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'};
1314 # LB7 Do not break before spaces or zero width space.
1317 for my $i (0 .. @lb_table - 1) {
1318 $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'};
1319 $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'};
1322 # LB6 Do not break before hard line breaks.
1323 # × ( BK | CR | LF | NL )
1324 for my $i (0 .. @lb_table - 1) {
1325 $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'};
1326 $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'};
1327 $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'};
1328 $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'};
1331 # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
1336 for my $i (0 .. @lb_table - 1) {
1337 $lb_table[$lb_enums{'Carriage_Return'}][$i]
1338 = $lb_actions{'LB_BREAKABLE'};
1339 $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'};
1340 $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'};
1342 $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}]
1343 = $lb_actions{'LB_NOBREAK'};
1345 # LB4 Always break after hard line breaks.
1347 for my $i (0 .. @lb_table - 1) {
1348 $lb_table[$lb_enums{'Mandatory_Break'}][$i]
1349 = $lb_actions{'LB_BREAKABLE'};
1352 # LB2 Never break at the start of text.
1354 # LB3 Always break at the end of text.
1356 # but these are reversed in the loop below, so that won't break if there
1358 for my $i (0 .. @lb_table - 1) {
1359 $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'};
1360 $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'};
1363 # LB1 Assign a line breaking class to each code point of the input.
1364 # Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
1365 # depending on criteria outside the scope of this algorithm.
1367 # In the absence of such criteria all characters with a specific
1368 # combination of original class and General_Category property value are
1369 # resolved as follows:
1370 # Original Resolved General_Category
1372 # SA CM Only Mn or Mc
1373 # SA AL Any except Mn and Mc
1376 # This is done in mktables, so we never see any of the remapped-from
1379 output_table_common('LB', \%lb_actions,
1380 \@lb_table, \@lb_short_enums, \%lb_abbreviations);
1383 sub output_WB_table() {
1385 # Create and output the enums, #defines, and pair table for use in
1386 # determining Word Breaks, given in http://www.unicode.org/reports/tr29/.
# Each cell is addressed as $wb_table[class before][class after], using the
# values in %wb_enums as indices, and is filled from %wb_actions.
1388 # This uses the same mechanism as the other bounds tables generated by
1389 # this file. The actions that could override a 0 or 1 are added to those
1390 # numbers; the actions that clearly don't depend on the underlying rule
# are instead stored with a plain '=', replacing whatever the cell held.
1396 WB_Ex_or_FO_then_foo => 3,
1399 WB_LE_or_HL_then_MB_or_ML_or_SQ => 8,
1400 WB_MB_or_ML_or_SQ_then_LE_or_HL => 10,
1401 WB_MB_or_MN_or_SQ_then_NU => 12,
1402 WB_NU_then_MB_or_MN_or_SQ => 14,
1405 # Construct the WB pair table.
1406 # The table is constructed in reverse order of the rules, to make the
1407 # lower-numbered, higher priority ones override the later ones, as the
1408 # algorithm stops at the earliest matching rule
1411 my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
1413 # Otherwise, break everywhere (including around ideographs).
# This is the lowest-priority rule, so it is what seeds every cell before the
# more specific rules below overwrite or add to it.
1415 for my $i (0 .. $table_size - 1) {
1416 for my $j (0 .. $table_size - 1) {
1417 $wb_table[$i][$j] = $wb_actions{'WB_BREAKABLE'};
1421 # Do not break between regional indicator symbols.
1422 # WB13c Regional_Indicator × Regional_Indicator
1423 $wb_table[$wb_enums{'Regional_Indicator'}]
1424 [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_NOBREAK'};
1426 # Do not break from extenders.
1427 # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
1428 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
1429 = $wb_actions{'WB_NOBREAK'};
1430 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
1431 = $wb_actions{'WB_NOBREAK'};
1432 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
1433 = $wb_actions{'WB_NOBREAK'};
1434 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}]
1435 = $wb_actions{'WB_NOBREAK'};
1437 # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
# × ExtendNumLet
1439 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
1440 = $wb_actions{'WB_NOBREAK'};
1441 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
1442 = $wb_actions{'WB_NOBREAK'};
1443 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
1444 = $wb_actions{'WB_NOBREAK'};
1445 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}]
1446 = $wb_actions{'WB_NOBREAK'};
1447 $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}]
1448 = $wb_actions{'WB_NOBREAK'};
1450 # Do not break between Katakana.
1451 # WB13 Katakana × Katakana
1452 $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}]
1453 = $wb_actions{'WB_NOBREAK'};
1455 # Do not break within sequences, such as “3.2” or “3,456.789”.
1456 # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
# The outcome depends on the class that follows the pair, which one cell
# cannot express, so an action is added ('+=') on top of the cell's current
# value instead of a final break/no-break answer being stored.
1457 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}]
1458 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1459 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}]
1460 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1461 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}]
1462 += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'};
1464 # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
# Likewise, this one depends on the class that precedes the pair.
1465 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}]
1466 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1467 $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}]
1468 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1469 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}]
1470 += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'};
1472 # Do not break within sequences of digits, or digits adjacent to letters
1474 # WB10 Numeric × (ALetter | Hebrew_Letter)
1475 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
1476 = $wb_actions{'WB_NOBREAK'};
1477 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
1478 = $wb_actions{'WB_NOBREAK'};
1480 # WB9 (ALetter | Hebrew_Letter) × Numeric
1481 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
1482 = $wb_actions{'WB_NOBREAK'};
1483 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
1484 = $wb_actions{'WB_NOBREAK'};
1486 # WB8 Numeric × Numeric
1487 $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}]
1488 = $wb_actions{'WB_NOBREAK'};
1490 # Do not break letters across certain punctuation.
1491 # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter
1492 $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}]
1493 += $wb_actions{'WB_DQ_then_HL'};
1495 # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter
1496 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}]
1497 += $wb_actions{'WB_HL_then_DQ'};
1499 # WB7a Hebrew_Letter × Single_Quote
1500 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1501 = $wb_actions{'WB_NOBREAK'};
1503 # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)
1504 # × (ALetter | Hebrew_Letter)
1505 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
1506 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1507 $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
1508 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1509 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
1510 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1511 $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
1512 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1513 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
1514 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1515 $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
1516 += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
1518 # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
1519 # | Single_Quote) (ALetter | Hebrew_Letter)
1520 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
1521 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1522 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
1523 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1524 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
1525 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1526 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
1527 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1528 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
1529 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
# This cell was set to WB_NOBREAK by WB7a above, so here the action is added
# to that value rather than to the default WB_BREAKABLE.
1530 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
1531 += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
1533 # Do not break between most letters.
1534 # WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
1535 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
1536 = $wb_actions{'WB_NOBREAK'};
1537 $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
1538 = $wb_actions{'WB_NOBREAK'};
1539 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
1540 = $wb_actions{'WB_NOBREAK'};
1541 $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
1542 = $wb_actions{'WB_NOBREAK'};
1544 # Ignore Format and Extend characters, except when they appear at the
1545 # beginning of a region of text.
1546 # WB4 X (Extend | Format)* → X
# Every cell in the Extend and Format rows gets the same action, replacing
# (not adding to) anything the lower-priority rules stored there.
1547 for my $i (0 .. @wb_table - 1) {
1548 $wb_table[$wb_enums{'Extend'}][$i]
1549 = $wb_actions{'WB_Ex_or_FO_then_foo'};
1550 $wb_table[$wb_enums{'Format'}][$i]
1551 = $wb_actions{'WB_Ex_or_FO_then_foo'};
1554 # Implied is that these attach to the character before them, except for
1555 # the characters that mark the end of a region of text. The rules below
1556 # override the ones set up here, for all the characters that need
1558 for my $i (0 .. @wb_table - 1) {
1559 $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'};
1560 $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
1563 # Break before and after white space
1564 # WB3b ÷ (Newline | CR | LF)
1565 # WB3a (Newline | CR | LF) ÷
# Perl_Tailored_HSpace is not named in the two rules quoted above; it is
# given the same treatment here.
1567 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1568 for my $j (0 .. @wb_table - 1) {
1569 $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'};
1570 $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'};
1574 # But do not break within white space.
1577 for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1578 for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') {
1579 $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'};
1583 # And do not break horizontal space followed by Extend or Format
# (the WB3a/WB3b loop above made every cell in the Perl_Tailored_HSpace row
# breakable, including these two, so they are reset here).
1584 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}]
1585 = $wb_actions{'WB_NOBREAK'};
1586 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}]
1587 = $wb_actions{'WB_NOBREAK'};
# The HSpace/HSpace cell was set to WB_NOBREAK by the "within white space"
# loop above; that is replaced here by the WB_hs_then_hs action.
# NOTE(review): what that action resolves to is decided by whatever code
# reads this table -- confirm there.
1588 $wb_table[$wb_enums{'Perl_Tailored_HSpace'}]
1589 [$wb_enums{'Perl_Tailored_HSpace'}]
1590 = $wb_actions{'WB_hs_then_hs'};
1592 # Break at the start and end of text.
1595 for my $i (0 .. @wb_table - 1) {
1596 $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'};
1597 $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'};
1600 # But, unspecified by Unicode, we shouldn't break on an empty string.
# NOTE(review): a literal 0 is stored here instead of a named %wb_actions
# value as everywhere else in this sub; presumably it equals WB_NOBREAK --
# confirm.
1601 $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0;
# Pass the completed pair table, together with the action values, the short
# enum names and the abbreviations, to output_table_common().
1603 output_table_common('WB', \%wb_actions,
1604 \@wb_table, \@wb_short_enums, \%wb_abbreviations);
1607 output_invlist("Latin1", [ 0, 256 ]);
1608 output_invlist("AboveLatin1", [ 256 ]);
1612 # We construct lists for all the POSIX and backslash sequence character
1613 # classes in two forms:
1614 # 1) ones which match only in the ASCII range
1615 # 2) ones which match either in the Latin1 range, or the entire Unicode range
1617 # These get compiled in, and hence affect the memory footprint of every Perl
1618 # program, even those not using Unicode. To minimize the size, currently
1619 # the Latin1 version is generated for the beyond ASCII range except for those
1620 # lists that are quite small for the entire range, such as for \s, which is 22
1621 # UVs long plus 4 UVs (currently) for the header.
1623 # To save even more memory, the ASCII versions could be derived from the
1624 # larger ones at runtime, saving some memory (minus the expense of the machine
1625 # instructions to do so), but these are all small anyway, so their total is
1628 # In the list of properties below that get generated, the L1 prefix is a fake
1629 # property that means just the Latin1 range of the full property (whose name
1630 # has an X prefix instead of L1).
1632 # An initial & means to use the subroutine from this file instead of an
1633 # official inversion list.
1635 for my $charset (get_supported_code_pages()) {
1636 print $out_fh "\n" . get_conditional_compile_line_start($charset);
1638 @a2n = @{get_a2n($charset)};
1640 # Ignore non-alpha in sort
1641 for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
1660 &NonL1_Perl_Non_Final_Folds
1661 _Perl_Folds_To_Multi_Char
1668 _Perl_WB,EDGE,UNKNOWN
1672 # For the Latin1 properties, we change to use the eXtended version of the
1673 # base property, then go through the result and get rid of everything not
1674 # in Latin1 (above 255). Actually, we retain the element for the range
1675 # that crosses the 255/256 boundary if it is one that matches the
1676 # property. For example, in the Word property, there is a range of code
1677 # points that starts at U+00F8 and goes through U+02C1. Instead of
1678 # artificially cutting that off at 256 because 256 is the first code point
1679 # above Latin1, we let the range go to its natural ending. That gives us
1680 # extra information with no added space taken. But if the range that
1681 # crosses the boundary is one that doesn't match the property, we don't
1682 # start a new range above 255, as that could be construed as going to
1683 # infinity. For example, the Upper property doesn't include the character
1684 # at 255, but does include the one at 256. We don't include the 256 one.
1685 my $prop_name = $prop;
1686 my $is_local_sub = $prop_name =~ s/^&//;
1687 my $extra_enums = "";
1688 $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
1689 my $lookup_prop = $prop_name;
1690 my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
1691 or $lookup_prop =~ s/^L1//);
1693 $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
1694 ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
1700 my $maps_to_code_point;
1702 if ($is_local_sub) {
1703 @invlist = eval $lookup_prop;
1707 @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
1710 # If couldn't find a non-empty inversion list, see if it is
1711 # instead an inversion map
1712 my ($list_ref, $map_ref, $format, $default)
1713 = prop_invmap($lookup_prop, '_perl_core_internal_ok');
1715 # An empty return here could mean an unknown property, or
1716 # merely that the original inversion list is empty. Call
1717 # in scalar context to differentiate
1718 my $count = prop_invlist($lookup_prop,
1719 '_perl_core_internal_ok');
1720 die "Could not find inversion list for '$lookup_prop'"
1721 unless defined $count;
1724 @invlist = @$list_ref;
1725 @invmap = @$map_ref;
1726 $map_format = $format;
1727 $map_default = $default;
1728 $maps_to_code_point = $map_format =~ /x/;
1729 $to_adjust = $map_format =~ /a/;
1735 # Short-circuit an empty inversion list.
1737 output_invlist($prop_name, \@invlist, $charset);
1741 # Re-order the Unicode code points to native ones for this platform.
1742 # This is only needed for code points below 256, because native code
1743 # points are only in that range. For inversion maps of properties
1744 # where the mappings are adjusted (format =~ /a/), this reordering
1745 # could mess up the adjustment pattern that was in the input, so that
1746 # has to be dealt with.
1748 # And inversion maps that map to code points need to eventually have
1749 # all those code points remapped to native, and it's better to do that
1750 # here, going through the whole list not just those below 256. This
1751 # is because some inversion maps have adjustments (format =~ /a/)
1752 # which may be affected by the reordering. This code needs to be done
1753 # both for when we are translating the inversion lists for < 256, and
1754 # for the inversion maps for everything. By doing both in this loop,
1755 # we can share that code.
1757 # So, we go through everything for an inversion map to code points;
1758 # otherwise, we can skip any remapping at all if we are going to
1759 # output only the above-Latin1 values, or if the range spans the whole
1760 # of 0..256, as the remap will also include all of 0..256 (256 not
1761 # 255 because a re-ordering could cause 256 to need to be in the same
1763 if ((@invmap && $maps_to_code_point)
1764 || (! $nonl1_only || ($invlist[0] < 256
1765 && ! ($invlist[0] == 0 && $invlist[1] > 256))))
1768 if (! @invmap) { # Straight inversion list
1769 # Look at all the ranges that start before 257.
1772 last if $invlist[0] > 256;
1773 my $upper = @invlist > 1
1774 ? $invlist[1] - 1 # In range
1776 # To infinity. You may want to stop much much
1777 # earlier; going this high may expose perl
1778 # deficiencies with very large numbers.
1779 : $Unicode::UCD::MAX_CP;
1780 for my $j ($invlist[0] .. $upper) {
1781 push @latin1_list, a2n($j);
1784 shift @invlist; # Shift off the range that's in the list
1785 shift @invlist; # Shift off the range not in the list
1788 # Here @invlist contains all the ranges in the original that start
1789 # at code points above 256, and @latin1_list contains all the
1790 # native code points for ranges that start with a Unicode code
1791 # point below 257. We sort the latter and convert it to inversion
1792 # list format. Then simply prepend it to the list of the higher
1794 @latin1_list = sort { $a <=> $b } @latin1_list;
1795 @latin1_list = mk_invlist_from_sorted_cp_list(\@latin1_list);
1796 unshift @invlist, @latin1_list;
1798 else { # Is an inversion map
1800 # This is a similar procedure as plain inversion list, but has
1801 # multiple buckets. A plain inversion list just has two
1802 # buckets, 1) 'in' the list; and 2) 'not' in the list, and we
1803 # pretty much can ignore the 2nd bucket, as it is completely
1804 # defined by the 1st. But here, what we do is create buckets
1805 # which contain the code points that map to each, translated
1806 # to native and turned into an inversion list. Thus each
1807 # bucket is an inversion list of native code points that map
1808 # to it or don't map to it. We use these to create an
1809 # inversion map for the whole property.
1811 # As mentioned earlier, we use this procedure to not just
1812 # remap the inversion list to native values, but also the maps
1813 # of code points to native ones. In the latter case we have
1814 # to look at the whole of the inversion map (or at least to
1815 # above Unicode; as the maps of code points above that should
1816 # all be to the default).
1817 my $upper_limit = ($maps_to_code_point) ? 0x10FFFF : 256;
1819 my %mapped_lists; # A hash whose keys are the buckets.
1821 last if $invlist[0] > $upper_limit;
1823 # This shouldn't actually happen, as prop_invmap() returns
1824 # an extra element at the end that is beyond $upper_limit
1825 die "inversion map that extends to infinity is unimplemented" unless @invlist > 1;
1829 # A hash key can't be a ref (we are only expecting arrays
1830 # of scalars here), so convert any such to a string that
1831 # will be converted back later (using a vertical tab as
1832 # the separator). Even if the mapping is to code points,
1833 # we don't translate to native here because the code
1834 # output_map() calls to output these arrays assumes the
1835 # input is Unicode, not native.
1836 if (ref $invmap[0]) {
1837 $bucket = join "\cK", @{$invmap[0]};
1839 elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
1841 # Do convert to native for maps to single code points.
1842 # There are some properties that have a few outlier
1843 # maps that aren't code points, so the above test
1845 $bucket = a2n($invmap[0]);
1847 $bucket = $invmap[0];
1850 # We now have the bucket that all code points in the range
1851 # map to, though possibly they need to be adjusted. Go
1852 # through the range and put each translated code point in
1853 # it into its bucket.
1854 my $base_map = $invmap[0];
1855 for my $j ($invlist[0] .. $invlist[1] - 1) {
1857 # The 1st code point doesn't need adjusting
1860 # Skip any non-numeric maps: these are outliers
1861 # that aren't code points.
1862 && $base_map =~ $numeric_re
1864 # 'ne' because the default can be a string
1865 && $base_map ne $map_default)
1867 # We adjust by incrementing both the bucket and
1868 # the map. For code point maps, translate to
1871 $bucket = ($maps_to_code_point)
1876 # Add the native code point to the bucket for the
1878 push @{$mapped_lists{$bucket}}, a2n($j);
1879 } # End of loop through all code points in the range
1881 # Get ready for the next range
1884 } # End of loop through all ranges in the map.
1886 # Here, @invlist and @invmap retain all the ranges from the
1887 # originals that start with code points above $upper_limit.
1888 # Each bucket in %mapped_lists contains all the code points
1889 # that map to that bucket. If the bucket is for a map to a
1890 # single code point, the bucket has been converted to native.
1891 # If it is for something else (including a map to multiple
1892 # code points), no conversion is done.
1894 # Now we recreate the inversion map into %xlated, but this
1895 # time for the native character set.
1897 foreach my $bucket (keys %mapped_lists) {
1899 # Sort and convert this bucket to an inversion list. The
1900 # result will be that ranges that start with even-numbered
1901 # indexes will be for code points that map to this bucket;
1902 # odd ones map to some other bucket, and are discarded
1904 @{$mapped_lists{$bucket}}
1905 = sort{ $a <=> $b} @{$mapped_lists{$bucket}};
1906 @{$mapped_lists{$bucket}}
1907 = mk_invlist_from_sorted_cp_list(\@{$mapped_lists{$bucket}});
1909 # Add each even-numbered range in the bucket to %xlated;
1910 # so that the keys of %xlated become the range start code
1911 # points, and the values are their corresponding maps.
1912 while (@{$mapped_lists{$bucket}}) {
1913 my $range_start = $mapped_lists{$bucket}->[0];
1914 if ($bucket =~ /\cK/) {
1915 @{$xlated{$range_start}} = split /\cK/, $bucket;
1918 $xlated{$range_start} = $bucket;
1920 shift @{$mapped_lists{$bucket}}; # Discard odd ranges
1921 shift @{$mapped_lists{$bucket}}; # Get ready for next
1924 } # End of loop through all the buckets.
1926 # Here %xlated's keys are the range starts of all the code
1927 # points in the inversion map. Construct an inversion list
1929 my @new_invlist = sort { $a <=> $b } keys %xlated;
1931 # If the list is adjusted, we want to munge this list so that
1932 # we only have one entry for where consecutive code points map
1933 # to consecutive values. We just skip the subsequent entries
1934 # where this is the case.
1937 for my $i (0 .. @new_invlist - 1) {
1939 && $new_invlist[$i-1] + 1 == $new_invlist[$i]
1940 && $xlated{$new_invlist[$i-1]} =~ $numeric_re
1941 && $xlated{$new_invlist[$i]} =~ $numeric_re
1942 && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
1943 push @temp, $new_invlist[$i];
1945 @new_invlist = @temp;
1948 # The inversion map comes from %xlated's values. We can
1949 # unshift each onto the front of the untouched portion, in
1950 # reverse order of the portion we did process.
1951 foreach my $start (reverse @new_invlist) {
1952 unshift @invmap, $xlated{$start};
1955 # Finally prepend the inversion list we have just constructed to the
1956 # one that contains anything we didn't process.
1957 unshift @invlist, @new_invlist;
1961 # prop_invmap() returns an extra final entry, which we can now
1969 die "Unimplemented to do a Latin-1 only inversion map" if @invmap;
1970 for my $i (0 .. @invlist - 1 - 1) {
1971 if ($invlist[$i] > 255) {
1973 # In an inversion list, even-numbered elements give the code
1974 # points that begin ranges that match the property;
1975 # odd-numbered give ones that begin ranges that don't match.
1976 # If $i is odd, we are at the first code point above 255 that
1977 # doesn't match, which means the range it is ending does
1978 # match, and crosses the 255/256 boundary. We want to include
1979 # this ending point, so increment $i, so the splice below
1980 # includes it. Conversely, if $i is even, it is the first
1981 # code point above 255 that matches, which means there was no
1982 # matching range that crossed the boundary, and we don't want
1983 # to include this code point, so splice before it.
1984 $i++ if $i % 2 != 0;
1986 # Remove everything past this.
1987 splice @invlist, $i;
1988 splice @invmap, $i if @invmap;
1993 elsif ($nonl1_only) {
1994 my $found_nonl1 = 0;
1995 for my $i (0 .. @invlist - 1 - 1) {
1996 next if $invlist[$i] < 256;
1998 # Here, we have the first element in the array that indicates an
1999 # element above Latin1. Get rid of all previous ones.
2000 splice @invlist, 0, $i;
2001 splice @invmap, 0, $i if @invmap;
2003 # If this one's index is not divisible by 2, it means that this
2004 # element is inverting away from being in the list, which means
2005 # all code points from 256 to this one are in this list (or
2006 # map to the default for inversion maps)
2008 unshift @invlist, 256;
2009 unshift @invmap, $map_default if @invmap;
2014 die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
2017 output_invlist($prop_name, \@invlist, $charset);
2018 output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
2021 print $out_fh "\n" . get_conditional_compile_line_end();
# NOTE(review): presumably selects the preprocessor guard
# (PERL_IN_REGEXEC_C) under which the output that follows is emitted --
# confirm against the definition of switch_pound_if().
2024 switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
# Path of a listing file; each usable line read from it below is appended
# to @sources with a "lib/unicore/" prefix.
2032 my $sources_list = "lib/unicore/mktables.lst";
# @sources starts with this script itself ($0) followed by a fixed list of
# files, and is extended below.
2033 my @sources = ($0, qw(lib/unicore/mktables
2035 regen/charset_translations.pl
2038 # Depend on mktables’ own sources. It’s a shorter list of files than
2039 # those that Unicode::UCD uses.
# NOTE(review): this is a two-argument open, so the mode defaults to read.
# It is safe here only because $sources_list is a fixed literal assigned
# above; a three-argument open with an explicit '<' would be more robust.
2040 if (! open my $mktables_list, $sources_list) {
2042 # This should force a rebuild once $sources_list exists
# When the open fails, the listing file itself is recorded as a source.
2043 push @sources, $sources_list;
2046 while(<$mktables_list>) {
# Keep only lines whose first character is not '#'.
# NOTE(review): assumes the line terminator has already been removed from
# $_ at this point -- confirm, since otherwise it ends up in the path.
2049 push @sources, "lib/unicore/$_" if /^[^#]/;
# Hands the output handle and the collected source list to the finishing
# helper. NOTE(review): presumably provided by regen/regen_lib.pl, which is
# required at the top of the file -- verify.
2054 read_only_bottom_close_and_rename($out_fh, \@sources);