mktables: Correct L<> for perluniprops; rmv trail space

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 57c2e43..a66a913 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -937,9 +937,9 @@ my %why_obsolete;    # Documentation only
      my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
  
      %why_deprecated = (
-        'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
+        'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
          'Jamo_Short_Name' => $contributory,
-        'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
+        'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
          'Other_Alphabetic' => $contributory,
          'Other_Default_Ignorable_Code_Point' => $contributory,
          'Other_Grapheme_Extend' => $contributory,
@@ -1436,10 +1436,10 @@ my @missing_early_files;   # Generated list of absent files that we need to
  my @files_actually_output; # List of files we generated.
  my @more_Names;            # Some code point names are compound; this is used
                             # to store the extra components of them.
-my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
-                           # the minimum before we consider it equivalent to a
-                           # candidate rational
-my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
+my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
+                           # point of a normalized floating point number
+                           # needed to match before we consider it equivalent
+                           # to a candidate rational
  
  # These store references to certain commonly used property objects
  my $age;
@@ -3554,7 +3554,7 @@ sub trace { return main::trace(@_); }
      main::set_access('end', \%end, 'r', 's');
  
      my %value;
-    main::set_access('value', \%value, 'r');
+    main::set_access('value', \%value, 'r', 's');
  
      my %type;
      main::set_access('type', \%type, 'r');
@@ -12955,6 +12955,15 @@ sub register_fraction($) {
      my $rational = shift;
  
      my $float = eval $rational;
+    $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
+    if (   defined $nv_floating_to_rational{$float}
+        && $nv_floating_to_rational{$float} ne $rational)
+    {
+        die Carp::my_carp_bug("Both '$rational' and"
+                            . " '$nv_floating_to_rational{$float}' evaluate to"
+                            . " the same floating point number."
+                            . "  \$E_FLOAT_PRECISION must be increased");
+    }
      $nv_floating_to_rational{$float} = $rational;
      return;
  }
@@ -13418,8 +13427,8 @@ sub setup_script_extensions {
      # property.
  
      $scx = property_ref("Script_Extensions");
-    $scx = Property->new("scx", Full_Name => "Script_Extensions")
-                                                            if ! defined $scx;
+    return unless defined $scx;
+
      $scx->_set_format($STRING_WHITE_SPACE_LIST);
      $scx->initialize($script);
      $scx->set_default_map($script->default_map);
@@ -13655,7 +13664,7 @@ sub filter_all_caps_script_names {
  
      my ($range, $script, @remainder)
          = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
-    my @words = split "_", $script;
+    my @words = split /[_-]/, $script;
      for my $word (@words) {
          $word =
              ucfirst(lc($word)) if $word ne 'CJK';
@@ -14469,13 +14478,6 @@ sub compile_perl() {
  
      calculate_Assigned();
  
-    # Our internal-only property should be treated as more than just a
-    # synonym; grandfather it in to the pod.
-    $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
-                            Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
-            ->set_equivalent_to(property_ref('ccc')->table('Above'),
-                                                                Related => 1);
-
      my $ASCII = $perl->add_match_table('ASCII');
      if (defined $block) {   # This is equivalent to the block if have it.
          my $Unicode_ASCII = $block->table('Basic_Latin');
@@ -14617,56 +14619,6 @@ sub compile_perl() {
          $Lower->set_caseless_equivalent($cased);
      }
  
-    # Similarly, set up our own Case_Ignorable property if this Unicode
-    # version doesn't have it.  From Unicode 5.1: Definition D121: A character
-    # C is defined to be case-ignorable if C has the value MidLetter or the
-    # value MidNumLet for the Word_Break property or its General_Category is
-    # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
-    # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
-
-    # Perl has long had an internal-only alias for this property; grandfather
-    # it in to the pod, but discourage its use.
-    my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
-                                                     Re_Pod_Entry => 1,
-                                                     Fate => $INTERNAL_ONLY,
-                                                     Status => $DISCOURAGED);
-    my $case_ignorable = property_ref('Case_Ignorable');
-    if (defined $case_ignorable && ! $case_ignorable->is_empty) {
-        $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
-                                                                Related => 1);
-    }
-    else {
-
-        $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
-
-        # The following three properties are not in early releases
-        $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
-        $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
-        $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
-
-        # For versions 4.1 - 5.0, there is no MidNumLet property, and
-        # correspondingly the case-ignorable definition lacks that one.  For
-        # 4.0, it appears that it was meant to be the same definition, but was
-        # inadvertently omitted from the standard's text, so add it if the
-        # property actually is there
-        my $wb = property_ref('Word_Break');
-        if (defined $wb) {
-            my $midlet = $wb->table('MidLetter');
-            $perl_case_ignorable += $midlet if defined $midlet;
-            my $midnumlet = $wb->table('MidNumLet');
-            $perl_case_ignorable += $midnumlet if defined $midnumlet;
-        }
-        else {
-
-            # In earlier versions of the standard, instead of the above two
-            # properties , just the following characters were used:
-            $perl_case_ignorable +=
-                            ord("'")
-                        +   utf8::unicode_to_native(0xAD)  # SOFT HYPHEN (SHY)
-                        +   0x2019; # RIGHT SINGLE QUOTATION MARK
-        }
-    }
-
      # The remaining perl defined tables are mostly based on Unicode TR 18,
      # "Annex C: Compatibility Properties".  All of these have two versions,
      # one whose name generally begins with Posix that is posix-compliant, and
@@ -15040,33 +14992,6 @@ sub compile_perl() {
          Note => 'Union of all non-canonical decompositions',
          );
  
-    # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
-    # than SD appeared, construct it ourselves, based on the first release SD
-    # was in.  A pod entry is grandfathered in for it
-    my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
-                                           Perl_Extension => 1,
-                                           Fate => $INTERNAL_ONLY,
-                                           Status => $DISCOURAGED);
-    my $soft_dotted = property_ref('Soft_Dotted');
-    if (defined $soft_dotted && ! $soft_dotted->is_empty) {
-        $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
-    }
-    else {
-
-        # This list came from 3.2 Soft_Dotted; all of these code points are in
-        # all releases
-        $CanonDCIJ->initialize([ ord('i'),
-                                 ord('j'),
-                                 0x012F,
-                                 0x0268,
-                                 0x0456,
-                                 0x0458,
-                                 0x1E2D,
-                                 0x1ECB,
-                               ]);
-        $CanonDCIJ = $CanonDCIJ & $Assigned;
-    }
-
      # For backward compatibility, Perl has its own definition for IDStart.
      # It is regular XID_Start plus the underscore, but all characters must be
      # Word characters as well
@@ -15386,6 +15311,9 @@ END
                                                      . $current_age->name
                                                      . ' or earlier',
                                      );
+            foreach my $alias ($current_age->aliases) {
+                $current_in->add_alias($alias->name);
+            }
              $previous_in = $current_in;
  
              # Add clarifying material for the corresponding age file.  This is
@@ -15607,6 +15535,71 @@ END
          }
      }
  
+    # This property is a modification of the scx property
+    my $perl_scx = Property->new('_Perl_SCX',
+                                 Fate => $INTERNAL_ONLY,
+                                 Perl_Extension => 1,
+                                 Directory => $map_directory,
+                                 Type => $ENUM);
+    my $source;
+
+    # Use scx if available; otherwise sc;  if neither is there (a very old
+    # Unicode version, just say that everything is 'Common'
+    if (defined $scx) {
+        $source = $scx;
+        $perl_scx->set_default_map('Unknown');
+    }
+    elsif (defined $script) {
+        $source = $script;
+
+        # Early versions of 'sc', had everything be 'Common'
+        if (defined $script->table('Unknown')) {
+            $perl_scx->set_default_map('Unknown');
+        }
+        else {
+            $perl_scx->set_default_map('Common');
+        }
+    } else {
+        $perl_scx->add_match_table('Common');
+        $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
+
+        $perl_scx->add_match_table('Unknown');
+        $perl_scx->set_default_map('Unknown');
+    }
+
+    $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
+    $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
+
+    if (defined $source) {
+        $perl_scx->initialize($source);
+
+        # UTS 39 says that the scx property should be modified for these
+        # countries where certain mixed scripts are commonly used.
+        for my $range ($perl_scx->ranges) {
+            my $value = $range->value;
+            my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
+             $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
+             $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
+             $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
+                                     {$1 Katakana Hiragana Jpan}xi;
+             $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
+             $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
+
+            if ($changed) {
+                $value = join " ", uniques split " ", $value;
+                $range->set_value($value)
+            }
+        }
+
+        foreach my $table ($source->tables) {
+            my $scx_table = $perl_scx->add_match_table($table->name,
+                                    Full_Name => $table->full_name);
+            foreach my $alias ($table->aliases) {
+                $scx_table->add_alias($alias->name);
+            }
+        }
+    }
+
      # Here done with all the basic stuff.  Ready to populate the information
      # about each character if annotating them.
      if ($annotate) {
@@ -17197,7 +17190,7 @@ Perl can provide access to all non-provisional Unicode character properties,
  though not all are enabled by default.  The omitted ones are the Unihan
  properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
  deprecated or Unicode-internal properties.  (An installation may choose to
-recompile Perl's tables to change this.  See L<Unicode character
+recompile Perl's tables to change this.  See L</Unicode character
  properties that are NOT accepted by Perl>.)
  
  For most purposes, access to Unicode properties from the Perl core is through
@@ -17672,10 +17665,10 @@ $loose_to_file_of
  $nv_floating_to_rational
  );
  
-# If a floating point number doesn't have enough digits in it to get this
-# close to a fraction, it isn't considered to be that fraction even if all the
-# digits it does have match.
-\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
+# If a %e floating point number doesn't have this number of digits in it after
+# the decimal point to get this close to a fraction, it isn't considered to be
+# that fraction even if all the digits it does have match.
+\$utf8::e_precision = $E_FLOAT_PRECISION;
  
  # Deprecated tables to generate a warning for.  The key is the file containing
  # the table, so as to avoid duplication, as many property names can map to the
@@ -18998,21 +18991,12 @@ sub make_property_test_script() {
  
      $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
  
-    # Keep going down an order of magnitude
-    # until find that adding this quantity to
-    # 1 remains 1; but put an upper limit on
-    # this so in case this algorithm doesn't
-    # work properly on some platform, that we
-    # won't loop forever.
-    my $digits = 0;
-    my $min_floating_slop = 1;
-    while (1+ $min_floating_slop != 1
-            && $digits++ < 50)
-    {
-        my $next = $min_floating_slop / 10;
-        last if $next == 0; # If underflows,
-                            # use previous one
-        $min_floating_slop = $next;
+    # Create a list of what the %f representation is for each rational number.
+    # This will be used below.
+    my @valid_base_floats = '0.0';
+    foreach my $e_representation (keys %nv_floating_to_rational) {
+        push @valid_base_floats,
+                            eval $nv_floating_to_rational{$e_representation};
      }
  
      # It doesn't matter whether the elements of this array contain single lines
@@ -19041,6 +19025,24 @@ EOF_CODE
                                   lc $a->name cmp lc $b->name
                                 } property_ref('*'))
      {
+        # Non-binary properties should not match \p{};  Test all for that.
+        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
+            my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
+                                                            $property->aliases;
+            foreach my $property_alias ($property->aliases) {
+                my $name = standardize($property_alias->name);
+
+                # But some names are ambiguous, meaning a binary property with
+                # the same name when used in \p{}, and a different
+                # (non-binary) property in other contexts.
+                next if grep { $name eq $_ } keys %ambiguous_names;
+
+                push @output, <<"EOF_CODE";
+Error('\\p{$name}');
+Error('\\P{$name}');
+EOF_CODE
+            }
+        }
          foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
                                      or
                                    lc $a->name cmp lc $b->name
@@ -19138,77 +19140,111 @@ EOF_CODE
                                                   $warning,
                                               );
  
-                    # If the name is a rational number, add tests for the
-                    # floating point equivalent.
-                    if ($table_name =~ qr{/}) {
+                    if ($property->name eq 'nv') {
+                        if ($table_name !~ qr{/}) {
+                            push @output, generate_tests($property_name,
+                                                sprintf("%.15e", $table_name),
+                                                $valid,
+                                                $invalid,
+                                                $warning,
+                                            );
+                    }
+                    else {
+                    # If the name is a rational number, add tests for a
+                    # non-reduced form, and for a floating point equivalent.
+
+                        # 60 is a number divisible by a bunch of things
+                        my ($numerator, $denominator) = $table_name
+                                                        =~ m! (.+) / (.+) !x;
+                        $numerator *= 60;
+                        $denominator *= 60;
+                        push @output, generate_tests($property_name,
+                                                    "$numerator/$denominator",
+                                                    $valid,
+                                                    $invalid,
+                                                    $warning,
+                                    );
  
-                        # Calculate the float, and find just the fraction.
+                        # Calculate the float, and the %e representation
                          my $float = eval $table_name;
-                        my ($whole, $fraction)
-                                            = $float =~ / (.*) \. (.*) /x;
-
-                        # Starting with one digit after the decimal point,
-                        # create a test for each possible precision (number of
-                        # digits past the decimal point) until well beyond the
-                        # native number found on this machine.  (If we started
-                        # with 0 digits, it would be an integer, which could
-                        # well match an unrelated table)
-                        PLACE:
-                        for my $i (1 .. $min_floating_slop + 3) {
-                            my $table_name = sprintf("%.*f", $i, $float);
-                            if ($i < $MIN_FRACTION_LENGTH) {
-
-                                # If the test case has fewer digits than the
-                                # minimum acceptable precision, it shouldn't
-                                # succeed, so we expect an error for it.
-                                # E.g., 2/3 = .7 at one decimal point, and we
-                                # shouldn't say it matches .7.  We should make
-                                # it be .667 at least before agreeing that the
-                                # intent was to match 2/3.  But at the
-                                # less-than- acceptable level of precision, it
-                                # might actually match an unrelated number.
-                                # So don't generate a test case if this
-                                # conflating is possible.  In our example, we
-                                # don't want 2/3 matching 7/10, if there is
-                                # a 7/10 code point.
-
-                                # First, integers are not in the rationals
-                                # table.  Don't generate an error if this
-                                # rounds to an integer using the given
-                                # precision.
-                                my $round = sprintf "%.0f", $table_name;
-                                next PLACE if abs($table_name - $round)
-                                                        < $MAX_FLOATING_SLOP;
-
-                                # Here, isn't close enough to an integer to be
-                                # confusable with one.  Now, see it it's
-                                # "close" to a known rational
-                                for my $existing
-                                        (keys %nv_floating_to_rational)
+                        my $e_representation = sprintf("%.*e",
+                                                $E_FLOAT_PRECISION, $float);
+                        # Parse that
+                        my ($non_zeros, $zeros, $exponent_sign, $exponent)
+                           = $e_representation
+                               =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
+                        my $min_e_precision;
+                        my $min_f_precision;
+
+                        if ($exponent_sign eq '+' && $exponent != 0) {
+                            Carp::my_carp_bug("Not yet equipped to handle"
+                                            . " positive exponents");
+                            return;
+                        }
+                        else {
+                            # We're trying to find the minimum precision that
+                            # is needed to indicate this particular rational
+                            # for the given $E_FLOAT_PRECISION.  For %e, any
+                            # trailing zeros, like 1.500e-02 aren't needed, so
+                            # the correct value is how many non-trailing zeros
+                            # there are after the decimal point.
+                            $min_e_precision = length $non_zeros;
+
+                            # For %f, like .01500, we want at least
+                            # $E_FLOAT_PRECISION digits, but any trailing
+                            # zeros aren't needed, so we can subtract the
+                            # length of those.  But we also need to include
+                            # the zeros after the decimal point, but before
+                            # the first significant digit.
+                            $min_f_precision = $E_FLOAT_PRECISION
+                                             + $exponent
+                                             - length $zeros;
+                        }
+
+                        # Make tests for each possible precision from 1 to
+                        # just past the worst case.
+                        my $upper_limit = ($min_e_precision > $min_f_precision)
+                                           ? $min_e_precision
+                                           : $min_f_precision;
+
+                        for my $i (1 .. $upper_limit + 1) {
+                            for my $format ("e", "f") {
+                                my $this_table
+                                          = sprintf("%.*$format", $i, $float);
+
+                                # If we don't have enough precision digits,
+                                # make a fail test; otherwise a pass test.
+                                my $pass = ($format eq "e")
+                                            ? $i >= $min_e_precision
+                                            : $i >= $min_f_precision;
+                                if ($pass) {
+                                    push @output, generate_tests($property_name,
+                                                                $this_table,
+                                                                $valid,
+                                                                $invalid,
+                                                                $warning,
+                                                );
+                                }
+                                elsif (   $format eq "e"
+
+                                          # Here we would fail, but in the %f
+                                          # case, the representation at this
+                                          # precision could actually be a
+                                          # valid one for some other rational
+                                       || ! grep { $_ eq $this_table }
+                                                            @valid_base_floats)
                                  {
-                                    next PLACE
-                                        if abs($table_name - $existing)
-                                                < $MAX_FLOATING_SLOP;
+                                    push @output,
+                                        generate_error($property_name,
+                                                       $this_table,
+                                                       1   # 1 => already an
+                                                           # error
+                                                );
                                  }
-                                push @output, generate_error($property_name,
-                                                             $table_name,
-                                                             1   # 1 => already an error
-                                              );
-                            }
-                            else {
-
-                                # Here the number of digits exceeds the
-                                # minimum we think is needed.  So generate a
-                                # success test case for it.
-                                push @output, generate_tests($property_name,
-                                                             $table_name,
-                                                             $valid,
-                                                             $invalid,
-                                                             $warning,
-                                             );
                              }
                          }
                      }
+                    }
                  }
              }
              $table->DESTROY();
@@ -19832,8 +19868,15 @@ my @input_file_objects = (
                      Skip => 'Maps certain Unicode code points to their '
                            . 'legacy Japanese cell-phone values',
                     ),
+    # This file is actually not usable as-is until 6.1.0, because the property
+    # is provisional, so its name is missing from PropertyAliases.txt until
+    # that release, so that further work would have to be done to get it to
+    # work properly
      Input_file->new('ScriptExtensions.txt', v6.0.0,
                      Property => 'Script_Extensions',
+                    Early => [ sub {} ], # Doesn't do anything but ensures
+                                         # that this isn't skipped for early
+                                         # versions
                      Pre_Handler => \&setup_script_extensions,
                      Each_Line_Handler => \&filter_script_extensions_line,
                      Has_Missings_Defaults => (($v_version le v6.0.0)
@@ -19841,10 +19884,9 @@ my @input_file_objects = (
                                              : $IGNORED),
                     ),
      # These two Indic files are actually not usable as-is until 6.1.0,
-    # because their property values are missing from PropValueAliases.txt
-    # until that release, so that further work would have to be done to get
-    # them to work properly, which isn't worth it because of them being
-    # provisional.
+    # because they are provisional, so their property values are missing from
+    # PropValueAliases.txt until that release, so that further work would have
+    # to be done to get them to work properly.
      Input_file->new('IndicMatraCategory.txt', v6.0.0,
                      Withdrawn => v8.0.0,
                      Property => 'Indic_Matra_Category',
@@ -20162,9 +20204,9 @@ if ( $file_list and $make_list ) {
  
      print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
      foreach my $file (@input_files, @files_actually_output) {
-        my (undef, $directories, $file) = File::Spec->splitpath($file);
-        my @directories = File::Spec->splitdir($directories);
-        $file = join '/', @directories, $file;
+        my (undef, $directories, $basefile) = File::Spec->splitpath($file);
+        my @directories = grep length, File::Spec->splitdir($directories);
+        $file = join '/', @directories, $basefile;
      }
  
      my $ofh;