mktables: Handle platforms with 3 digit exponents

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 6d63c98..da5a919 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -45,7 +45,7 @@ sub NON_ASCII_PLATFORM { ord("A") != 65 }
  # expected, a warning will be generated.  If an older version is being
  # compiled, any bounds tests that fail in the generated test file (-maketest
  # option) will be marked as TODO.
-my $version_of_mk_invlist_bounds = v10.0.0;
+my $version_of_mk_invlist_bounds = v11.0.0;
  
  ##########################################################################
  #
@@ -652,7 +652,7 @@ sub stack_trace() {
  # to use the -annotate option when using this.  Run this program on a unicore
  # containing the starting release you want to compare.  Save that output
  # structure.  Then, switching to a unicore with the ending release, change the
-# 0 in the $string_compare_versions definition just below to a string
+# "" in the $string_compare_versions definition just below to a string
  # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
  # to the starting release.  This program will then compile, but throw away all
  # code points introduced after the starting release.  Finally use a diff tool
@@ -895,6 +895,19 @@ if ($v_version gt v3.2.0) {
                                  'Canonical_Combining_Class=Attached_Below_Left'
  }
  
+# Obsoleted as of Unicode 11.0, so these tables are allowed to be empty
+if ($v_version ge v11.0.0) {
+    push @tables_that_may_be_empty, qw(
+                                       Grapheme_Cluster_Break=E_Base
+                                       Grapheme_Cluster_Break=E_Base_GAZ
+                                       Grapheme_Cluster_Break=E_Modifier
+                                       Grapheme_Cluster_Break=Glue_After_Zwj
+                                       Word_Break=E_Base
+                                       Word_Break=E_Base_GAZ
+                                       Word_Break=E_Modifier
+                                       Word_Break=Glue_After_Zwj);
+}
+
  # Enum values for to_output_map() method in the Map_Table package. (0 is don't
  # output)
  my $EXTERNAL_MAP = 1;
@@ -10717,7 +10730,6 @@ END
          );
      }
  
-
      # Add any explicit cjk values
      $file->insert_lines(@cjk_property_values);
  
@@ -12956,6 +12968,19 @@ sub register_fraction($) {
  
      my $float = eval $rational;
      $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
+
+    # Strip off any leading zeros beyond 2 digits to make it C99 compliant.
+    # (Windows has 3 digit exponents, contrary to C99)
+    $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
+
+    if (   defined $nv_floating_to_rational{$float}
+        && $nv_floating_to_rational{$float} ne $rational)
+    {
+        die Carp::my_carp_bug("Both '$rational' and"
+                            . " '$nv_floating_to_rational{$float}' evaluate to"
+                            . " the same floating point number."
+                            . "  \$E_FLOAT_PRECISION must be increased");
+    }
      $nv_floating_to_rational{$float} = $rational;
      return;
  }
@@ -13467,6 +13492,24 @@ sub  filter_script_extensions_line {
      return;
  }
  
+sub setup_emojidata {  # Pre_Handler for the EmojiData.txt Input_file object
+    my $prop_ref = Property->new('XPG',
+                                 Full_Name => 'Extended_Pictographic',
+    );  # 'XPG' is the name later code passes to property_ref() to find this
+    $prop_ref->set_fate($PLACEHOLDER,
+                        "Not part of the Unicode Character Database");
+}
+
+sub filter_emojidata_line {  # Each_Line_Handler for EmojiData.txt
+    # We are only interested in this single property from this non-UCD data
+    # file, and we turn it into a Perl property, so that it isn't accessible
+    # to the users
+
+    $_ = "" unless /\bExtended_Pictographic\b/;  # Empty out every other line
+
+    return;
+}
+
  sub generate_hst {
  
      # Populates the Hangul Syllable Type property from first principles
@@ -15439,33 +15482,52 @@ END
      }
  
      # Perl tailors the WordBreak property so that \b{wb} doesn't split
-    # adjacent spaces into separate words.  First create a copy of the regular
-    # WB property as '_Perl_WB'.  (On Unicode releases earlier than when WB
-    # was defined for, this will already have been done by the substitute file
-    # portion for 'Input_file' code for WB.)
+    # adjacent spaces into separate words.  Unicode 11.0 moved in that
+    # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically)
+    # NO-BREAK SPACE as breaking, so we keep the original Perl customization.
+    # To do this, in the Perl copy of WB, simply replace the mappings of
+    # horizontal space characters that otherwise would map to the default or
+    # the 11.0 'WSegSpace' to instead map to our tailoring.
      my $perl_wb = property_ref('_Perl_WB');
-    if (! defined $perl_wb) {
-        $perl_wb = Property->new('_Perl_WB',
-                                 Fate => $INTERNAL_ONLY,
-                                 Perl_Extension => 1,
-                                 Directory => $map_directory,
-                                 Type => $STRING);
-        my $wb = property_ref('Word_Break');
-        $perl_wb->initialize($wb);
-        $perl_wb->set_default_map($wb->default_map);
-    }
-
-    # And simply replace the mappings of horizontal space characters that
-    # otherwise would map to the default to instead map to our tailoring.
      my $default = $perl_wb->default_map;
      for my $range ($Blank->ranges) {
          for my $i ($range->start .. $range->end) {
-            next unless $perl_wb->value_of($i) eq $default;
+            my $value = $perl_wb->value_of($i);
+
+            next unless $value eq $default || $value eq 'WSegSpace';
              $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
                                Replace => $UNCONDITIONALLY);
          }
      }
  
+    # Also starting in Unicode 11.0, rules for some of the boundary types are
+    # based on a non-UCD property (which we have read in if it exists).
+    # Recall that these boundary properties partition the code points into
+    # equivalence classes (represented as enums).
+    #
+    # The loop below goes through each code point that matches the non-UCD
+    # property, and for each current equivalence class containing such a code
+    # point, splits it so that those that are in both are now in a newly
+    # created equivalence class whose name is a combination of the property
+    # and the old class name, leaving unchanged everything that doesn't match
+    # the non-UCD property.
+    my $pictographic_emoji = property_ref('XPG');
+    if (defined $pictographic_emoji) {
+        foreach my $base_property (property_ref('GCB'),
+                                   property_ref('WB'))
+        {
+            my $property = property_ref('_Perl_' . $base_property->name);
+            foreach my $range ($pictographic_emoji->table('Y')->ranges) {
+                foreach my $i ($range->start .. $range->end) {
+                    my $current = $property->value_of($i);
+                    $current = $property->table($current)->short_name;
+                    $property->add_map($i, $i, 'XPG_' . $current,
+                                       Replace => $UNCONDITIONALLY);
+                }
+            }
+        }
+    }
+
      # Create a version of the LineBreak property with the mappings that are
      # omitted in the default algorithm remapped to what
      # http://www.unicode.org/reports/tr14 says they should be.
@@ -17182,7 +17244,7 @@ Perl can provide access to all non-provisional Unicode character properties,
  though not all are enabled by default.  The omitted ones are the Unihan
  properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
  deprecated or Unicode-internal properties.  (An installation may choose to
-recompile Perl's tables to change this.  See L<Unicode character
+recompile Perl's tables to change this.  See L</Unicode character
  properties that are NOT accepted by Perl>.)
  
  For most purposes, access to Unicode properties from the Perl core is through
@@ -19018,7 +19080,7 @@ EOF_CODE
                                 } property_ref('*'))
      {
          # Non-binary properties should not match \p{};  Test all for that.
-        if ($property->type != $BINARY) {
+        if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
              my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
                                                              $property->aliases;
              foreach my $property_alias ($property->aliases) {
@@ -19084,6 +19146,11 @@ EOF_CODE
                  # already guaranteed to be in error
                  my $already_error = ! $table->file_path;
  
+                # A table name beginning with 'In' or 'Is' could actually be a
+                # user-defined property, so isn't a compile-time error, as the
+                # definitions of those can be deferred until runtime
+                next if $already_error && $table_name =~ / ^ I[ns] /x;
+
                  # Generate error cases for this alias.
                  push @output, generate_error($property_name,
                                               $table_name,
@@ -19194,7 +19261,7 @@ EOF_CODE
                          }
  
                          # Make tests for each possible precision from 1 to
-                        # just past the worst case.  
+                        # just past the worst case.
                          my $upper_limit = ($min_e_precision > $min_f_precision)
                                             ? $min_e_precision
                                             : $min_f_precision;
@@ -19712,12 +19779,7 @@ my @input_file_objects = (
                      Skip => $Documentation,
                     ),
      Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
-                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
-
-                               # Don't use _Perl_WB as a synonym for
-                               # Word_Break in later perls, as it is tailored
-                               # and isn't the same as Word_Break
-                               'ONLY_EARLY' ],
+                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
                      Property => 'Word_Break',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
@@ -19860,6 +19922,10 @@ my @input_file_objects = (
                      Skip => 'Maps certain Unicode code points to their '
                            . 'legacy Japanese cell-phone values',
                     ),
+    # This file is actually not usable as-is until 6.1.0, because the property
+    # is provisional, so its name is missing from PropertyAliases.txt until
+    # that release, so that further work would have to be done to get it to
+    # work properly
      Input_file->new('ScriptExtensions.txt', v6.0.0,
                      Property => 'Script_Extensions',
                      Early => [ sub {} ], # Doesn't do anything but ensures
@@ -19872,10 +19938,9 @@ my @input_file_objects = (
                                              : $IGNORED),
                     ),
      # These two Indic files are actually not usable as-is until 6.1.0,
-    # because their property values are missing from PropValueAliases.txt
-    # until that release, so that further work would have to be done to get
-    # them to work properly, which isn't worth it because of them being
-    # provisional.
+    # because they are provisional, so their property values are missing from
+    # PropValueAliases.txt until that release, so that further work would have
+    # to be done to get them to work properly.
      Input_file->new('IndicMatraCategory.txt', v6.0.0,
                      Withdrawn => v8.0.0,
                      Property => 'Indic_Matra_Category',
@@ -19923,6 +19988,19 @@ my @input_file_objects = (
      Input_file->new('NushuSources.txt', v10.0.0,
                      Skip => 'Specifies source material for Nushu characters',
                     ),
+    Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
+                    Property => 'Equivalent_Unified_Ideograph',
+                    Has_Missings_Defaults => $NOT_IGNORED,
+                   ),
+    Input_file->new('EmojiData.txt', v11.0.0,
+                    # Is in UTS #51 and not the UCD, so must be updated
+                    # separately, and the first line edited to indicate the
+                    # UCD release we're pretending it to be in.  The UTC says
+                    # this is a transitional state.
+                    Pre_Handler => \&setup_emojidata,
+                    Has_Missings_Defaults => $NOT_IGNORED,
+                    Each_Line_Handler => \&filter_emojidata_line,
+                   ),
  );
  
  # End of all the preliminaries.
@@ -20259,7 +20337,7 @@ if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
  if ($version_of_mk_invlist_bounds lt $v_version) {
      Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
                  . " to be checked and possibly updated to Unicode"
-                . " $string_version");
+                . " $string_version.  Failing tests will be marked TODO");
  }
  
  exit(0);