mktables: Revamp the compare versions functionality

author Karl Williamson <khw@cpan.org>

Thu, 20 Aug 2015 17:35:21 +0000 (11:35 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 20 Aug 2015 18:48:20 +0000 (12:48 -0600)
author Karl Williamson <khw@cpan.org>
Thu, 20 Aug 2015 17:35:21 +0000 (11:35 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 20 Aug 2015 18:48:20 +0000 (12:48 -0600)
diff --git a/charclass_invlists.h b/charclass_invlists.h

index 757f892..3f5c739 100644 (file)
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -99537,7 +99537,7 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC POSIX-BC */
   * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
   * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
   * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 552944c6a848efa825d6683e49b8fda246226239bbb6b8d8f6111f5665c3b279 lib/unicore/mktables
+ * 8342a7b7a0fcf32673d96b5d4423cf4b70b0c20f3e20325d43651d22316fd2b1 lib/unicore/mktables
   * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
   * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl
   * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 regen/mk_invlists.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 8ff762d..33306e4 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -615,15 +615,17 @@ our $to_trace = 0;
  # This is for a rarely used development feature that allows you to compare two
  # versions of the Unicode standard without having to deal with changes caused
  # by the code points introduced in the later version.  You probably also want
-# to use the -annotate option when using this.  Change the 0 to a string
-# containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only code
-# points introduced in that release and earlier will be used; later ones are
-# thrown away.  You use the version number of the earliest one you want to
-# compare; then run this program on directory structures containing each
-# release, and compare the outputs.  These outputs will therefore include only
-# the code points common to both releases, and you can see the changes caused
-# just by the underlying release semantic changes.  For versions earlier than
-# 3.2, you must copy a version of DAge.txt into the directory.
+# to use the -annotate option when using this.  Run this program on a unicore
+# containing the starting release you want to compare.  Save that output
+# structrue.  Then, switching to a unicore with the ending release, change the
+# 0 in the $string_compare_versions definition just below to a string
+# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
+# to the starting release.  This program will then compile, but throw away all
+# code points introduced after the starting release.  Finally use a diff tool
+# to compare the two directory structures.  They include only the code points
+# common to both releases, and you can see the changes caused just by the
+# underlying release semantic changes.  For versions earlier than 3.2, you
+# must copy a version of DAge.txt into the directory.
  my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
  my $compare_versions = DEBUG
                         && $string_compare_versions
@@ -802,6 +804,11 @@ close $VERSION;
  chomp $string_version;
  my $v_version = pack "C*", split /\./, $string_version;        # v string
  
+my $unicode_version = ($compare_versions)
+                      ? (  "$string_compare_versions (using "
+                         . "$string_version rules)")
+                      : $string_version;
+
  # The following are the complete names of properties with property values that
  # are known to not match any code points in some versions of Unicode, but that
  # may change in the future so they should be matchable, hence an empty file is
@@ -1071,7 +1078,7 @@ my %default_mapping = (
  my $HEADER=<<"EOF";
  # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
  # This file is machine-generated by $0 from the Unicode
-# database, Version $string_version.  Any changes made here will be lost!
+# database, Version $unicode_version.  Any changes made here will be lost!
  EOF
  
  my $INTERNAL_ONLY_HEADER = <<"EOF";
@@ -7147,7 +7154,7 @@ END
              else {
                  $cp = "one of the $code_points";
              }
-            $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
+            $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
          }
  
          my $comment = "";
@@ -8393,7 +8400,7 @@ resources, every table that matches the identical set of code points in this
  version of Unicode uses this file.  Each one is listed in a separate group
  below.  It could be that the tables will match the same set of code points in
  other Unicode releases, or it could be purely coincidence that they happen to
-be the same in Unicode $string_version, and hence may not in other versions.
+be the same in Unicode $unicode_version, and hence may not in other versions.
  
  END
          }
@@ -8418,7 +8425,7 @@ END
              Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
              $comment .= <<END;
  This file returns the $code_points in Unicode Version
-$string_version for
+$unicode_version for
  $leader, but it is inaccessible through Perl regular expressions, as
  "\\p{prop=}" is not recognized.
  END
@@ -8426,7 +8433,7 @@ END
          } else {
              $comment .= <<END;
  This file returns the $code_points in Unicode Version
-$string_version that
+$unicode_version that
  $match$synonyms:
  
  $matches_comment
@@ -10735,9 +10742,6 @@ sub output_perl_charnames_line ($$) {
  }
  
  { # Closure
-    # This is used to store the range list of all the code points usable when
-    # the little used $compare_versions feature is enabled.
-    my $compare_versions_range_list;
  
      # These are constants to the $property_info hash in this subroutine, to
      # avoid using a quoted-string which might have a typo.
@@ -10842,73 +10846,6 @@ sub output_perl_charnames_line ($$) {
              my $low = hex $1;
              my $high = (defined $2) ? hex $2 : $low;
  
-            # For the very specialized case of comparing two Unicode
-            # versions...
-            if (DEBUG && $compare_versions) {
-                if ($property_name eq 'Age') {
-
-                    # Only allow code points at least as old as the version
-                    # specified.
-                    my $age = pack "C*", split(/\./, $map);        # v string
-                    next LINE if $age gt $compare_versions;
-                }
-                else {
-
-                    # Again, we throw out code points younger than those of
-                    # the specified version.  By now, the Age property is
-                    # populated.  We use the intersection of each input range
-                    # with this property to find what code points in it are
-                    # valid.   To do the intersection, we have to convert the
-                    # Age property map to a Range_list.  We only have to do
-                    # this once.
-                    if (! defined $compare_versions_range_list) {
-                        my $age = property_ref('Age');
-                        if (! -e 'DAge.txt') {
-                            croak "Need to have 'DAge.txt' file to do version comparison";
-                        }
-                        elsif ($age->count == 0) {
-                            croak "The 'Age' table is empty, but its file exists";
-                        }
-                        $compare_versions_range_list
-                                        = Range_List->new(Initialize => $age);
-                    }
-
-                    # An undefined map is always 'Y'
-                    $map = 'Y' if ! defined $map;
-
-                    # Calculate the intersection of the input range with the
-                    # code points that are known in the specified version
-                    my @ranges = ($compare_versions_range_list
-                                  & Range->new($low, $high))->ranges;
-
-                    # If the intersection is empty, throw away this range
-                    next LINE unless @ranges;
-
-                    # Only examine the first range this time through the loop.
-                    my $this_range = shift @ranges;
-
-                    # Put any remaining ranges in the queue to be processed
-                    # later.  Note that there is unnecessary work here, as we
-                    # will do the intersection again for each of these ranges
-                    # during some future iteration of the LINE loop, but this
-                    # code is not used in production.  The later intersections
-                    # are guaranteed to not splinter, so this will not become
-                    # an infinite loop.
-                    my $line = join ';', $property_name, $map;
-                    foreach my $range (@ranges) {
-                        $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
-                                                            $range->start,
-                                                            $range->end,
-                                                            $line));
-                    }
-
-                    # And set things up so that the below will process this first
-                    # range, like any other.
-                    $low = $this_range->start;
-                    $high = $this_range->end;
-                }
-            } # End of $compare_versions
-
              # If changing to a new property, get the things constant per
              # property
              if ($previous_property_name ne $property_name) {
@@ -13702,6 +13639,285 @@ sub calculate_NChar() {  # Create a Perl extension match table which is the
      }
  }
  
+sub handle_compare_versions () {
+    # This fixes things up for the $compare_versions capability, where we
+    # compare Unicode version X with version Y (with Y > X), and we are
+    # running it on the Unicode Data for version Y.
+    #
+    # It works by calculating the code points whose meaning has been specified
+    # after release X, by using the Age property.  The complement of this set
+    # is the set of code points whose meaning is unchanged between the
+    # releases.  This is the set the program restricts itself to.  It includes
+    # everything whose meaning has been specified by the time version X came
+    # along, plus those still unassigned by the time of version Y.  (We will
+    # continue to use the word 'assigned' to mean 'meaning has been
+    # specified', as it's shorter and is accurate in all cases except the
+    # Noncharacter code points.)
+    #
+    # This function is run after all the properties specified by Unicode have
+    # been calculated for release Y.  This makes sure we get all the nuances
+    # of Y's rules.  (It is done before the Perl extensions are calculated, as
+    # those are based entirely on the Unicode ones.)  But doing it after the
+    # Unicode table calculations means we have to fix up the Unicode tables.
+    # We do this by subtracting the code points that have been assigned since
+    # X (which is actually done by ANDing each table of assigned code points
+    # with the set of unchanged code points).  Most Unicode properties are of
+    # the form such that all unassigned code points have a default, grab-bag,
+    # property value which is changed when the code point gets assigned.  For
+    # these, we just remove the changed code points from the table for the
+    # latter property value, and add them back in to the grab-bag one.  A few
+    # other properties are not entirely of this form and have values for some
+    # or all unassigned code points that are not the grab-bag one.  These have
+    # to be handled specially, and are hard-coded in to this routine based on
+    # manual inspection of the Unicode character database.  A list of the
+    # outlier code points is made for each of these properties, and those
+    # outliers are excluded from adding and removing from tables.
+    #
+    # Note that there are glitches when comparing against Unicode 1.1, as some
+    # Hangul syllables in it were later ripped out and eventually replaced
+    # with other things.
+
+    print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
+
+    my $after_first_version = "All matching code points were added after "
+                            . "Unicode $string_compare_versions";
+
+    # Calculate the delta as those code points that have been newly assigned
+    # since the first compare version.
+    my $delta = Range_List->new();
+    foreach my $table ($age->tables) {
+        next if $table == $age->table('Unassigned');
+        next if $table->name le $string_compare_versions;
+        $delta += $table;
+    }
+    if ($delta->is_empty) {
+        die ("No changes; perhaps you need a 'DAge.txt' file?");
+    }
+
+    my $unchanged = ~ $delta;
+
+    calculate_Assigned() if ! defined $Assigned;
+    $Assigned &= $unchanged;
+
+    # $Assigned now contains the code points that were assigned as of Unicode
+    # version X.
+
+    # A block is all or nothing.  If nothing is assigned in it, it all goes
+    # back to the No_Block pool; but if even one code point is assigned, the
+    # block is retained.
+    my $no_block = $block->table('No_Block');
+    foreach my $this_block ($block->tables) {
+        next if     $this_block == $no_block
+                ||  ! ($this_block & $Assigned)->is_empty;
+        $this_block->set_fate($SUPPRESSED, $after_first_version);
+        $no_block += $this_block;
+    }
+
+    my @special_delta_properties;   # List of properties that have to be
+                                    # handled specially.
+    my %restricted_delta;           # Keys are the entries in
+                                    # @special_delta_properties;  values
+                                    # are the range list of the code points
+                                    # that behave normally when they get
+                                    # assigned.
+
+    # In the next three properties, the Default Ignorable code points are
+    # outliers.
+    calculate_DI();
+    $DI &= $unchanged;
+
+    push @special_delta_properties, property_ref('_Perl_GCB');
+    $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
+
+    if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
+    {
+        push @special_delta_properties, $cwnfkcc;
+        $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
+    }
+
+    calculate_NChar();      # Non-character code points
+    $NChar &= $unchanged;
+
+    # This may have to be updated from time-to-time to get the most accurate
+    # results.
+    my $default_BC_non_LtoR = Range_List->new(Initialize =>
+                        # These came from the comments in v8.0 DBidiClass.txt
+                                                        [ # AL
+                                                            0x0600 .. 0x07BF,
+                                                            0x08A0 .. 0x08FF,
+                                                            0xFB50 .. 0xFDCF,
+                                                            0xFDF0 .. 0xFDFF,
+                                                            0xFE70 .. 0xFEFF,
+                                                            0x1EE00 .. 0x1EEFF,
+                                                           # R
+                                                            0x0590 .. 0x05FF,
+                                                            0x07C0 .. 0x089F,
+                                                            0xFB1D .. 0xFB4F,
+                                                            0x10800 .. 0x10FFF,
+                                                            0x1E800 .. 0x1EDFF,
+                                                            0x1EF00 .. 0x1EFFF,
+                                                           # ET
+                                                            0x20A0 .. 0x20CF,
+                                                         ]
+                                          );
+    $default_BC_non_LtoR += $DI + $NChar;
+    push @special_delta_properties, property_ref('BidiClass');
+    $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
+
+    if (defined (my $eaw = property_ref('East_Asian_Width'))) {
+
+        my $default_EA_width_W = Range_List->new(Initialize =>
+                                    # From comments in v8.0 EastAsianWidth.txt
+                                                [
+                                                    0x3400 .. 0x4DBF,
+                                                    0x4E00 .. 0x9FFF,
+                                                    0xF900 .. 0xFAFF,
+                                                    0x20000 .. 0x2A6DF,
+                                                    0x2A700 .. 0x2B73F,
+                                                    0x2B740 .. 0x2B81F,
+                                                    0x2B820 .. 0x2CEAF,
+                                                    0x2F800 .. 0x2FA1F,
+                                                    0x20000 .. 0x2FFFD,
+                                                    0x30000 .. 0x3FFFD,
+                                                ]
+                                             );
+        push @special_delta_properties, $eaw;
+        $restricted_delta{$special_delta_properties[-1]}
+                                                       = ~ $default_EA_width_W;
+
+        # Line break came along in the same release as East_Asian_Width, and
+        # the non-grab-bag default set is a superset of the EAW one.
+        if (defined (my $lb = property_ref('Line_Break'))) {
+            my $default_LB_non_XX = Range_List->new(Initialize =>
+                                        # From comments in v8.0 LineBreak.txt
+                                                        [ 0x20A0 .. 0x20CF ]);
+            $default_LB_non_XX += $default_EA_width_W;
+            push @special_delta_properties, $lb;
+            $restricted_delta{$special_delta_properties[-1]}
+                                                        = ~ $default_LB_non_XX;
+        }
+    }
+
+    # Go through every property, skipping those we've already worked on, those
+    # that are immutable, and the perl ones that will be calculated after this
+    # routine has done its fixup.
+    foreach my $property (property_ref('*')) {
+        next if    $property == $perl     # Done later in the program
+                || $property == $block    # Done just above
+                || $property == $DI       # Done just above
+                || $property == $NChar    # Done just above
+
+                   # The next two are invariant across Unicode versions
+                || $property == property_ref('Pattern_Syntax')
+                || $property == property_ref('Pattern_White_Space');
+
+        #  Find the grab-bag value.
+        my $default_map = $property->default_map;
+
+        if (! $property->to_create_match_tables) {
+
+            # Here there aren't any match tables.  So far, all such properties
+            # have a default map, and don't require special handling.  Just
+            # change each newly assigned code point back to the default map,
+            # as if they were unassigned.
+            foreach my $range ($delta->ranges) {
+                $property->add_map($range->start,
+                                $range->end,
+                                $default_map,
+                                Replace => $UNCONDITIONALLY);
+            }
+        }
+        else {  # Here there are match tables.  Find the one (if any) for the
+                # grab-bag value that unassigned code points go to.
+            my $default_table;
+            if (defined $default_map) {
+                $default_table = $property->table($default_map);
+            }
+
+            # If some code points don't go back to the the grab-bag when they
+            # are considered unassigned, exclude them from the list that does
+            # that.
+            my $this_delta = $delta;
+            my $this_unchanged = $unchanged;
+            if (grep { $_ == $property } @special_delta_properties) {
+                $this_delta = $delta & $restricted_delta{$property};
+                $this_unchanged = ~ $this_delta;
+            }
+
+            # Fix up each match table for this property.
+            foreach my $table ($property->tables) {
+                if (defined $default_table && $table == $default_table) {
+
+                    # The code points assigned after release X (the ones we
+                    # are excluding in this routine) go back on to the default
+                    # (grab-bag) table.  However, some of these tables don't
+                    # actually exist, but are specified solely by the other
+                    # tables.  (In a binary property, we don't need to
+                    # actually have an 'N' table, as it's just the complement
+                    # of the 'Y' table.)  Such tables will be locked, so just
+                    # skip those.
+                    $table += $this_delta unless $table->locked;
+                }
+                else {
+
+                    # Here the table is not for the default value.  We need to
+                    # subtract the code points we are ignoring for this
+                    # comparison (the deltas) from it.  But if the table
+                    # started out with nothing, no need to exclude anything,
+                    # and want to skip it here anyway, so it gets listed
+                    # properly in the pod.
+                    next if $table->is_empty;
+
+                    # Save the deltas for later, before we do the subtraction
+                    my $deltas = $table & $this_delta;
+
+                    $table &= $this_unchanged;
+
+                    # Suppress the table if the subtraction left it with
+                    # nothing in it
+                    if ($table->is_empty) {
+                        if ($property->type == $BINARY) {
+                            push @tables_that_may_be_empty, $table->complete_name;
+                        }
+                        else {
+                            $table->set_fate($SUPPRESSED, $after_first_version);
+                        }
+                    }
+
+                    # Now we add the removed code points to the property's
+                    # map, as they should now map to the grab-bag default
+                    # property (which they did in the first comparison
+                    # version).  But we don't have to do this if the map is
+                    # only for internal use.
+                    if (defined $default_map && $property->to_output_map) {
+
+                        # The gc property has pseudo property values whose names
+                        # have length 1.  These are the union of all the
+                        # property values whose name is longer than 1 and
+                        # whose first letter is all the same.  The replacement
+                        # is done once for the longer-named tables.
+                        next if $property == $gc && length $table->name == 1;
+
+                        foreach my $range ($deltas->ranges) {
+                            $property->add_map($range->start,
+                                            $range->end,
+                                            $default_map,
+                                            Replace => $UNCONDITIONALLY);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    # The above code doesn't work on 'gc=C', as it is a superset of the default
+    # ('Cn') table.  It's easiest to just special case it here.
+    my $C = $gc->table('C');
+    $C += $gc->table('Cn');
+
+    return;
+}
+
  sub compile_perl() {
      # Create perl-defined tables.  Almost all are part of the pseudo-property
      # named 'perl' internally to this program.  Many of these are recommended
@@ -16290,7 +16506,7 @@ To change this file, edit $0 instead.
  
  =head1 NAME
  
-$pod_file - Index of Unicode Version $string_version character properties in Perl
+$pod_file - Index of Unicode Version $unicode_version character properties in Perl
  
  =head1 DESCRIPTION
  
@@ -18870,7 +19086,7 @@ if (@missing_early_files) {
  
  The compilation cannot be completed because one or more required input files,
  listed below, are missing.  This is because you are compiling Unicode version
-$string_version, which predates the existence of these file(s).  To fully
+$unicode_version, which predates the existence of these file(s).  To fully
  function, perl needs the data that these files would have contained if they
  had been in this release.  To work around this, create copies of later
  versions of the missing files in the directory containing '$0'.  (Perl will
@@ -19111,6 +19327,11 @@ foreach my $file (@input_file_objects) {
  print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
  finish_Unicode();
  
+# For the very specialized case of comparing two Unicode versions...
+if (DEBUG && $compare_versions) {
+    handle_compare_versions();
+}
+
  print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
  compile_perl();
  
diff --git a/regcharclass.h b/regcharclass.h

index 070caab..2e6eb77 100644 (file)
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -2514,7 +2514,7 @@
   * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
   * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
   * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 552944c6a848efa825d6683e49b8fda246226239bbb6b8d8f6111f5665c3b279 lib/unicore/mktables
+ * 8342a7b7a0fcf32673d96b5d4423cf4b70b0c20f3e20325d43651d22316fd2b1 lib/unicore/mktables
   * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
   * c6884f4d629f04d1316f3476cb1050b6a1b98ca30c903262955d4eae337c6b1e regen/charset_translations.pl
   * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl
author	Karl Williamson <khw@cpan.org>
	Thu, 20 Aug 2015 17:35:21 +0000 (11:35 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 20 Aug 2015 18:48:20 +0000 (12:48 -0600)
charclass_invlists.h		patch \| blob \| blame \| history
lib/unicore/mktables		patch \| blob \| blame \| history
regcharclass.h		patch \| blob \| blame \| history