mktables: Refactor if-else series

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 00a38ef..5755e0a 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -770,7 +770,6 @@ push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
                                                      if $v_version ge v6.0.0;
  push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
                                                      if $v_version ge v6.1.0;
-push @tables_that_may_be_empty, '_stc';
  
  # The lists below are hashes, so the key is the item in the list, and the
  # value is the reason why it is in the list.  This makes generation of
@@ -3015,7 +3014,7 @@ sub trace { return main::trace(@_); }
          # either a constructor or a method.  If called as a method, the result
          # will be a new() instance of the calling object, containing the union
          # of that object with the other parameter's code points;  if called as
-        # a constructor, the first parameter gives the class the new object
+        # a constructor, the first parameter gives the class that the new object
          # should be, and the second parameter gives the code points to go into
          # it.
          # In either case, there are two parameters looked at by this routine;
@@ -3027,14 +3026,20 @@ sub trace { return main::trace(@_); }
          # just a single code point.
          #
          # If they are ranges, this routine doesn't make any effort to preserve
-        # the range values of one input over the other.  Therefore this base
-        # class should not allow _union to be called from other than
+        # the range values and types of one input over the other.  Therefore
+        # this base class should not allow _union to be called from other than
          # initialization code, so as to prevent two tables from being added
          # together where the range values matter.  The general form of this
          # routine therefore belongs in a derived class, but it was moved here
          # to avoid duplication of code.  The failure to overload this in this
          # class keeps it safe.
          #
+        # It does make the effort during initialization to accept tables with
+        # multiple values for the same code point, and to preserve the order
+        # of these.  If there is only one input range or range set, it doesn't
+        # sort (as it should already be sorted to the desired order), and will
+        # accept multiple values per code point.  Otherwise it will merge
+        # multiple values into a single one.
  
          my $self;
          my @args;   # Arguments to pass to the constructor
@@ -3055,6 +3060,7 @@ sub trace { return main::trace(@_); }
  
          # Accumulate all records from both lists.
          my @records;
+        my $input_count = 0;
          for my $arg (@args) {
              #local $to_trace = 0 if main::DEBUG;
              trace "argument = $arg" if main::DEBUG && $to_trace;
@@ -3067,18 +3073,22 @@ sub trace { return main::trace(@_); }
                  Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
                  return;
              }
+
              $arg = [ $arg ] if ! ref $arg;
              my $type = ref $arg;
              if ($type eq 'ARRAY') {
                  foreach my $element (@$arg) {
                      push @records, Range->new($element, $element);
+                    $input_count++;
                  }
              }
              elsif ($arg->isa('Range')) {
                  push @records, $arg;
+                $input_count++;
              }
              elsif ($arg->can('ranges')) {
                  push @records, $arg->ranges;
+                $input_count++;
              }
              else {
                  my $message = "";
@@ -3094,13 +3104,15 @@ sub trace { return main::trace(@_); }
          # Sort with the range containing the lowest ordinal first, but if
          # two ranges start at the same code point, sort with the bigger range
          # of the two first, because it takes fewer cycles.
-        @records = sort { ($a->start <=> $b->start)
+        if ($input_count > 1) {
+            @records = sort { ($a->start <=> $b->start)
                                        or
                                      # if b is shorter than a, b->end will be
                                      # less than a->end, and we want to select
                                      # a, so want to return -1
                                      ($b->end <=> $a->end)
                                     } @records;
+        }
  
          my $new = $class->new(@_);
  
@@ -3108,12 +3120,20 @@ sub trace { return main::trace(@_); }
          for my $set (@records) {
              my $start = $set->start;
              my $end   = $set->end;
-            my $value   = $set->value;
+            my $value = $set->value;
+            my $type  = $set->type;
              if ($start > $new->max) {
-                $new->_add_delete('+', $start, $end, $value);
+                $new->_add_delete('+', $start, $end, $value, Type => $type);
              }
              elsif ($end > $new->max) {
-                $new->_add_delete('+', $new->max +1, $end, $value);
+                $new->_add_delete('+', $new->max +1, $end, $value,
+                                                                Type => $type);
+            }
+            elsif ($input_count == 1) {
+                # Here, overlaps existing range, but is from a single input,
+                # so preserve the multiple values from that input.
+                $new->_add_delete('+', $start, $end, $value, Type => $type,
+                                                Replace => $MULTIPLE_AFTER);
              }
          }
  
@@ -3350,7 +3370,7 @@ sub trace { return main::trace(@_); }
          #                         new and old values are identical, the
          #                         replacement is skipped to save cycles
          #       => $IF_NOT_EQUIVALENT means to replace the existing values
-        #                         with this one if they are not equivalent.
+        #          (the default)  with this one if they are not equivalent.
          #                         Ranges are equivalent if their types are the
          #                         same, and they are the same string; or if
          #                         both are type 0 ranges, if their Unicode
@@ -5204,7 +5224,7 @@ END
  
              if ($annotate) {
  
-                # if annotating each code point, must print 1 per line.
+                # If annotating each code point, must print 1 per line.
                  # The variable could point to a subroutine, and we don't want
                  # to lose that fact, so only set if not set already
                  $range_size_1 = 1 if ! $range_size_1;
@@ -5237,6 +5257,8 @@ END
                  next RANGE if defined $suppress_value
                                && $value eq $suppress_value;
  
+                {
+
                  # If there is a range and doesn't need a single point range
                  # output
                  if ($start != $end && ! $range_size_1) {
@@ -5261,13 +5283,11 @@ END
                                              $count);
                          $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
                      }
-                    next RANGE;
                  }
  
-                # Here to output a single code point per line
-
-                # If not to annotate, use the simple formats
-                if (! $annotate) {
+                    # Here to output a single code point per line.
+                    # If not to annotate, use the simple formats
+                elsif (! $annotate) {
  
                      # Use any passed in subroutine to output.
                      if (ref $range_size_1 eq 'CODE') {
@@ -5282,8 +5302,8 @@ END
                              push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
                          }
                      }
-                    next RANGE;
                  }
+                else {
  
                  # Here, wants annotation.
                  for (my $i = $start; $i <= $end; $i++) {
@@ -5422,6 +5442,8 @@ END
                          $OUT[-1] .= "\n";
                      }
                  }
+                }
+                }
              } # End of loop through all the table's ranges
          }
  
@@ -6266,7 +6288,7 @@ END
  \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
  END
          if ($specials_name) {
-        $return .= <<END;
+            $return .= <<END;
  \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
  END
          }
@@ -8610,15 +8632,6 @@ sub finish_property_setup {
      # Perl adds this alias.
      $gc->add_alias('Category');
  
-    # For backwards compatibility, these property files have particular names.
-    property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
-                                                          # utf8.c calls it
-    property_ref('Lowercase_Mapping')->set_file('Lower');
-    property_ref('Titlecase_Mapping')->set_file('Title');
-
-    my $fold = property_ref('Case_Folding');
-    $fold->set_file('Fold') if defined $fold;
-
      # Unicode::Normalize expects this file with this name and directory.
      my $ccc = property_ref('Canonical_Combining_Class');
      if (defined $ccc) {
@@ -9819,6 +9832,7 @@ END
      my $input_field_count = $i;
  
      # This routine in addition outputs these extra fields:
+
      my $DECOMP_TYPE = $i++; # Decomposition type
  
      # These fields are modifications of ones above, and are usually
@@ -9978,7 +9992,6 @@ END
          my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
                                          Default_Map => "",
                                          Perl_Extension => 1,
-                                        File => 'Digit',    # Trad. location
                                          Directory => $map_directory,
                                          Type => $STRING,
                                          Range_Size_1 => 1,
@@ -10670,7 +10683,7 @@ sub filter_arabic_shaping_line {
          # simple ones are in UnicodeData.txt, which should already have been
          # read in to the full property data structures, so as to initialize
          # these with the simple ones.  Then the SpecialCasing.txt entries
-        # overwrite the ones which have different full mappings.
+        # add or overwrite the ones which have different full mappings.
  
          # This routine sees if the simple mappings are to be output, and if
          # so, copies what has already been put into the full mapping tables,
@@ -10682,16 +10695,6 @@ sub filter_arabic_shaping_line {
          # relatively few entries in them that have different full mappings,
          # and thus skip the simple mapping tables altogether.
  
-        # New tables with just the simple mappings that are overridden by the
-        # full ones are constructed.  These are for Unicode::UCD, which
-        # requires the simple mappings.  The Case_Folding table is a combined
-        # table of both the simple and full mappings, with the full ones being
-        # in the hash, and the simple ones, even those overridden by the hash,
-        # being in the base table.  That same mechanism could have been
-        # employed here, except that the docs have said that the generated
-        # files are usuable directly by programs, so we dare not change the
-        # format in any way.
-
          my $file= shift;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
@@ -10700,34 +10703,48 @@ sub filter_arabic_shaping_line {
          $uc = property_ref('uc');
  
          # For each of the case change mappings...
-        foreach my $case_table ($lc, $tc, $uc) {
-            my $case = $case_table->name;
-            my $full = property_ref($case);
-            unless (defined $full && ! $full->is_empty) {
+        foreach my $full_table ($lc, $tc, $uc) {
+            my $full_name = $full_table->name;
+            unless (defined $full_table && ! $full_table->is_empty) {
                  Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
              }
  
-            # The simple version's name in each mapping merely has an 's' in
-            # front of the full one's
-            my $simple_name = 's' . $case;
-            my $simple = property_ref($simple_name);
-            $simple->initialize($full) if $simple->to_output_map();
+            # Create a table in the old-style format and with the original
+            # file name for backwards compatibility with applications that
+            # read it directly.  The new tables contain both the simple and
+            # full maps, and the old are missing simple maps when there is a
+            # conflicting full one.  Probably it would have been ok to add
+            # those to the legacy version, as was already done in 5.14 to the
+            # case folding one, but this was not done, out of an abundance of
+            # caution.  The tables are set up here before we deal with the
+            # full maps so that as we handle those, we can override the simple
+            # maps for them in the legacy table, and merely add them in the
+            # new-style one.
+            my $legacy = Property->new("Legacy_" . $full_table->full_name,
+                                        File => $full_table->full_name =~
+                                                            s/case_Mapping//r,
+                                        Range_Size_1 => 1,
+                                        Format => $HEX_FORMAT,
+                                        Default_Map => $CODE_POINT,
+                                        UCD => 0,
+                                        Initialize => $full_table,
+            );
  
-            my $simple_only = Property->new("_s$case",
-                    Type => $STRING,
-                    Default_Map => $CODE_POINT,
-                    Perl_Extension => 1,
-                    Fate => $INTERNAL_ONLY,
-                    Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
-            $simple_only->set_to_output_map($INTERNAL_MAP);
-            $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
+            $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps.  The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
  END
              ));
  
+            # The simple version's name in each mapping merely has an 's' in
+            # front of the full one's
+            my $simple_name = 's' . $full_name;
+            my $simple = property_ref($simple_name);
+            $simple->initialize($full_table) if $simple->to_output_map();
+
              unless ($simple->to_output_map()) {
-                $simple_only->set_proxy_for($simple_name);
+                $full_table->set_proxy_for($simple_name);
              }
          }
  
@@ -10819,19 +10836,22 @@ END
                  }
              }
              else {
-                $file->insert_adjusted_lines("$fields[0]; "
+
+                # The mapping goes into both the legacy table, in which it
+                # replaces the simple one...
+                $file->insert_adjusted_lines("$fields[0]; Legacy_"
                                               . $object->full_name
                                               . "; $fields[$i]");
  
-                # Copy any simple case change to the special tables
-                # constructed if being overridden by a multi-character case
-                # change.
-                if ($value != $decimal_code_point) {
-                    $file->insert_adjusted_lines(sprintf("%s; _s%s; %04X",
-                                                 $fields[0],
-                                                 $object->name,
-                                                 $value));
-                }
+                # ... and, the The regular table, in which it is additional,
+                # beyond the simple mapping.
+                $file->insert_adjusted_lines("$fields[0]; "
+                                             . $object->name
+                                            . "; "
+                                            . $CMD_DELIM
+                                            . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+                                            . $CMD_DELIM
+                                            . $fields[$i]);
              }
          }
  
@@ -10894,7 +10914,7 @@ sub filter_old_style_case_folding {
          $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
                             Perl_Extension => 1,
                             Fate => $INTERNAL_ONLY,
-                           Description => "Code points that particpate in a multi-char fold not in the final position",
+                           Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
                             );
  
          # If we ever wanted to show that these tables were combined, a new
@@ -11383,7 +11403,7 @@ sub setup_script_extensions {
  
      my $scx = property_ref("Script_Extensions");
      $scx = Property->new("scx", Full_Name => "Script_Extensions")
-            if ! defined $scx;
+                                                            if ! defined $scx;
      $scx->_set_format($STRING_WHITE_SPACE_LIST);
      $scx->initialize($script);
      $scx->set_default_map($script->default_map);
@@ -11476,7 +11496,8 @@ sub finish_Unicode() {
      # 3) Calculates all the regular expression match tables based on the
      #    mappings.
      # 3) Calculates and adds the tables which are defined by Unicode, but
-    #    which aren't derived by them
+    #    which aren't derived by them, and certain derived tables that Perl
+    #    uses.
  
      # For each property, fill in any missing mappings, and calculate the re
      # match tables.  If a property has more than one missing mapping, the
@@ -11714,7 +11735,8 @@ END
                              Lowercase_Mapping
                              Titlecase_Mapping
                              Case_Folding
-                        } ) {
+                        } )
+    {
          my $full = property_ref($map);
          if ($full->is_empty) {
              my $simple = property_ref('Simple_' . $map);
@@ -11728,6 +11750,38 @@ END
          }
      }
  
+    # Create digit and case fold tables with the original file names for
+    # backwards compatibility with applications that read them directly.
+    my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
+                              Default_Map => "",
+                              Perl_Extension => 1,
+                              File => 'Digit',    # Trad. location
+                              Directory => $map_directory,
+                              UCD => 0,
+                              Type => $STRING,
+                              Range_Size_1 => 1,
+                              Initialize => property_ref('Perl_Decimal_Digit'),
+                            );
+    $Digit->add_comment(join_lines(<<END
+This file gives the mapping of all code points which represent a single
+decimal digit [0-9] to their respective digits.  For example, the code point
+U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
+that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
+numerals.
+END
+    ));
+
+    Property->new('Legacy_Case_Folding',
+                    File => "Fold",
+                    Directory => $map_directory,
+                    Default_Map => $CODE_POINT,
+                    UCD => 0,
+                    Range_Size_1 => 1,
+                    Type => $STRING,
+                    Format => $HEX_FORMAT,
+                    Initialize => property_ref('cf'),
+    );
+
      # The Script_Extensions property started out as a clone of the Script
      # property.  But processing its data file caused some elements to be
      # replaced with different data.  (These elements were for the Common and
@@ -13561,7 +13615,7 @@ sub make_ucd_table_pod_entries {
                      || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
                      || $output_this == $perl_extension)
                  {
-                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations.  Proceeding anyway.");
+                    Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
                  }
  
                  # We modifiy the info column of the one being output to