Unicode::UCD: Don't read _stc table

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 6a966f7..f0481f6 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -6,7 +6,7 @@ no warnings 'surrogate';    # surrogates can be inputs to this
  use charnames ();
  use Unicode::Normalize qw(getCombinClass NFD);
  
-our $VERSION = '0.39';
+our $VERSION = '0.41';
  
  use Storable qw(dclone);
  
@@ -324,7 +324,9 @@ sub _charinfo_case {
  
      return sprintf("%04X", ord $cased) if length($cased) == 1;
  
-    %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
+    if ($file) {
+        %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
+    }
      return $hash_ref->{ord $char} // "";
  }
  
@@ -431,8 +433,7 @@ sub charinfo {
  
      $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
      $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
-    $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
-                                                                \%SIMPLE_TITLE);
+    $prop{'title'} = _charinfo_case($char, ucfirst $char, "", \%SIMPLE_TITLE);
  
      $prop{block}  = charblock($code);
      $prop{script} = charscript($code);
@@ -2252,20 +2253,56 @@ Devanagari, Gurmukhi, and Oriya  scripts.
  
  The Name_Alias property is of this form.  But each scalar consists of two
  components:  1) the name, and 2) the type of alias this is.  They are
-separated by a colon and a space.  In Unicode 6.0, there are two alias types:
-C<"correction">, which indicates that the name is a corrected form for the
-original name (which remains valid) for the same code point; and C<"control">,
-which adds a new name for a control character.
+separated by a colon and a space.  In Unicode 6.1, there are several alias types:
+
+=over
+
+=item C<correction>
+
+indicates that the name is a corrected form for the
+original name (which remains valid) for the same code point.
+
+=item C<control>
+
+adds a new name for a control character.
+
+=item C<alternate>
+
+is an alternate name for a character
+
+=item C<figment>
+
+is a name for a character that has been documented but was never in any
+actual standard.
+
+=item C<abbreviation>
+
+is a common abbreviation for a character
+
+=back
+
+The lists are ordered (roughly) so the most preferred names come before less
+preferred ones.
  
  For example,
  
- @aliases_ranges  @alias_maps
+ @aliases_ranges        @alias_maps
+    ...
+    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
+    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
+                    'APC: abbreviation'
+                  ]
+    0x00A0        'NBSP: abbreviation'
+    0x00A1        ""
+    0x00AD        'SHY: abbreviation'
+    0x00AE        ""
+    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
+    0x01A3        'LATIN SMALL LETTER GHA: correction'
+    0x01A4        ""
      ...
-    0x01A2        LATIN CAPITAL LETTER GHA: correction
-    0x01A3        LATIN SMALL LETTER GHA: correction
  
-Unicode 6.1 will introduce other types, and some map entries will be lists of
-multiple name-alias pairs for a single code point.
+A map to the empty string means that there is no alias defined for the code
+point.
  
  =item C<r>
  
@@ -2409,7 +2446,9 @@ the function L<charnames/charnames::viacode(code)>.
  
  Note that for control characters (C<Gc=cc>), Unicode's data files have the
  string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
-string.  This function returns that real name, the empty string.
+string.  This function returns that real name, the empty string.  (There are
+names for these characters, but they are aliases, not the real name, and are
+contained in the C<Name_Alias> property.)
  
  =item C<d>
  
@@ -2593,8 +2632,12 @@ RETRY:
                  $overrides = \%SIMPLE_LOWER;
              }
              else {
-                $file = '_stc.pl';
-                $overrides = \%SIMPLE_TITLE;
+                # There are currently no overrides in this, so treat the same
+                # as 'scf' above.  This is very temporary code that will be
+                # soon be completely stripped out in a future commit.
+                $overrides = -1;
+                $prop = "tc";
+                goto RETRY;
              }
  
              # The files are already handled by the _read_table() function.
@@ -2634,16 +2677,35 @@ RETRY:
              my %names;
              $names{'LIST'} = "";
              my $original = do "unicore/Name.pl";
-            my $previous_hex_code_point = "";
              my $algorithm_names = \@algorithmic_named_code_points;
  
+            # We need to remove the names from it that are aliases.  For that
+            # we need to also read in that table.  Create a hash with the keys
+            # being the code points, and the values being a list of the
+            # aliases for the code point key.
+            my ($aliases_code_points, $aliases_maps, undef, undef) =
+                                                &prop_invmap('Name_Alias');
+            my %aliases;
+            for (my $i = 0; $i < @$aliases_code_points; $i++) {
+                my $code_point = $aliases_code_points->[$i];
+                $aliases{$code_point} = $aliases_maps->[$i];
+
+                # If not already a list, make it into one, so that later we
+                # can treat things uniformly
+                if (! ref $aliases{$code_point}) {
+                    $aliases{$code_point} = [ $aliases{$code_point} ];
+                }
+
+                # Remove the alias type from the entry, retaining just the
+                # name.
+                map { s/:.*// } @{$aliases{$code_point}};
+            }
+
              # We hold off on adding the next entry to the list until we know,
              # that the next line isn't for the same code point.  We only
              # output the final line.  That one is the original Name property
              # value.  The others are the Name_Alias corrections, which are
              # listed first in the file.
-            my $staging = "";
-
              my $i = 0;
              foreach my $line (split "\n", $original) {
                  my ($hex_code_point, $name) = split "\t", $line;
@@ -2659,43 +2721,32 @@ RETRY:
                  next if $code_point <= 0x9F
                          && ($code_point <= 0x1F || $code_point >= 0x7F);
  
-                # Output the last iteration's result, but only output the
-                # final name if a code point has more than one.
-                $names{'LIST'} .= $staging
-                                if $hex_code_point ne $previous_hex_code_point;
+                # If this is a name_alias, it isn't a name
+                next if grep { $_ eq $name } @{$aliases{$code_point}};
  
                  # If we are beyond where one of the special lines needs to
                  # be inserted ...
-                if ($i < @$algorithm_names
+                while ($i < @$algorithm_names
                      && $code_point > $algorithm_names->[$i]->{'low'})
                  {
  
                      # ... then insert it, ahead of what we were about to
                      # output
-                    $staging = sprintf "%x\t%x\t%s\n",
+                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
                                              $algorithm_names->[$i]->{'low'},
                                              $algorithm_names->[$i]->{'high'},
                                              $algorithm_names->[$i]->{'name'};
  
-                    # And pretend that what we last saw was the final code
-                    # point of the inserted range.
-                    $previous_hex_code_point = sprintf "%04X",
-                                            $algorithm_names->[$i]->{'high'};
-
                      # Done with this range.
                      $i++;
  
-                    # Except we actually need to output the inserted line.
-                    redo;
+                    # We loop until all special lines that precede the next
+                    # regular one are output.
                  }
  
-                # Normal name.
-                $staging = sprintf "%x\t\t%s\n", $code_point, $name;
-                $previous_hex_code_point = $hex_code_point;
-            }
-
-            # Add the name from the final iteration
-            $names{'LIST'} .= $staging;
+                # Here, is a normal name.
+                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
+            } # End of loop through all the names
  
              $names{'TYPE'} = "ToNa";
              $utf8::SwashInfo{ToNa}{'missing'} = "";
@@ -3171,6 +3222,9 @@ To convert from new-style to old-style, follow this recipe:
  gets the lower end of the range (0th element) and then looks up the old name
  for its block using C<charblock>).
  
+Note that starting in Unicode 6.1, many of the block names have shorter
+synonyms.  These are always given in the new style.
+
  =head1 BUGS
  
  Does not yet support EBCDIC platforms.