use charnames ();
use Unicode::Normalize qw(getCombinClass NFD);
-our $VERSION = '0.39';
+our $VERSION = '0.41';
use Storable qw(dclone);
return sprintf("%04X", ord $cased) if length($cased) == 1;
- %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
+ if ($file) {
+ %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
+ }
return $hash_ref->{ord $char} // "";
}
$prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
$prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
- $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
- \%SIMPLE_TITLE);
+ $prop{'title'} = _charinfo_case($char, ucfirst $char, "", \%SIMPLE_TITLE);
$prop{block} = charblock($code);
$prop{script} = charscript($code);
The Name_Alias property is of this form. But each scalar consists of two
components: 1) the name, and 2) the type of alias this is. They are
-separated by a colon and a space. In Unicode 6.0, there are two alias types:
-C<"correction">, which indicates that the name is a corrected form for the
-original name (which remains valid) for the same code point; and C<"control">,
-which adds a new name for a control character.
+separated by a colon and a space. In Unicode 6.1, there are several alias types:
+
+=over
+
+=item C<correction>
+
+indicates that the name is a corrected form for the
+original name (which remains valid) for the same code point.
+
+=item C<control>
+
+adds a new name for a control character.
+
+=item C<alternate>
+
+is an alternate name for a character.
+
+=item C<figment>
+
+is a name for a character that has been documented but was never in any
+actual standard.
+
+=item C<abbreviation>
+
+is a common abbreviation for a character.
+
+=back
+
+The lists are ordered (roughly) so the most preferred names come before less
+preferred ones.
For example,
- @aliases_ranges @alias_maps
+ @aliases_ranges @alias_maps
+ ...
+ 0x009E [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
+ 0x009F [ 'APPLICATION PROGRAM COMMAND: control',
+ 'APC: abbreviation'
+ ]
+ 0x00A0 'NBSP: abbreviation'
+ 0x00A1 ""
+ 0x00AD 'SHY: abbreviation'
+ 0x00AE ""
+ 0x01A2 'LATIN CAPITAL LETTER GHA: correction'
+ 0x01A3 'LATIN SMALL LETTER GHA: correction'
+ 0x01A4 ""
...
- 0x01A2 LATIN CAPITAL LETTER GHA: correction
- 0x01A3 LATIN SMALL LETTER GHA: correction
-Unicode 6.1 will introduce other types, and some map entries will be lists of
-multiple name-alias pairs for a single code point.
+A map to the empty string means that there is no alias defined for the code
+point.
=item C<r>
Note that for control characters (C<Gc=cc>), Unicode's data files have the
string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
-string. This function returns that real name, the empty string.
+string. This function returns that real name, the empty string. (There are
+names for these characters, but they are aliases, not the real name, and are
+contained in the C<Name_Alias> property.)
=item C<d>
$overrides = \%SIMPLE_LOWER;
}
else {
- $file = '_stc.pl';
- $overrides = \%SIMPLE_TITLE;
+ # There are currently no overrides in this, so treat the same
+ # as 'scf' above. This is very temporary code that will
+ # soon be completely stripped out in a future commit.
+ $overrides = -1;
+ $prop = "tc";
+ goto RETRY;
}
# The files are already handled by the _read_table() function.
my %names;
$names{'LIST'} = "";
my $original = do "unicore/Name.pl";
- my $previous_hex_code_point = "";
my $algorithm_names = \@algorithmic_named_code_points;
+ # We need to remove the names from it that are aliases. For that
+ # we need to also read in that table. Create a hash with the keys
+ # being the code points, and the values being a list of the
+ # aliases for the code point key.
+ my ($aliases_code_points, $aliases_maps, undef, undef) =
+ &prop_invmap('Name_Alias');
+ my %aliases;
+ for (my $i = 0; $i < @$aliases_code_points; $i++) {
+ my $code_point = $aliases_code_points->[$i];
+ $aliases{$code_point} = $aliases_maps->[$i];
+
+ # If not already a list, make it into one, so that later we
+ # can treat things uniformly
+ if (! ref $aliases{$code_point}) {
+ $aliases{$code_point} = [ $aliases{$code_point} ];
+ }
+
+ # Remove the alias type from the entry, retaining just the
+ # name.
+ map { s/:.*// } @{$aliases{$code_point}};
+ }
+
# We hold off on adding the next entry to the list until we know,
# that the next line isn't for the same code point. We only
# output the final line. That one is the original Name property
# value. The others are the Name_Alias corrections, which are
# listed first in the file.
- my $staging = "";
-
my $i = 0;
foreach my $line (split "\n", $original) {
my ($hex_code_point, $name) = split "\t", $line;
next if $code_point <= 0x9F
&& ($code_point <= 0x1F || $code_point >= 0x7F);
- # Output the last iteration's result, but only output the
- # final name if a code point has more than one.
- $names{'LIST'} .= $staging
- if $hex_code_point ne $previous_hex_code_point;
+ # If this is a name_alias, it isn't a name
+ next if grep { $_ eq $name } @{$aliases{$code_point}};
# If we are beyond where one of the special lines needs to
# be inserted ...
- if ($i < @$algorithm_names
+ while ($i < @$algorithm_names
&& $code_point > $algorithm_names->[$i]->{'low'})
{
# ... then insert it, ahead of what we were about to
# output
- $staging = sprintf "%x\t%x\t%s\n",
+ $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
$algorithm_names->[$i]->{'low'},
$algorithm_names->[$i]->{'high'},
$algorithm_names->[$i]->{'name'};
- # And pretend that what we last saw was the final code
- # point of the inserted range.
- $previous_hex_code_point = sprintf "%04X",
- $algorithm_names->[$i]->{'high'};
-
# Done with this range.
$i++;
- # Except we actually need to output the inserted line.
- redo;
+ # We loop until all special lines that precede the next
+ # regular one are output.
}
- # Normal name.
- $staging = sprintf "%x\t\t%s\n", $code_point, $name;
- $previous_hex_code_point = $hex_code_point;
- }
-
- # Add the name from the final iteration
- $names{'LIST'} .= $staging;
+ # Here, it is a normal name.
+ $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
+ } # End of loop through all the names
$names{'TYPE'} = "ToNa";
$utf8::SwashInfo{ToNa}{'missing'} = "";
gets the lower end of the range (0th element) and then looks up the old name
for its block using C<charblock>).
+Note that starting in Unicode 6.1, many of the block names have shorter
+synonyms. These are always given in the new style.
+
=head1 BUGS
Does not yet support EBCDIC platforms.