package _charnames;
use strict;
use warnings;
-use File::Spec;
-our $VERSION = '1.31';
+our $VERSION = '1.42';
use unicore::Name; # mktables-generated algorithmically-defined names
use bytes (); # for $bytes::hint_bits
my %system_aliases = (
- 'SINGLE-SHIFT 2' => pack("U", 0x8E),
- 'SINGLE-SHIFT 3' => pack("U", 0x8F),
- 'PRIVATE USE 1' => pack("U", 0x91),
- 'PRIVATE USE 2' => pack("U", 0x92),
+ 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)),
+ 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)),
+ 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)),
+ 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)),
);
# These are the aliases above that differ under :loose and :full matching
#my %loose_system_aliases = (
#);
-my %deprecated_aliases = (
- # Use of these gives deprecated message.
- # Unicode 6.0 co-opted this for U+1F514, so deprecate it for now.
- 'BELL' => pack("U", 0x07),
-);
+#my %deprecated_aliases;
+#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0;
#my %loose_deprecated_aliases = (
#);
sub alias (@) # Set up a single alias
{
+ my @errors;
+ my $nbsp = chr utf8::unicode_to_native(0xA0);
+
my $alias = ref $_[0] ? $_[0] : { @_ };
- foreach my $name (keys %$alias) {
+ foreach my $name (sort keys %$alias) { # Sort only because it helps having
+ # deterministic output for
+ # t/lib/charnames/alias
my $value = $alias->{$name};
next unless defined $value; # Omit if screwed up.
# hex, but makes the code easier to maintain, and is called
# infrequently, only at compile-time
if ($value !~ $decimal_qr && $value =~ $hex_qr) {
- $value = CORE::hex $1;
+ my $temp = CORE::hex $1;
+ $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/;
+ $value = $temp;
}
if ($value =~ $decimal_qr) {
- no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed
+ no warnings qw(non_unicode surrogate nonchar); # Allow any of these
$^H{charnames_ord_aliases}{$name} = pack("U", $value);
# Use a canonical form.
$^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
}
else {
- # XXX validate syntax when deprecation cycle complete. ie. start
- # with an alpha only, etc.
- $^H{charnames_name_aliases}{$name} = $value;
+ my $ok_portion = "";
+ $ok_portion = $1 if $name =~ / ^ (
+ \p{_Perl_Charname_Begin}
+ \p{_Perl_Charname_Continue}*
+ ) /x;
+
+ # If the name was fully correct, the above should have matched all of
+ # it.
+ if (length $ok_portion < length $name) {
+ my $first_bad = substr($name, length($ok_portion), 1);
+ push @errors, "Invalid character in charnames alias definition; "
+ . "marked by <-- HERE in '$ok_portion$first_bad<-- HERE "
+ . substr($name, length($ok_portion) + 1)
+ . "'";
+ }
+ else {
+ if ($name =~ / ( .* \s ) ( \s* ) $ /x) {
+ push @errors, "charnames alias definitions may not contain "
+ . "trailing white-space; marked by <-- HERE in "
+ . "'$1 <-- HERE " . $2 . "'";
+ next;
+ }
+
+ # Use '+' instead of '*' in this regex, because any trailing
+ # blanks have already been found
+ if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) {
+ push @errors, "charnames alias definitions may not contain a "
+ . "sequence of multiple spaces; marked by <-- HERE "
+ . "in '$1 <-- HERE " . $2 . "'";
+ next;
+ }
+
+ $^H{charnames_name_aliases}{$name} = $value;
+ if (warnings::enabled('deprecated')
+ && $name =~ / ( .* $nbsp ) ( .* ) $ /x)
+ {
+ carp "NO-BREAK SPACE in a charnames alias definition is "
+ . "deprecated; marked by <-- HERE in '$1 <-- HERE "
+ . $2 . "'";
+ }
+ }
}
}
+
+ # We find and output all errors from this :alias definition, rather than
+ # failing on the first one, so fewer runs are needed to get it to compile
+ if (@errors) {
+ croak join "\n", @errors;
+ }
+
+ return;
} # alias
sub not_legal_use_bytes_msg {
sub alias_file ($) # Reads a file containing alias definitions
{
+ require File::Spec;
my ($arg, $file) = @_;
if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
$file = $arg;
}
- elsif ($arg =~ m/^\w+$/) {
+ elsif ($arg =~ m/ ^ \p{_Perl_IDStart} \p{_Perl_IDCont}* $/x) {
$file = "unicore/${arg}_alias.pl";
}
else {
- croak "Charnames alias files can only have identifier characters";
+ croak "Charnames alias file names can only have identifier characters";
}
if (my @alias = do $file) {
@alias == 1 && !defined $alias[0] and
# elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
# $utf8 = $loose_system_aliases{$lookup_name};
# }
- if (exists $deprecated_aliases{$lookup_name}) {
- require warnings;
- warnings::warnif('deprecated',
- "Unicode character name \"$name\" is deprecated, use \""
- . viacode(ord $deprecated_aliases{$lookup_name})
- . "\" instead");
- $utf8 = $deprecated_aliases{$lookup_name};
- }
+# if (exists $deprecated_aliases{$lookup_name}) {
+# require warnings;
+# warnings::warnif('deprecated',
+# "Unicode character name \"$name\" is deprecated, use \""
+# . viacode(ord $deprecated_aliases{$lookup_name})
+# . "\" instead");
+# $utf8 = $deprecated_aliases{$lookup_name};
+# }
# There are currently no entries in this hash, so don't waste time looking
# for them. But the code is retained for the unlikely possibility that
# some will be added in the future.
/xs)
{
# Even in non-loose matching, the script traditionally has been
- # case insensitve
+ # case insensitive
$scripts_trie = "\U$1";
$lookup_name = $2;
}
my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";
- if (! $scripts_trie
- || $txt !~
- /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm)
- {
- # Here we still don't have it, give up.
- return if $runtime;
-
- # May have zapped input name, get it again.
- $name = (defined $save_input) ? $save_input : $_[0];
- carp "Unknown charname '$name'";
- return ($wants_ord) ? 0xFFFD : pack("U", 0xFFFD);
- }
+ return if (! $scripts_trie || $txt !~
+ /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm);
# Here have found the input name in the table.
@off = ($-[0] + 1, $+[0]); # The 1 is for the tab
ref $alias eq "HASH" or
croak "Only HASH reference supported as argument to :alias";
alias ($alias);
+ $promote = 1;
next;
}
if ($alias =~ m{:(\w+)$}) {
alias_file ($1) and $promote = 1;
next;
}
- alias_file ($alias);
+ alias_file ($alias) and $promote = 1;
next;
}
if (substr($arg, 0, 1) eq ':'
# not an issue.
my %viacode;
+my $no_name_code_points_re = join "|", map { sprintf("%05X",
+ utf8::unicode_to_native($_)) }
+ 0x80, 0x81, 0x84, 0x99;
+$no_name_code_points_re = qr/$no_name_code_points_re/;
+
sub viacode {
# Returns the name of the code point argument
if ($arg =~ $decimal_qr) {
$hex = sprintf "%05X", $arg;
} elsif ($arg =~ $hex_qr) {
+ $hex = CORE::hex $1;
+ $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/;
# Below is the line that differs from the _getcode() source
- $hex = sprintf "%05X", hex $1;
+ $hex = sprintf "%05X", $hex;
} else {
carp("unexpected arg \"$arg\" to charnames::viacode()");
return;
$return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
# If not one of these 4 code points, return what we've found.
- if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) {
+ if ($hex !~ / ^ $no_name_code_points_re $ /x) {
$viacode{$hex} = $return;
return $return;
}
# Here there is no user-defined alias, return any official one.
return $return if defined $return;
- if (CORE::hex($hex) > 0x10FFFF) {
+ if (CORE::hex($hex) > 0x10FFFF
+ && warnings::enabled('non_unicode'))
+ {
carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
}
return;
-} # _viacode
+} # viacode
1;