use strict;
use warnings;
use File::Spec;
-our $VERSION = '1.33';
+our $VERSION = '1.39';
use unicore::Name; # mktables-generated algorithmically-defined names
use bytes (); # for $bytes::hint_bits
my %system_aliases = (
- 'SINGLE-SHIFT 2' => pack("U", 0x8E),
- 'SINGLE-SHIFT 3' => pack("U", 0x8F),
- 'PRIVATE USE 1' => pack("U", 0x91),
- 'PRIVATE USE 2' => pack("U", 0x92),
+ 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)),
+ 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)),
+ 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)),
+ 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)),
);
# These are the aliases above that differ under :loose and :full matching
#);
#my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", 0x07) if $^V lt v5.17.0;
+#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0;
#my %loose_deprecated_aliases = (
#);
# hex, but makes the code easier to maintain, and is called
# infrequently, only at compile-time
if ($value !~ $decimal_qr && $value =~ $hex_qr) {
- $value = CORE::hex $1;
+ my $temp = CORE::hex $1;
+ $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/;
+ $value = $temp;
}
if ($value =~ $decimal_qr) {
- no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed
+ no warnings qw(non_unicode surrogate nonchar); # Allow any of these
$^H{charnames_ord_aliases}{$name} = pack("U", $value);
# Use a canonical form.
$^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
}
else {
+ # This regex needs to be sync'd with the code in toke.c that checks
+ # for the same thing
if ($name !~ / ^
\p{_Perl_Charname_Begin}
\p{_Perl_Charname_Continue}*
$ /x) {
+
push @errors, $name;
}
else {
$^H{charnames_name_aliases}{$name} = $value;
+
+ if (warnings::enabled('deprecated')) {
+ if ($name =~ / ( .* \s ) ( \s* ) $ /x) {
+ carp "Trailing white-space in a charnames alias definition is deprecated; marked by <-- HERE in '$1 <-- HERE " . $2 . "'";
+ }
+
+ # Use '+' instead of '*' in this regex, because any trailing
+ # blanks have already been warned about.
+ if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) {
+ carp "A sequence of multiple spaces in a charnames alias definition is deprecated; marked by <-- HERE in '$1 <-- HERE " . $2 . "'";
+ }
+ }
}
}
}
if (@errors) {
foreach my $name (@errors) {
my $ok = "";
- $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():\xa0]* ) /x;
+ my $nbsp = chr utf8::unicode_to_native(0xa0);
+ $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():$nbsp]* ) /x;
my $first_bad = substr($name, length($ok), 1);
$name = "Invalid character in charnames alias definition; marked by <-- HERE in '$ok$first_bad<-- HERE " . substr($name, length($ok) + 1) . "'";
}
if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
$file = $arg;
}
- elsif ($arg =~ m/^\w+$/) {
+ elsif ($arg =~ m/ ^ \p{_Perl_IDStart} \p{_Perl_IDCont}* $/x) {
$file = "unicore/${arg}_alias.pl";
}
else {
- croak "Charnames alias files can only have identifier characters";
+ croak "Charnames alias file names can only have identifier characters";
}
if (my @alias = do $file) {
@alias == 1 && !defined $alias[0] and
/xs)
{
# Even in non-loose matching, the script traditionally has been
- # case insensitve
+ # case insensitive
$scripts_trie = "\U$1";
$lookup_name = $2;
ref $alias eq "HASH" or
croak "Only HASH reference supported as argument to :alias";
alias ($alias);
+ $promote = 1;
next;
}
if ($alias =~ m{:(\w+)$}) {
alias_file ($1) and $promote = 1;
next;
}
- alias_file ($alias);
+ alias_file ($alias) and $promote = 1;
next;
}
if (substr($arg, 0, 1) eq ':'
# not an issue.
my %viacode;
+my $no_name_code_points_re = join "|", map { sprintf("%05X",
+ utf8::unicode_to_native($_)) }
+ 0x80, 0x81, 0x84, 0x99;
+$no_name_code_points_re = qr/$no_name_code_points_re/;
+
sub viacode {
# Returns the name of the code point argument
if ($arg =~ $decimal_qr) {
$hex = sprintf "%05X", $arg;
} elsif ($arg =~ $hex_qr) {
+ $hex = CORE::hex $1;
+ $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/;
# Below is the line that differs from the _getcode() source
- $hex = sprintf "%05X", hex $1;
+ $hex = sprintf "%05X", $hex;
} else {
carp("unexpected arg \"$arg\" to charnames::viacode()");
return;
$return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
# If not one of these 4 code points, return what we've found.
- if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) {
+ if ($hex !~ / ^ $no_name_code_points_re $ /x) {
$viacode{$hex} = $return;
return $return;
}
# Here there is no user-defined alias, return any official one.
return $return if defined $return;
- if (CORE::hex($hex) > 0x10FFFF) {
+ if (CORE::hex($hex) > 0x10FFFF
+ && warnings::enabled('non_unicode'))
+ {
carp "Unicode characters only allocated up to U+10FFFF (you asked for U+$hex)";
}
return;
-} # _viacode
+} # viacode
1;