# 5.8: needs pack "U". But almost all occurrences of objaddr have been
# removed in favor of using 'no overloading'. You also would have to go
# through and replace occurrences like:
-# my $addr; { no overloading; $addr = 0+$self; }
+# my $addr = do { no overloading; pack 'J', $self; }
# with
# my $addr = main::objaddr $self;
# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
-# that instituted this change.)
+# that instituted the change to main::objaddr, and subsequent commits that
+# changed 0+$self to pack 'J', $self.)
+
+my $start_time;
+BEGIN { # Get the time the script started running; do it at compiliation to
+ # get it as close as possible
+ $start_time= time;
+}
+
require 5.010_001;
use strict;
# is nonsensical.
#
# There are no match tables generated for matches of the null string. These
-# would like like qr/\p{JSN=}/ currently without modifying the regex code.
+# would look like qr/\p{JSN=}/ currently without modifying the regex code.
# Perhaps something like them could be added if necessary. The JSN does have
# a real code point U+110B that maps to the null string, but it is a
# contributory property, and therefore not output by default. And it's easily
# kPrimaryNumeric property have commas and an unexpected comment. A filter
# could be added for these; or for a particular installation, the Unihan.txt
# file could be edited to fix them.
-# have to be
#
# HOW TO ADD A FILE TO BE PROCESSED
#
# Encapsulated Cleverness". p. 455 in first edition.
my %seen;
+ # Arguably this breaks encapsulation, if the goal is to permit multiple
+ # distinct objects to stringify to the same value, and be interchangeable.
+ # However, for this program, no two objects stringify identically, and all
+ # lists passed to this function are either objects or strings. So this
+ # doesn't affect correctness, but it does give a couple of percent speedup.
+ no overloading;
return grep { ! $seen{$_}++ } @_;
}
# in the input.
my $output_range_counts = 1; # ? Should we include the number of code points
# in ranges in the output
+my $output_names = 0; # ? Should character names be in the output
+my @viacode; # Contains the 1 million character names, if
+ # $output_names is true
+
# Verbosity levels; 0 is quiet
my $NORMAL_VERBOSITY = 1;
my $PROGRESS = 2;
elsif ($arg eq '-c') {
$output_range_counts = ! $output_range_counts
}
+ elsif ($arg eq '-output_names') {
+ $output_names = 1;
+ }
else {
my $with_c = 'with';
$with_c .= 'out' if $output_range_counts; # Complements the state
-maketest : Make test script 'TestProp.pl' in current (or -C directory),
overrides -T
-makelist : Rewrite the file list $file_list based on current setup
+ -output_names : Output each character's name in the table files; useful for
+ doing what-ifs, looking at diffs; is slow, memory intensive,
+ resulting tables are usable but very large.
-check A B : Executes $0 only if A and B are the same
END
}
# Stores the most-recently changed file. If none have changed, can skip the
# build
-my $youngest = -M $0; # Do this before the chdir!
+my $youngest = (stat $0)[9]; # Do this before the chdir!
# Change directories now, because need to read 'version' early.
if ($use_directory) {
# unless explicitly added.
if ($v_version ge v5.2.0) {
my $unihan = 'Unihan; remove from list if using Unihan';
- foreach my $table qw (
+ foreach my $table (qw (
kAccountingNumeric
kOtherNumeric
kPrimaryNumeric
kIRG_USource
kIRG_VSource
kRSUnicode
- )
+ ))
{
$why_suppress_if_empty_warn_if_not{$table} = $unihan;
}
$INTEGER_FORMAT => 'integer',
$HEX_FORMAT => 'positive hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
- $STRING_FORMAT => 'arbitrary string',
+ $STRING_FORMAT => 'string',
);
# Unicode didn't put such derived files in a separate directory at first.
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
-# syllables
-my $SBase = 0xAC00;
-my $LBase = 0x1100;
-my $VBase = 0x1161;
-my $TBase = 0x11A7;
+# syllables. The '_string' versions are so generated tables can retain the
+# hex format, which is the more familiar value
+my $SBase_string = "0xAC00";
+my $SBase = CORE::hex $SBase_string;
+my $LBase_string = "0x1100";
+my $LBase = CORE::hex $LBase_string;
+my $VBase_string = "0x1161";
+my $VBase = CORE::hex $VBase_string;
+my $TBase_string = "0x11A7";
+my $TBase = CORE::hex $TBase_string;
my $SCount = 11172;
my $LCount = 19;
my $VCount = 21;
my $gc;
my $perl;
my $block;
+my $perl_charname;
+my $print;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $has_In_conflicts = 0;
no overloading; # If overloaded, numifying below won't work.
# Numifying a ref gives its address.
- return 0 + $_[0];
+ return pack 'J', $_[0];
}
# Commented code below should work on Perl 5.8.
# bless $_[0], 'main::Fake';
#
# # Numifying a ref gives its address.
-# my $addr = 0 + $_[0];
+# my $addr = pack 'J', $_[0];
#
# # Return to original class
# bless $_[0], $pkg;
# Use typeglob to give the anonymous subroutine the name we want
*$destroy_name = sub {
my $self = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$self->$destroy_callback if $destroy_callback;
foreach my $field (keys %{$package_fields{$package}}) {
# "protection" is only by convention. All that happens is that the
# accessor functions' names begin with an underscore. So instead of
# calling set_foo, the call is _set_foo. (Real protection could be
- # accomplished by having a new subroutine, end_package called at the
+ # accomplished by having a new subroutine, end_package, called at the
# end of each package, and then storing the __LINE__ ranges and
# checking them on every accessor. But that is way overkill.)
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $self = shift;
my $value = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
if (ref $value) {
return if grep { $value == $_ } @{$field->{$addr}};
*$subname = sub {
use strict "refs";
Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
- my $addr; { no overloading; $addr = 0+$_[0]; }
+ my $addr = do { no overloading; pack 'J', $_[0]; };
if (ref $field->{$addr} ne 'ARRAY') {
my $type = ref $field->{$addr};
$type = 'scalar' unless $type;
use strict "refs";
Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
no overloading;
- return $field->{0+$_[0]};
+ return $field->{pack 'J', $_[0]};
}
}
}
}
# $self is $_[0]; $value is $_[1]
no overloading;
- $field->{0+$_[0]} = $_[1];
+ $field->{pack 'J', $_[0]} = $_[1];
return;
}
}
my $class = shift;
my $self = bless \do{ my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $file = $file{$addr};
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Here the file is open (or if the handle is not a ref, is an open
# 'virtual' file). Get the next line; any inserted lines get priority
# # an each_line_handler() on the line.
#
# my $self = shift;
-# my $addr; { no overloading; $addr = 0+$self; }
+# my $addr = do { no overloading; pack 'J', $self; };
#
# foreach my $inserted_ref (@{$added_lines{$addr}}) {
# my ($adjusted, $line) = @{$inserted_ref};
# indicate that this line hasn't been adjusted, and needs to be
# processed.
no overloading;
- push @{$added_lines{0+$self}}, map { [ 0, $_ ] } @_;
+ push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
return;
}
# Each inserted line is an array, with the first element being 1 to
# indicate that this line has been adjusted
no overloading;
- push @{$added_lines{0+$self}}, map { [ 1, $_ ] } @_;
+ push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# If not accepting a list return, just return the first one.
return shift @{$missings{$addr}} unless wantarray;
# Add a property field to $_, if this file requires it.
my $self = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $property = $property{$addr};
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $message = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$message = 'Unexpected line' unless $message;
my $class = shift;
my $self = bless \do{my $anonymous_scalar}, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
while (@_ > 1) {
my $default = shift;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return each %{$class_defaults{$addr}};
}
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$name{$addr} = shift;
$loose_match{$addr} = shift;
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$start{$addr} = shift;
$end{$addr} = shift;
sub _operator_stringify {
my $self = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Output it like '0041..0065 (value)'
my $return = sprintf("%04X", $start{$addr})
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return $standard_form{$addr} if defined $standard_form{$addr};
return $value{$addr};
my $indent = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $return = $indent
. sprintf("%04X", $start{$addr})
return _union($class, $initialize, %args) if defined $initialize;
$self = bless \do { my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Optional parent object, only for debug info.
$owner_name_of{$addr} = delete $args{'Owner'};
sub _operator_stringify {
my $self = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return "Range_List attached to '$owner_name_of{$addr}'"
if $owner_name_of{$addr};
my $message = "";
if (defined $self) {
no overloading;
- $message .= $owner_name_of{0+$self};
+ $message .= $owner_name_of{pack 'J', $self};
}
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
my $message = "";
if (defined $self) {
no overloading;
- $message .= $owner_name_of{0+$self};
+ $message .= $owner_name_of{pack 'J', $self};
}
Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
return;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
no overloading;
- return scalar @{$ranges{0+$self}};
+ return scalar @{$ranges{pack 'J', $self}};
}
sub min {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# If the range list is empty, return a large value that isn't adjacent
# to any that could be in the range list, for simpler tests
# So is in the table if and only iff it is at least the start position
# of range $i.
no overloading;
- return 0 if $ranges{0+$self}->[$i]->start > $codepoint;
+ return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
return $i + 1;
}
- sub value_of {
- # Returns the value associated with the code point, undef if none
+ sub containing_range {
+ # Returns the range object that contains the code point, undef if none
my $self = shift;
my $codepoint = shift;
# contains() returns 1 beyond where we should look
no overloading;
- return $ranges{0+$self}->[$i-1]->value;
+ return $ranges{pack 'J', $self}->[$i-1];
+ }
+
+ sub value_of {
+ # Returns the value associated with the code point, undef if none
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->value;
+ }
+
+ sub type_of {
+ # Returns the type of the range containing the code point, undef if
+ # the code point is not in the table
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->type;
}
sub _search_ranges {
my $code_point = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return if $code_point > $max{$addr};
my $r = $ranges{$addr}; # The current list of ranges
#
# The range list is kept sorted so that the range with the lowest
# starting position is first in the list, and generally, adjacent
- # ranges with the same values are merged into single larger one (see
+ # ranges with the same values are merged into a single larger one (see
# exceptions below).
#
- # There are more parameters, all are key => value pairs:
+ # There are more parameters; all are key => value pairs:
# Type gives the type of the value. It is only valid for '+'.
# All ranges have types; if this parameter is omitted, 0 is
# assumed. Ranges with type 0 are assumed to obey the
# => $IF_NOT_EQUIVALENT means to replace the existing values
# with this one if they are not equivalent.
# Ranges are equivalent if their types are the
- # same, and they are the same string, or if
+ # same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# standard forms are identical. In this last
# case, the routine chooses the more "modern"
# multiple times.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
- # "same value" means identical for type-0 ranges, and it means having
- # the same standard forms for non-type-0 ranges.
+ # "same value" means identical for non-type-0 ranges, and it means
+ # having the same standard forms for type-0 ranges.
return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
if ($operation ne '+' && $operation ne '-') {
Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
no overloading;
- undef $each_range_iterator{0+$self};
+ undef $each_range_iterator{pack 'J', $self};
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return if $self->is_empty;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $count = 0;
foreach my $range (@{$ranges{$addr}}) {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
no overloading;
- return scalar @{$ranges{0+$self}} == 0;
+ return scalar @{$ranges{pack 'J', $self}} == 0;
}
sub hash {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# These are quickly computable. Return looks like 'min..max;count'
return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
return $self->_add_delete('+', $start, $end, "");
}
- my $non_ASCII = (ord('A') != 65); # Assumes test on same platform
-
sub is_code_point_usable {
# This used only for making the test script. See if the input
# proposed trial code point is one that Perl will handle. If second
return 0 if $code < 0; # Never use a negative
- # For non-ASCII, we shun the characters that don't have Perl encoding-
- # independent symbols for them. 'A' is such a symbol, so is "\n".
- return $try_hard if $non_ASCII
- && $code <= 0xFF
- && ($code >= 0x7F
- || ($code >= 0x0E && $code <= 0x1F)
- || ($code >= 0x01 && $code <= 0x06)
- || $code == 0x0B);
-
# shun null. I'm (khw) not sure why this was done, but NULL would be
# the character very frequently used.
return $try_hard if $code == 0x0000;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# On first pass, don't choose less desirable code points; if no good
# one is found, repeat, allowing a less desirable one to be selected.
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my %args = @_;
$status{$addr} = delete $args{'Status'} || $NORMAL;
$status_info{$addr} = delete $args{'_Status_Info'} || "";
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
+ $range_size_1{$addr} = 1 if $output_names; # Make sure 1 name per line
my $description = delete $args{'Description'};
my $externally_ok = delete $args{'Externally_Ok'};
# Here are the methods that are required to be defined by any derived
# class
- for my $sub qw(
+ for my $sub (qw(
append_to_body
pre_body
- )
+ ))
# append_to_body and pre_body are called in the write() method
# to add stuff after the main body of the table, but before
# its close; and to prepend stuff before the beginning of the
# Returns the array of ranges associated with this table.
no overloading;
- return $range_list{0+shift}->ranges;
+ return $range_list{pack 'J', shift}->ranges;
}
sub add_alias {
# release
$name = ucfirst($name) unless $name =~ /^k[A-Z]/;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Figure out if should be loosely matched if not already specified.
if (! defined $loose_match) {
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
no overloading;
- undef $short_name{0+$self};
+ undef $short_name{pack 'J', $self};
return;
}
my $nominal_length_ptr = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# For efficiency, don't recalculate, but this means that adding new
# aliases could change what the shortest is, so the code that does
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
no overloading;
- push @{$description{0+$self}}, $description;
+ push @{$description{pack 'J', $self}}, $description;
return;
}
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
no overloading;
- push @{$note{0+$self}}, $note;
+ push @{$note{pack 'J', $self}}, $note;
return;
}
chomp $comment;
no overloading;
- push @{$comment{0+$self}}, $comment;
+ push @{$comment{pack 'J', $self}}, $comment;
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my @list = @{$comment{$addr}};
return @list if wantarray;
my $return = "";
# initialization for range lists.
my $self = shift;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $initialization = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$return .= $DEVELOPMENT_ONLY if $compare_versions;
$return .= $HEADER;
no overloading;
- $return .= $INTERNAL_ONLY if $internal_only{0+$self};
+ $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
return $return;
}
# the range
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Start with the header
my @OUT = $self->header;
# If has or wants a single point range output
if ($start == $end || $range_size_1) {
- for my $i ($start .. $end) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &$range_size_1($i, $value);
+ }
+ }
+ else {
+ for my $i ($start .. $end) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ if ($output_names) {
+ if (! defined $viacode[$i]) {
+ $viacode[$i] =
+ Property::property_ref('Perl_Charnames')
+ ->value_of($i)
+ || "";
+ }
+ $OUT[-1] =~ s/\n/\t# $viacode[$i]\n/;
+ }
+ }
}
}
else {
my $info = shift; # Any message associated with it.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$status{$addr} = $status;
$status_info{$addr} = $info;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$locked{$addr} = "";
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return 0 if ! $locked{$addr};
Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
# Rest of parameters passed on
no overloading;
- @{$file_path{0+$self}} = @_;
+ @{$file_path{pack 'J', $self}} = @_;
return
}
# Accessors for the range list stored in this table. First for
# unconditional
- for my $sub qw(
+ for my $sub (qw(
+ containing_range
contains
count
each_range
min
range_count
reset_each_range
+ type_of
value_of
- )
+ ))
{
no strict "refs";
*$sub = sub {
use strict "refs";
my $self = shift;
no overloading;
- return $range_list{0+$self}->$sub(@_);
+ return $range_list{pack 'J', $self}->$sub(@_);
}
}
# Then for ones that should fail if locked
- for my $sub qw(
+ for my $sub (qw(
delete_range
- )
+ ))
{
no strict "refs";
*$sub = sub {
return if $self->carp_if_locked;
no overloading;
- return $range_list{0+$self}->$sub(@_);
+ return $range_list{pack 'J', $self}->$sub(@_);
}
}
_Range_List => $range_list,
%args);
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$anomalous_entries{$addr} = [];
$core_access{$addr} = $core_access;
# Can't change the table if locked.
return if $self->carp_if_locked;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$has_specials{$addr} = 1 if $type;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return "" unless @{$anomalous_entries{$addr}};
return join("\n", @{$anomalous_entries{$addr}}) . "\n";
return;
}
- my $addr; { no overloading; $addr = 0+$self; }
- my $other_addr; { no overloading; $other_addr = 0+$other; }
+ my $addr = do { no overloading; pack 'J', $self; };
+ my $other_addr = do { no overloading; pack 'J', $other; };
local $to_trace = 0 if main::DEBUG;
my $map = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Convert the input to the standard equivalent, if any (won't have any
# for $STRING properties)
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# If overridden, use that
return $to_output_map{$addr} if defined $to_output_map{$addr};
# No sense generating a comment if aren't going to write it out.
return if ! $self->to_output_map;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $property = $self->property;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $name = $self->property->swash_name;
# multiple code points. These do not appear in the main body, but are defined
# in the hash below.
-# The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
+# Each key is the string of N bytes that together make up the UTF-8 encoding
+# for the code point. (i.e. the same as looking at the code point's UTF-8
+# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
%utf8::ToSpec$name = (
END
$pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables
- my \$SBase = 0xAC00;
- my \$LBase = 0x1100;
- my \$VBase = 0x1161;
- my \$TBase = 0x11A7;
- my \$SCount = 11172;
- my \$LCount = 19;
- my \$VCount = 21;
- my \$TCount = 28;
+ my \$SBase = $SBase_string;
+ my \$LBase = $LBase_string;
+ my \$VBase = $VBase_string;
+ my \$TBase = $TBase_string;
+ my \$SCount = $SCount;
+ my \$LCount = $LCount;
+ my \$VCount = $VCount;
+ my \$TCount = $TCount;
my \$NCount = \$VCount * \$TCount;
END
} # End of has Jamos
my $L = $LBase + $SIndex / $NCount;
my $V = $VBase + ($SIndex % $NCount) / $TCount;
my $T = $TBase + $SIndex % $TCount;
- $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
+ $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
$name .= $Jamo{$T} if $T != $TBase;
return $name;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return $self->SUPER::write(
($self->property == $block)
}
# Accessors for the underlying list that should fail if locked.
- for my $sub qw(
+ for my $sub (qw(
add_duplicate
- )
+ ))
{
no strict "refs";
*$sub = sub {
_Property => $property,
_Range_List => $range_list,
);
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$conflicting{$addr} = [ ];
$equivalents{$addr} = [ ];
return if $self->carp_if_locked;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
if (ref $other) {
# be an optional parameter.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Check if the conflicting name is exactly the same as any existing
# alias in this table (as long as there is a real object there to
# Two tables are equivalent if they have the same leader.
no overloading;
- return $leader{0+$self} == $leader{0+$other};
+ return $leader{pack 'J', $self} == $leader{pack 'J', $other};
return;
}
my $are_equivalent = $self->is_equivalent_to($other);
return if ! defined $are_equivalent || $are_equivalent;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
if ($related &&
$related = 0;
}
- my $leader; { no overloading; $leader = 0+$current_leader; }
- my $other_addr; { no overloading; $other_addr = 0+$other; }
+ my $leader = do { no overloading; pack 'J', $current_leader; };
+ my $other_addr = do { no overloading; pack 'J', $other; };
# Any tables that are equivalent to or children of this table must now
# instead be equivalent to or (children) to the new leader (parent),
next if $table == $other;
trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
- my $table_addr; { no overloading; $table_addr = 0+$table; }
+ my $table_addr = do { no overloading; pack 'J', $table; };
$leader{$table_addr} = $other;
$matches_all{$table_addr} = $matches_all;
$self->_set_range_list($other->_range_list);
# an equivalent group
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$leader; }
+ my $addr = do { no overloading; pack 'J', $leader; };
if ($leader{$addr} != $leader) {
Carp::my_carp_bug(<<END
&& $parent == $property->table('N')
&& defined (my $yes = $property->table('Y')))
{
- my $yes_addr; { no overloading; $yes_addr = 0+$yes; }
+ my $yes_addr = do { no overloading; pack 'J', $yes; };
@yes_perl_synonyms
= grep { $_->property == $perl }
main::uniques($yes,
my @conflicting; # Will hold the table conflicts.
# Look at the parent, any yes synonyms, and all the children
- my $parent_addr; { no overloading; $parent_addr = 0+$parent; }
+ my $parent_addr = do { no overloading; pack 'J', $parent; };
for my $table ($parent,
@yes_perl_synonyms,
@{$children{$parent_addr}})
{
- my $table_addr; { no overloading; $table_addr = 0+$table; }
+ my $table_addr = do { no overloading; pack 'J', $table; };
my $table_property = $table->property;
# Tables are separated by a blank line to create a grouping.
}
# Accessors for the underlying list
- for my $sub qw(
+ for my $sub (qw(
get_valid_code_point
get_invalid_code_point
- )
+ ))
{
no strict "refs";
*$sub = sub {
my %args = @_;
$self = bless \do { my $anonymous_scalar }, $class;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
$directory{$addr} = delete $args{'Directory'};
$file{$addr} = delete $args{'File'};
}
else {
no overloading;
- $map{0+$self}->delete_range($other, $other);
+ $map{pack 'J', $self}->delete_range($other, $other);
}
return $self;
}
my $name = shift;
my %args = @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $table = $table_ref{$addr}{$name};
my $standard_name = main::standardize($name);
my $name = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
# property
no overloading;
- return main::uniques(values %{$table_ref{0+shift}});
+ return main::uniques(values %{$table_ref{pack 'J', shift}});
}
sub directory {
# priority; 'undef' is returned if the type isn't defined;
# or $map_directory for everything else.
- my $addr; { no overloading; $addr = 0+shift; }
+ my $addr = do { no overloading; pack 'J', shift; };
return $directory{$addr} if defined $directory{$addr};
return undef if $type{$addr} == $UNKNOWN;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
return $file{$addr} if defined $file{$addr};
return $map{$addr}->external_name;
# The whole point of this pseudo property is match tables.
return 1 if $self == $perl;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# Don't generate tables of code points that match the property values
# of a string property. Such a list would most likely have many
}
no overloading;
- return $map{0+$self}->map_add_or_replace_non_nulls($map{0+$other});
+ return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
}
sub set_type {
return;
}
- { no overloading; $type{0+$self} = $type; }
+ { no overloading; $type{pack 'J', $self} = $type; }
return if $type != $BINARY;
my $yes = $self->table('Y');
my $map = shift; # What the range maps to.
# Rest of parameters passed on.
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
# If haven't the type of the property, gather information to figure it
# out.
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$self; }
+ my $addr = do { no overloading; pack 'J', $self; };
my $type = $type{$addr};
# Most of the accessors for a property actually apply to its map table.
# Setup up accessor functions for those, referring to %map
- for my $sub qw(
+ for my $sub (qw(
add_alias
add_anomalous_entry
add_comment
aliases
comment
complete_name
+ containing_range
core_access
count
default_map
status
status_info
to_output_map
+ type_of
value_of
write
- )
+ ))
# 'property' above is for symmetry, so that one can take
# the property of a property and get itself, and so don't
# have to distinguish between properties and tables in
use strict "refs";
my $self = shift;
no overloading;
- return $map{0+$self}->$sub(@_);
+ return $map{pack 'J', $self}->$sub(@_);
}
}
push @files_actually_output, $file;
- my $text;
- if (@$lines_ref) {
- $text = join "", @$lines_ref;
- }
- else {
- $text = "";
+ unless (@$lines_ref) {
Carp::my_carp("Output file '$file' is empty; writing it anyway;");
}
Carp::my_carp("can't open $file for output. Skipping this file: $!");
return;
}
+
+ print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
+ close $OUT or die Carp::my_carp("close '$file' failed: $!");
+
print "$file written.\n" if $verbosity >= $VERBOSE;
- print $OUT $text;
- close $OUT;
return;
}
else {
# Keep track of cycles in the input, and refuse to infinitely loop
- my $addr; { no overloading; $addr = 0+$item; }
+ my $addr = do { no overloading; pack 'J', $item; };
if (defined $already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
my $fields_ref = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr; { no overloading; $addr = 0+$object; }
+ my $addr = do { no overloading; pack 'J', $object; };
my %hash;
foreach my $key (keys %$fields_ref) {
}
else {
my $ref = ref $$which;
- my $addr; { no overloading; $addr = 0+$$which; }
+ my $addr = do { no overloading; pack 'J', $$which; };
$$which = "$ref ($addr)";
}
}
return 0 unless defined $other;
return 0 unless ref $other;
no overloading;
- return 0+$self == 0+$other;
+ return $self == $other;
}
sub _operator_not_equal {
;
# The defaults apply only to unassigned characters
- $default_R .= '$gc->table("Cn") & $default;';
+ $default_R .= '$gc->table("Unassigned") & $default;';
if ($v_version lt v3.0.0) {
$default = Multi_Default->new(R => $default_R, 'L');
if ($v_version ge 3.1.0) {
$default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
}
- $default_AL .= '$gc->table("Cn") & $default';
+ $default_AL .= '$gc->table("Unassigned") & $default';
$default = Multi_Default->new(AL => $default_AL,
R => $default_R,
'L');
return @return;
}
+sub output_perl_charnames_line ($$) {
+
+ # Output the entries in Perl_charnames specially, using 5 digits instead
+ # of four. This makes the entries a constant length, and simplifies
+ # charnames.pm which this table is for. Unicode can have 6 digit
+ # ordinals, but they are all private use or noncharacters which do not
+ # have names, so won't be in this table.
+
+ return sprintf "%05X\t%s\n", $_[0], $_[1];
+}
+
{ # Closure
# This is used to store the range list of all the code points usable when
# the little used $compare_versions feature is enabled.
$file->carp_bad_line("Unexpected property '$property_name'. Skipped");
next LINE;
}
- { no overloading; $property_addr = 0+($property_object); }
+ { no overloading; $property_addr = pack 'J', $property_object; }
# Defer changing names until have a line that is acceptable
# (the 'next' statement above means is unacceptable)
if $file->has_missings_defaults;
foreach my $default_ref (@missings_list) {
my $default = $default_ref->[0];
- my $addr; { no overloading; $addr = 0+property_ref($default_ref->[1]); }
+ my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
# For string properties, the default is just what the
# file says, but non-string properties should already
}
}
-# XXX Unused until revise charnames;
-#sub check_and_handle_compound_name {
-# This looks at Name properties for parenthesized components and splits
-# them off. Thus it finds FF as an equivalent to Form Feed.
-# my $code_point = shift;
-# my $name = shift;
-# if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
-# #local $to_trace = 1 if main::DEBUG;
-# trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
-# push @more_Names, "$code_point; $1";
-# push @more_Names, "$code_point; $3";
-# Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " ";
-# Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne "";
-# }
-# return;
-#}
-
{ # Closure for UnicodeData.txt handling
# This file was the first one in the UCD; its design leads to some
# 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# The fields in order are:
my $i = 0; # The code point is in field 0, and is shifted off.
- my $NAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
+ my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
my $CATEGORY = $i++; # category (e.g. "Lu")
my $CCC = $i++; # Canonical combining class (e.g. "230")
my $BIDI = $i++; # directional class (e.g. "L")
# This routine in addition outputs these extra fields:
my $DECOMP_TYPE = $i++; # Decomposition type
- my $DECOMP_MAP = $i++; # Must be last; another decomposition mapping
+
+ # These fields are modifications of ones above, and are usually
+ # suppressed; they must come last, as for speed, the loop upper bound is
+ # normally set to ignore them
+ my $NAME = $i++; # This is the strict name field, not the one that
+ # charnames uses.
+ my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
+ # by Unicode::Normalize
my $last_field = $i - 1;
# All these are read into an array for each line, with the indices defined
$field_names[$BIDI] = 'Bidi_Class';
$field_names[$CATEGORY] = 'General_Category';
$field_names[$CCC] = 'Canonical_Combining_Class';
+ $field_names[$CHARNAME] = 'Perl_Charnames';
$field_names[$COMMENT] = 'ISO_Comment';
$field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
$field_names[$DECOMP_TYPE] = 'Decomposition_Type';
$field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
$field_names[$UPPER] = 'Uppercase_Mapping';
- # Some of these need a little more explanation. The $PERL_DECIMAL_DIGIT
- # field does not lead to an official Unicode property, but is used in
- # calculating the Numeric_Type. Perl however, creates a file from this
- # field, so a Perl property is created from it. Similarly, the Other
- # Digit field is used only for calculating the Numeric_Type, and so it can
- # be safely re-used as the place to store the value for Numeric_Type;
- # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT. The input field
- # named $PERL_DECOMPOSITION is a combination of both the decomposition
- # mapping and its type. Perl creates a file containing exactly this
- # field, so it is used for that. The two properties are separated into
- # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
+ # Some of these need a little more explanation:
+ # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
+ # property, but is used in calculating the Numeric_Type. Perl however,
+ # creates a file from this field, so a Perl property is created from it.
+ # Similarly, the Other_Digit field is used only for calculating the
+ # Numeric_Type, and so it can be safely re-used as the place to store
+ # the value for Numeric_Type; hence it is referred to as
+ # $NUMERIC_TYPE_OTHER_DIGIT.
+ # The input field named $PERL_DECOMPOSITION is a combination of both the
+ # decomposition mapping and its type. Perl creates a file containing
+ # exactly this field, so it is used for that. The two properties are
+ # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
+ # $DECOMP_MAP is usually suppressed (unless the lists are changed to
+ # output it), as Perl doesn't use it directly.
+ # The input field named here $CHARNAME is used to construct the
+ # Perl_Charnames property, which is a combination of the Name property
+ # (which the input field contains), and the Unicode_1_Name property, and
+ # others from other files. Since, the strict Name property is not used
+ # by Perl, this field is used for the table that Perl does use. The
+ # strict Name property table is usually suppressed (unless the lists are
+ # changed to output it), so it is accumulated in a separate field,
+ # $NAME, which to save time is discarded unless the table is actually to
+ # be output
# This file is processed like most in this program. Control is passed to
# process_generic_property_file() which calls filter_UnicodeData_line()
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ # Create a new property specially located that is a combination of the
+ # various Name properties: Name, Unicode_1_Name, Named Sequences, and
+ # Name_Alias properties. (The final duplicates elements of the
+ # first.) A comment for it will later be constructed based on the
+ # actual properties present and used
+ $perl_charname = Property->new('Perl_Charnames',
+ Core_Access => '\N{...} and "use charnames"',
+ Default_Map => "",
+ Directory => File::Spec->curdir(),
+ File => 'Name',
+ Internal_Only_Warning => 1,
+ Perl_Extension => 1,
+ Range_Size_1 => \&output_perl_charnames_line,
+ Type => $STRING,
+ );
+
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
File => 'Decomposition',
END
));
- # This property is not used for generating anything else, and is
- # usually not output. By making it last in the list, we can just
+ # These properties are not used for generating anything else, and are
+ # usually not output. By making them last in the list, we can just
# change the high end of the loop downwards to avoid the work of
- # generating a table that is just going to get thrown away.
- if (! property_ref('Decomposition_Mapping')->to_output_map) {
- $last_field--;
+ # generating a table(s) that is/are just going to get thrown away.
+ if (! property_ref('Decomposition_Mapping')->to_output_map
+ && ! property_ref('Name')->to_output_map)
+ {
+ $last_field = min($NAME, $DECOMP_MAP) - 1;
+ } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
+ $last_field = $DECOMP_MAP;
+ } elsif (property_ref('Name')->to_output_map) {
+ $last_field = $NAME;
}
return;
}
# Certain fields just haven't been empty so far in any Unicode
# version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
# $CATEGORY. This leaves just the two fields, and so we hard-code in
- # the defaults; which are verly unlikely to ever change.
+ # the defaults; which are very unlikely to ever change.
$fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
$fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
# D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
# that define ranges. These should be processed after the fields are
# adjusted above, as they may override some of them; but mostly what
- # is left is to possibly adjust the $NAME field. The names of all the
+ # is left is to possibly adjust the $CHARNAME field. The names of all the
# paired lines start with a '<', but this is also true of '<control>,
# which isn't one of these special ones.
- if ($fields[$NAME] eq '<control>') {
+ if ($fields[$CHARNAME] eq '<control>') {
# Some code points in this file have the pseudo-name
# '<control>', but the official name for such ones is the null
- # string.
+ # string. For charnames.pm, we use the Unicode version 1 name
$fields[$NAME] = "";
+ $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
# We had better not be in between range lines.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
$in_range = 0;
}
}
- elsif (substr($fields[$NAME], 0, 1) ne '<') {
+ elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
# Here is a non-range line. We had better not be in between range
# lines.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
$in_range = 0;
}
- # XXX until charnames catches up.
-# if ($fields[$NAME] =~ s/- $cp $//x) {
-#
-# # These are code points whose names end in their code points,
-# # which means the names are algorithmically derivable from the
-# # code points. To shorten the output Name file, the algorithm
-# # for deriving these is placed in the file instead of each
-# # code point, so they have map type $CP_IN_NAME
-# $fields[$NAME] = $CMD_DELIM
-# . $MAP_TYPE_CMD
-# . '='
-# . $CP_IN_NAME
-# . $CMD_DELIM
-# . $fields[$NAME];
-# }
-
- # Some official names are really two alternate names with one in
- # parentheses. What we do here is use the full official one for
- # the standard property (stored just above), but for the charnames
- # table, we add two more entries, one for each of the alternate
- # ones.
- # elsif name ne ""
- #check_and_handle_compound_name($cp, $fields[$NAME]);
- #check_and_handle_compound_name($cp, $unicode_1_name);
- # XXX until charnames catches up.
- }
- elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
- $fields[$NAME] = $1;
+ if ($fields[$CHARNAME] =~ s/- $cp $//x) {
+
+ # These are code points whose names end in their code points,
+ # which means the names are algorithmically derivable from the
+ # code points. To shorten the output Name file, the algorithm
+ # for deriving these is placed in the file instead of each
+ # code point, so they have map type $CP_IN_NAME
+ $fields[$CHARNAME] = $CMD_DELIM
+ . $MAP_TYPE_CMD
+ . '='
+ . $CP_IN_NAME
+ . $CMD_DELIM
+ . $fields[$CHARNAME];
+ }
+ $fields[$NAME] = $fields[$CHARNAME];
+ }
+ elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
+ $fields[$CHARNAME] = $fields[$NAME] = $1;
# Here we are at the beginning of a range pair.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
}
$in_range = 1;
$force_output = 1;
}
- elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
- $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME]. Ignoring this line.");
+ elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
+ $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
$_ = "";
return;
}
else { # Here, we are at the last line of a range pair.
if (! $in_range) {
- $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one. Ignoring this line.");
+ $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
$_ = "";
return;
}
$in_range = 0;
+ $fields[$NAME] = $fields[$CHARNAME];
+
# Check that the input is valid: that the closing of the range is
# the same as the beginning.
foreach my $i (0 .. $last_field) {
}
# The processing differs depending on the type of range,
- # determined by its $NAME
- if ($fields[$NAME] =~ /^Hangul Syllable/) {
+ # determined by its $CHARNAME
+ if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
# Check that the data looks right.
if ($decimal_previous_cp != $SBase) {
# This range is stored in our internal structure with its
# own map type, different from all others.
- $previous_fields[$NAME] = $CMD_DELIM
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME]
+ = $CMD_DELIM
. $MAP_TYPE_CMD
. '='
. $HANGUL_SYLLABLE
. $CMD_DELIM
- . $fields[$NAME];
+ . $fields[$CHARNAME];
}
- elsif ($fields[$NAME] =~ /^CJK/) {
+ elsif ($fields[$CHARNAME] =~ /^CJK/) {
# The name for these contains the code point itself, and all
# are defined to have the same base name, regardless of what
# is in the file. They are stored in our internal structure
# with a map type of $CP_IN_NAME
- $previous_fields[$NAME] = $CMD_DELIM
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME]
+ = $CMD_DELIM
. $MAP_TYPE_CMD
. '='
. $CP_IN_NAME
# null, as there are no names for the private use and
# surrogate code points.
- $previous_fields[$NAME] = "";
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
}
else {
- $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY]. Attempting to process it.");
+ $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
}
# The first line of the range caused everything else to be output,
# code in this subroutine that does the same thing, but doesn't
# know about these ranges.
$_ = "";
+
return;
}
#
# This just adds the sequence to an array for later handling
- return; # XXX Until charnames catches up
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
"Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
next;
}
- push @named_sequences, "$sequence\t\t$name";
+
+ # Note single \t in keeping with special output format of
+ # Perl_charnames. But it turns out that the code points don't have to
+ # be 5 digits long, like the rest, based on the internal workings of
+ # charnames.pm. This could be easily changed for consistency.
+ push @named_sequences, "$sequence\t$name";
}
return;
}
# The simple version's name in each mapping merely has an 's' in front
# of the full one's
my $simple = property_ref('s' . $case);
- $simple->initialize($case) if $simple->to_output_map();
+ $simple->initialize($full) if $simple->to_output_map();
}
return;
Initialize => $Graph & $ASCII,
);
- my $Print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
$perl->add_match_table("PosixPrint",
- Description =>
+ Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
- Initialize => $Print & $ASCII,
+ Initialize => $print & $ASCII,
);
my $Punct = $perl->add_match_table('Punct');
$lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
}
- # Create a new property specially located that is a combination of the
- # various Name properties: Name, Unicode_1_Name, Named Sequences, and
- # Name_Alias properties. (The final duplicates elements of the first.) A
- # comment for it is constructed based on the actual properties present and
- # used
- my $perl_charname = Property->new('Perl_Charnames',
- Core_Access => '\N{...} and charnames.pm',
- Default_Map => "",
- Directory => File::Spec->curdir(),
- File => 'Name',
- Internal_Only_Warning => 1,
- Perl_Extension => 1,
- Range_Size_1 => 1,
- Type => $STRING,
- Initialize => property_ref('Unicode_1_Name'),
- );
- # Name overrides Unicode_1_Name
- $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
+ # Was previously constructed to contain both Name and Unicode_1_Name
my @composition = ('Name', 'Unicode_1_Name');
if (@named_sequences) {
$comment .= ", and $composition[-1]";
}
- # Wait for charnames to catch up
-# foreach my $entry (@more_Names,
-# split "\n", <<"END"
-#000A; LF
-#000C; FF
-#000D; CR
-#0085; NEL
-#200C; ZWNJ
-#200D; ZWJ
-#FEFF; BOM
-#FEFF; BYTE ORDER MARK
-#END
-# ) {
-# #local $to_trace = 1 if main::DEBUG;
-# trace $entry if main::DEBUG && $to_trace;
-# my ($code_point, $name) = split /\s*;\s*/, $entry;
-# $code_point = hex $code_point;
-# trace $code_point, $name if main::DEBUG && $to_trace;
-# $perl_charname->add_duplicate($code_point, $name);
-# }
-# #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
Unicode_1_Name entries are used only for otherwise nameless code
The Name property is accessible through the \\N{} interpolation in
double-quoted strings and regular expressions, but both usages require a C<use
-charnames;> to be specified, which also contains related functions viacode()
-and vianame().
+charnames;> to be specified, which also contains related functions viacode(),
+vianame(), and string_vianame().
=head1 Unicode regular expression properties that are NOT accepted by Perl
For example, the uc() function implements the Uppercase_Mapping property and
uses the F<Upper.pl> file found in this directory.
-The available files with their properties (short names in parentheses),
-and any flags or comments about them, are:
+The available files in the current installation, with their properties (short
+names in parentheses), and any flags or comments about them, are:
@map_tables_actually_output
. $spaces_after;
}
-sub generate_tests($$$$$$) {
+sub generate_tests($$$$$) {
# This used only for making the test script. It generates test cases that
# are expected to compile successfully in perl. Note that the lhs and
# rhs are assumed to already be as randomized as the caller wants.
- my $file_handle = shift; # Where to output the tests
my $lhs = shift; # The property: what's to the left of the colon
# or equals separator
my $rhs = shift; # The property value; what's to the right
# The whole 'property=value'
my $name = "$lhs$separator$rhs";
+ my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
- printf $file_handle
- qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
+ push @output, <<"EOC"
+Expect(1, $valid_code, '\\p{$name}', $warning);
+Expect(0, $valid_code, '\\p{^$name}', $warning);
+Expect(0, $valid_code, '\\P{$name}', $warning);
+Expect(1, $valid_code, '\\P{^$name}', $warning);
+EOC
}
if (defined $invalid_code) {
- printf $file_handle
- qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
- }
- return;
+ push @output, <<"EOC"
+Expect(0, $invalid_code, '\\p{$name}', $warning);
+Expect(1, $invalid_code, '\\p{^$name}', $warning);
+Expect(1, $invalid_code, '\\P{$name}', $warning);
+Expect(0, $invalid_code, '\\P{^$name}', $warning);
+EOC
+ }
+ return @output;
}
-sub generate_error($$$$) {
+sub generate_error($$$) {
# This used only for making the test script. It generates test cases that
# are expected to not only not match, but to be syntax or similar errors
- my $file_handle = shift; # Where to output to.
my $lhs = shift; # The property: what's to the left of the
# colon or equals separator
my $rhs = shift; # The property value; what's to the right
my $property = $lhs . $separator . $rhs;
- print $file_handle qq/Error('\\p{$property}');\n/;
- print $file_handle qq/Error('\\P{$property}');\n/;
- return;
+ return <<"EOC";
+Error('\\p{$property}');
+Error('\\P{$property}');
+EOC
}
# These are used only for making the test script
$t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
- force_unlink ($t_path);
- push @files_actually_output, $t_path;
- my $OUT;
- if (not open $OUT, "> $t_path") {
- Carp::my_carp("Can't open $t_path. Skipping: $!");
- return;
- }
-
# Keep going down an order of magnitude
# until find that adding this quantity to
# 1 remains 1; but put an upper limit on
# use previous one
$min_floating_slop = $next;
}
- print $OUT $HEADER, <DATA>;
+
+ # It doesn't matter whether the elements of this array contain single lines
+ # or multiple lines. main::write doesn't count the lines.
+ my @output;
foreach my $property (property_ref('*')) {
foreach my $table ($property->tables) {
my $already_error = ! $table->file_path;
# Generate error cases for this alias.
- generate_error($OUT,
- $property_name,
- $table_name,
- $already_error);
+ push @output, generate_error($property_name,
+ $table_name,
+ $already_error);
# If the table is guaranteed to always generate an error,
# quit now without generating success cases.
# Don't output duplicate test cases.
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
- generate_tests($OUT,
- $property_name,
- $standard,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $standard,
+ $valid,
+ $invalid,
+ $warning,
+ );
}
$random = randomize_loose_name($table_name)
}
my $test_name = "$property_name=$random";
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
- generate_tests($OUT,
- $property_name,
- $random,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $random,
+ $valid,
+ $invalid,
+ $warning,
+ );
# If the name is a rational number, add tests for the
# floating point equivalent.
if abs($table_name - $existing)
< $MAX_FLOATING_SLOP;
}
- generate_error($OUT,
- $property_name,
- $table_name,
- 1 # 1 => already an error
- );
+ push @output, generate_error($property_name,
+ $table_name,
+ 1 # 1 => already an error
+ );
}
else {
# Here the number of digits exceeds the
# minimum we think is needed. So generate a
# success test case for it.
- generate_tests($OUT,
- $property_name,
- $table_name,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $table_name,
+ $valid,
+ $invalid,
+ $warning,
+ );
}
}
}
}
}
- foreach my $test (@backslash_X_tests) {
- print $OUT "Test_X('$test');\n";
- }
-
- print $OUT "Finished();\n";
- close $OUT;
+ &write($t_path, [<DATA>,
+ @output,
+ (map {"Test_X('$_');\n"} @backslash_X_tests),
+ "Finished();\n"]);
return;
}
}, File::Spec->curdir());
my @mktables_list_output_files;
+my $old_start_time = 0;
-if ($write_unchanged_files) {
+if (! -e $file_list) {
+ print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
+ $write_unchanged_files = 1;
+} elsif ($write_unchanged_files) {
print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
}
else {
print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
my $file_handle;
if (! open $file_handle, "<", $file_list) {
- Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
+ Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
$glob_list = 1;
}
else {
for my $list ( \@input, \@mktables_list_output_files ) {
while (<$file_handle>) {
s/^ \s+ | \s+ $//xg;
+ if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
+ $old_start_time = $1;
+ }
next if /^ \s* (?: \# .* )? $/x;
last if /^ =+ $/x;
my ( $file ) = split /\t/;
# We set $youngest to be the most recently changed input file, including this
# program itself (done much earlier in this file)
foreach my $in (@input_files) {
- my $age = -M $in;
- next unless defined $age; # Keep going even if missing a file
- $youngest = $age if $age < $youngest;
+ next unless -e $in; # Keep going even if missing a file
+ my $mod_time = (stat $in)[9];
+ $youngest = $mod_time if $mod_time > $youngest;
# See that the input files have distinct names, to warn someone if they
# are adding a new one
}
}
-my $ok = ! $write_unchanged_files
- && scalar @mktables_list_output_files; # If none known, rebuild
+my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
+ || ! scalar @mktables_list_output_files # or if no outputs known
+ || $old_start_time < $youngest; # or out-of-date
# Now we check to see if any output files are older than youngest, if
# they are, we need to continue on, otherwise we can presumably bail.
-if ($ok) {
+if (! $rebuild) {
foreach my $out (@mktables_list_output_files) {
if ( ! file_exists($out)) {
print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
#local $to_trace = 1 if main::DEBUG;
- trace $youngest, -M $out if main::DEBUG && $to_trace;
- if ( -M $out > $youngest ) {
- #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
+ trace $youngest, (stat $out)[9] if main::DEBUG && $to_trace;
+ if ( (stat $out)[9] <= $youngest ) {
+ #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $youngest\n" if main::DEBUG && $to_trace;
print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
}
}
-if ($ok) {
- print "Files seem to be ok, not bothering to rebuild.\n";
+if (! $rebuild) {
+ print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
exit(0);
}
print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
return
}
else {
+ my $localtime = localtime $start_time;
print $ofh <<"END";
#
# $file_list -- File list for $0.
#
-# Autogenerated on @{[scalar localtime]}
+# Autogenerated starting on $start_time ($localtime)
#
# - First section is input files
# ($0 itself is not listed but is automatically considered an input)
use strict;
use warnings;
+# If run outside the normal test suite on an ASCII platform, you can
+# just create a latin1_to_native() function that just returns its
+# inputs, because that's the only function used from test.pl
+require "test.pl";
+
# Test qr/\X/ and the \p{} regular expression constructs. This file is
# constructed by mktables from the tables it generates, so if mktables is
# buggy, this won't necessarily catch those bugs. Tests are generated for all
my $Tests = 0;
my $Fails = 0;
-my $non_ASCII = (ord('A') != 65);
-
-# The 256 8-bit characters in ASCII ordinal order, with the ones that don't
-# have Perl names replaced by -1
-my @ascii_ordered_chars = (
- "\0",
- (-1) x 6,
- "\a", "\b", "\t", "\n",
- -1, # No Vt
- "\f", "\r",
- (-1) x 18,
- " ", "!", "\"", "#", '$', "%", "&", "'",
- "(", ")", "*", "+", ",", "-", ".", "/",
- "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
- ":", ";", "<", "=", ">", "?", "@",
- "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
- "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
- "[", "\\", "]", "^", "_", "`",
- "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
- "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
- "{", "|", "}", "~",
- (-1) x 129
-);
-
-sub ASCII_ord_to_native ($) {
- # Converts input ordinal number to the native one, if can be done easily.
- # Returns -1 otherwise.
-
- my $ord = shift;
-
- return $ord if $ord > 255 || ! $non_ASCII;
- my $result = $ascii_ordered_chars[$ord];
- return $result if $result eq '-1';
- return ord($result);
-}
-
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
my $warning_type = shift; # Type of warning message, like 'deprecated'
# or empty if none
my $line = (caller)[2];
-
- # Convert the non-ASCII code points expressible as characters to their
- # ASCII equivalents, and skip the others.
- $ord = ASCII_ord_to_native($ord);
- if ($ord < 0) {
- $Tests++;
- print "ok $Tests - "
- . sprintf("\"\\x{%04X}\"", $ord)
- . " =~ $regex # Skipped: non-ASCII\n";
- return;
- }
+ $ord = ord(latin1_to_native(chr($ord)));
# Convert the code point to hex form
my $string = sprintf "\"\\x{%04X}\"", $ord;
my $this_string = "";
my $this_display = "";
foreach my $code_point (@code_points) {
- my $ord = ASCII_ord_to_native(hex $code_point);
- if ($ord < 0) {
- $Tests++;
- print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
- return;
- }
- $this_string .= chr $ord;
+ $this_string .= latin1_to_native(chr(hex $code_point));
$this_display .= "\\x{$code_point}";
}