# that instituted the change to main::objaddr, and subsequent commits that
# changed 0+$self to pack 'J', $self.)
+my $start_time;
+BEGIN { # Get the time the script started running; do it at compiliation to
+ # get it as close as possible
+ $start_time= time;
+}
+
+
require 5.010_001;
use strict;
use warnings;
# is nonsensical.
#
# There are no match tables generated for matches of the null string. These
-# would like like qr/\p{JSN=}/ currently without modifying the regex code.
+# would look like qr/\p{JSN=}/ currently without modifying the regex code.
# Perhaps something like them could be added if necessary. The JSN does have
# a real code point U+110B that maps to the null string, but it is a
# contributory property, and therefore not output by default. And it's easily
# kPrimaryNumeric property have commas and an unexpected comment. A filter
# could be added for these; or for a particular installation, the Unihan.txt
# file could be edited to fix them.
-# have to be
#
# HOW TO ADD A FILE TO BE PROCESSED
#
# Stores the most-recently changed file. If none have changed, can skip the
# build
-my $youngest = -M $0; # Do this before the chdir!
+my $youngest = (stat $0)[9]; # Do this before the chdir!
# Change directories now, because need to read 'version' early.
if ($use_directory) {
$INTEGER_FORMAT => 'integer',
$HEX_FORMAT => 'positive hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
- $STRING_FORMAT => 'arbitrary string',
+ $STRING_FORMAT => 'string',
);
# Unicode didn't put such derived files in a separate directory at first.
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
-# syllables
-my $SBase = 0xAC00;
-my $LBase = 0x1100;
-my $VBase = 0x1161;
-my $TBase = 0x11A7;
+# syllables. The '_string' versions are so generated tables can retain the
+# hex format, which is the more familiar value
+my $SBase_string = "0xAC00";
+my $SBase = CORE::hex $SBase_string;
+my $LBase_string = "0x1100";
+my $LBase = CORE::hex $LBase_string;
+my $VBase_string = "0x1161";
+my $VBase = CORE::hex $VBase_string;
+my $TBase_string = "0x11A7";
+my $TBase = CORE::hex $TBase_string;
my $SCount = 11172;
my $LCount = 19;
my $VCount = 21;
my $gc;
my $perl;
my $block;
+my $perl_charname;
+my $print;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $has_In_conflicts = 0;
# "protection" is only by convention. All that happens is that the
# accessor functions' names begin with an underscore. So instead of
# calling set_foo, the call is _set_foo. (Real protection could be
- # accomplished by having a new subroutine, end_package called at the
+ # accomplished by having a new subroutine, end_package, called at the
# end of each package, and then storing the __LINE__ ranges and
# checking them on every accessor. But that is way overkill.)
return $i + 1;
}
- sub value_of {
- # Returns the value associated with the code point, undef if none
+ sub containing_range {
+ # Returns the range object that contains the code point, undef if none
my $self = shift;
my $codepoint = shift;
# contains() returns 1 beyond where we should look
no overloading;
- return $ranges{pack 'J', $self}->[$i-1]->value;
+ return $ranges{pack 'J', $self}->[$i-1];
+ }
+
+ sub value_of {
+ # Returns the value associated with the code point, undef if none
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->value;
+ }
+
+ sub type_of {
+ # Returns the type of the range containing the code point, undef if
+ # the code point is not in the table
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->type;
}
sub _search_ranges {
#
# The range list is kept sorted so that the range with the lowest
# starting position is first in the list, and generally, adjacent
- # ranges with the same values are merged into single larger one (see
+ # ranges with the same values are merged into a single larger one (see
# exceptions below).
#
- # There are more parameters, all are key => value pairs:
+ # There are more parameters; all are key => value pairs:
# Type gives the type of the value. It is only valid for '+'.
# All ranges have types; if this parameter is omitted, 0 is
# assumed. Ranges with type 0 are assumed to obey the
# => $IF_NOT_EQUIVALENT means to replace the existing values
# with this one if they are not equivalent.
# Ranges are equivalent if their types are the
- # same, and they are the same string, or if
+ # same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# standard forms are identical. In this last
# case, the routine chooses the more "modern"
# multiple times.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
- # "same value" means identical for type-0 ranges, and it means having
- # the same standard forms for non-type-0 ranges.
+ # "same value" means identical for non-type-0 ranges, and it means
+ # having the same standard forms for type-0 ranges.
return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
# Accessors for the range list stored in this table. First for
# unconditional
for my $sub (qw(
+ containing_range
contains
count
each_range
min
range_count
reset_each_range
+ type_of
value_of
))
{
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables
- my \$SBase = 0xAC00;
- my \$LBase = 0x1100;
- my \$VBase = 0x1161;
- my \$TBase = 0x11A7;
- my \$SCount = 11172;
- my \$LCount = 19;
- my \$VCount = 21;
- my \$TCount = 28;
+ my \$SBase = $SBase_string;
+ my \$LBase = $LBase_string;
+ my \$VBase = $VBase_string;
+ my \$TBase = $TBase_string;
+ my \$SCount = $SCount;
+ my \$LCount = $LCount;
+ my \$VCount = $VCount;
+ my \$TCount = $TCount;
my \$NCount = \$VCount * \$TCount;
END
} # End of has Jamos
aliases
comment
complete_name
+ containing_range
core_access
count
default_map
status
status_info
to_output_map
+ type_of
value_of
write
))
;
# The defaults apply only to unassigned characters
- $default_R .= '$gc->table("Cn") & $default;';
+ $default_R .= '$gc->table("Unassigned") & $default;';
if ($v_version lt v3.0.0) {
$default = Multi_Default->new(R => $default_R, 'L');
if ($v_version ge 3.1.0) {
$default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
}
- $default_AL .= '$gc->table("Cn") & $default';
+ $default_AL .= '$gc->table("Unassigned") & $default';
$default = Multi_Default->new(AL => $default_AL,
R => $default_R,
'L');
# Name_Alias properties. (The final duplicates elements of the
# first.) A comment for it will later be constructed based on the
# actual properties present and used
- Property->new('Perl_Charnames',
+ $perl_charname = Property->new('Perl_Charnames',
Core_Access => '\N{...} and "use charnames"',
Default_Map => "",
Directory => File::Spec->curdir(),
# Certain fields just haven't been empty so far in any Unicode
# version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
# $CATEGORY. This leaves just the two fields, and so we hard-code in
- # the defaults; which are verly unlikely to ever change.
+ # the defaults; which are very unlikely to ever change.
$fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
$fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
# code in this subroutine that does the same thing, but doesn't
# know about these ranges.
$_ = "";
+
return;
}
# essentially be this code.) This uses the algorithm published by
# Unicode.
if (property_ref('Decomposition_Mapping')->to_output_map) {
- local $to_trace = 1 if main::DEBUG;
for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
use integer;
my $SIndex = $S - $SBase;
#
# This just adds the sequence to an array for later handling
- return; # XXX Until charnames catches up
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
"Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
next;
}
- push @named_sequences, "$sequence\t\t$name";
+
+ # Note single \t in keeping with special output format of
+ # Perl_charnames. But it turns out that the code points don't have to
+ # be 5 digits long, like the rest, based on the internal workings of
+ # charnames.pm. This could be easily changed for consistency.
+ push @named_sequences, "$sequence\t$name";
}
return;
}
# The simple version's name in each mapping merely has an 's' in front
# of the full one's
my $simple = property_ref('s' . $case);
- $simple->initialize($case) if $simple->to_output_map();
+ $simple->initialize($full) if $simple->to_output_map();
}
return;
Initialize => $Graph & $ASCII,
);
- my $Print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
- Initialize => $Print & $ASCII,
+ Initialize => $print & $ASCII,
);
my $Punct = $perl->add_match_table('Punct');
$lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
}
- my $perl_charname = property_ref('Perl_Charnames');
# Was previously constructed to contain both Name and Unicode_1_Name
my @composition = ('Name', 'Unicode_1_Name');
$comment .= ", and $composition[-1]";
}
- # Wait for charnames to catch up
-# foreach my $entry (@more_Names,
-# split "\n", <<"END"
-#000A; LF
-#000C; FF
-#000D; CR
-#0085; NEL
-#200C; ZWNJ
-#200D; ZWJ
-#FEFF; BOM
-#FEFF; BYTE ORDER MARK
-#END
-# ) {
-# #local $to_trace = 1 if main::DEBUG;
-# trace $entry if main::DEBUG && $to_trace;
-# my ($code_point, $name) = split /\s*;\s*/, $entry;
-# $code_point = hex $code_point;
-# trace $code_point, $name if main::DEBUG && $to_trace;
-# $perl_charname->add_duplicate($code_point, $name);
-# }
-# #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
Unicode_1_Name entries are used only for otherwise nameless code
The Name property is accessible through the \\N{} interpolation in
double-quoted strings and regular expressions, but both usages require a C<use
-charnames;> to be specified, which also contains related functions viacode()
-and vianame().
+charnames;> to be specified, which also contains related functions viacode(),
+vianame(), and string_vianame().
=head1 Unicode regular expression properties that are NOT accepted by Perl
For example, the uc() function implements the Uppercase_Mapping property and
uses the F<Upper.pl> file found in this directory.
-The available files with their properties (short names in parentheses),
-and any flags or comments about them, are:
+The available files in the current installation, with their properties (short
+names in parentheses), and any flags or comments about them, are:
@map_tables_actually_output
}, File::Spec->curdir());
my @mktables_list_output_files;
+my $old_start_time = 0;
if (! -e $file_list) {
print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
for my $list ( \@input, \@mktables_list_output_files ) {
while (<$file_handle>) {
s/^ \s+ | \s+ $//xg;
+ if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
+ $old_start_time = $1;
+ }
next if /^ \s* (?: \# .* )? $/x;
last if /^ =+ $/x;
my ( $file ) = split /\t/;
# We set $youngest to be the most recently changed input file, including this
# program itself (done much earlier in this file)
foreach my $in (@input_files) {
- my $age = -M $in;
- next unless defined $age; # Keep going even if missing a file
- $youngest = $age if $age < $youngest;
+ next unless -e $in; # Keep going even if missing a file
+ my $mod_time = (stat $in)[9];
+ $youngest = $mod_time if $mod_time > $youngest;
# See that the input files have distinct names, to warn someone if they
# are adding a new one
}
}
-my $ok = ! $write_unchanged_files
- && scalar @mktables_list_output_files; # If none known, rebuild
+my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
+ || ! scalar @mktables_list_output_files # or if no outputs known
+ || $old_start_time < $youngest; # or out-of-date
# Now we check to see if any output files are older than youngest, if
# they are, we need to continue on, otherwise we can presumably bail.
-if ($ok) {
+if (! $rebuild) {
foreach my $out (@mktables_list_output_files) {
if ( ! file_exists($out)) {
print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
#local $to_trace = 1 if main::DEBUG;
- trace $youngest, -M $out if main::DEBUG && $to_trace;
- if ( -M $out > $youngest ) {
- #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
+ trace $youngest, (stat $out)[9] if main::DEBUG && $to_trace;
+ if ( (stat $out)[9] <= $youngest ) {
+ #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $youngest\n" if main::DEBUG && $to_trace;
print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
}
}
-if ($ok) {
- print "Files seem to be ok, not bothering to rebuild.\n";
+if (! $rebuild) {
+ print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
exit(0);
}
print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
return
}
else {
+ my $localtime = localtime $start_time;
print $ofh <<"END";
#
# $file_list -- File list for $0.
#
-# Autogenerated on @{[scalar localtime]}
+# Autogenerated starting on $start_time ($localtime)
#
# - First section is input files
# ($0 itself is not listed but is automatically considered an input)