X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/28093d0e3328797fc0783f9d909d7931ba57bd59..aeab61505e139a7a3116de31dc4b2930c111e315:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 7dfff8c..7f214db 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -9,11 +9,19 @@ # 5.8: needs pack "U". But almost all occurrences of objaddr have been # removed in favor of using 'no overloading'. You also would have to go # through and replace occurrences like: -# my $addr; { no overloading; $addr = 0+$self; } +# my $addr = do { no overloading; pack 'J', $self; } # with # my $addr = main::objaddr $self; # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b -# that instituted this change.) +# that instituted the change to main::objaddr, and subsequent commits that +# changed 0+$self to pack 'J', $self.) + +my $start_time; +BEGIN { # Get the time the script started running; do it at compiliation to + # get it as close as possible + $start_time= time; +} + require 5.010_001; use strict; @@ -310,7 +318,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # is nonsensical. # # There are no match tables generated for matches of the null string. These -# would like like qr/\p{JSN=}/ currently without modifying the regex code. +# would look like qr/\p{JSN=}/ currently without modifying the regex code. # Perhaps something like them could be added if necessary. The JSN does have # a real code point U+110B that maps to the null string, but it is a # contributory property, and therefore not output by default. And it's easily @@ -412,7 +420,6 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # kPrimaryNumeric property have commas and an unexpected comment. A filter # could be added for these; or for a particular installation, the Unihan.txt # file could be edited to fix them. -# have to be # # HOW TO ADD A FILE TO BE PROCESSED # @@ -585,6 +592,12 @@ sub uniques { # Encapsulated Cleverness". p. 455 in first edition. my %seen; + # Arguably this breaks encapsulation, if the goal is to permit multiple + # distinct objects to stringify to the same value, and be interchangeable. + # However, for this program, no two objects stringify identically, and all + # lists passed to this function are either objects or strings. So this + # doesn't affect correctness, but it does give a couple of percent speedup. + no overloading; return grep { ! $seen{$_}++ } @_; } @@ -610,6 +623,10 @@ my $glob_list = 0; # ? Should we try to include unknown .txt files # in the input. my $output_range_counts = 1; # ? Should we include the number of code points # in ranges in the output +my $output_names = 0; # ? Should character names be in the output +my @viacode; # Contains the 1 million character names, if + # $output_names is true + # Verbosity levels; 0 is quiet my $NORMAL_VERBOSITY = 1; my $PROGRESS = 2; @@ -665,6 +682,9 @@ while (@ARGV) { elsif ($arg eq '-c') { $output_range_counts = ! $output_range_counts } + elsif ($arg eq '-output_names') { + $output_names = 1; + } else { my $with_c = 'with'; $with_c .= 'out' if $output_range_counts; # Complements the state @@ -689,6 +709,9 @@ usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] -maketest : Make test script 'TestProp.pl' in current (or -C directory), overrides -T -makelist : Rewrite the file list $file_list based on current setup + -output_names : Output each character's name in the table files; useful for + doing what-ifs, looking at diffs; is slow, memory intensive, + resulting tables are usable but very large. -check A B : Executes $0 only if A and B are the same END } @@ -696,7 +719,7 @@ END # Stores the most-recently changed file. If none have changed, can skip the # build -my $youngest = -M $0; # Do this before the chdir! +my $most_recent = (stat $0)[9]; # Do this before the chdir! # Change directories now, because need to read 'version' early. if ($use_directory) { @@ -775,7 +798,7 @@ if ($v_version gt v3.2.0) { # unless explicitly added. if ($v_version ge v5.2.0) { my $unihan = 'Unihan; remove from list if using Unihan'; - foreach my $table qw ( + foreach my $table (qw ( kAccountingNumeric kOtherNumeric kPrimaryNumeric @@ -791,7 +814,7 @@ if ($v_version ge v5.2.0) { kIRG_USource kIRG_VSource kRSUnicode - ) + )) { $why_suppress_if_empty_warn_if_not{$table} = $unihan; } @@ -1135,7 +1158,7 @@ my %map_table_formats = ( $INTEGER_FORMAT => 'integer', $HEX_FORMAT => 'positive hex whole number; a code point', $RATIONAL_FORMAT => 'rational: an integer or a fraction', - $STRING_FORMAT => 'arbitrary string', + $STRING_FORMAT => 'string', ); # Unicode didn't put such derived files in a separate directory at first. @@ -1153,11 +1176,16 @@ my %loose_property_name_of; # Loosely maps property names to standard form # These constants names and values were taken from the Unicode standard, # version 5.1, section 3.12. They are used in conjunction with Hangul -# syllables -my $SBase = 0xAC00; -my $LBase = 0x1100; -my $VBase = 0x1161; -my $TBase = 0x11A7; +# syllables. The '_string' versions are so generated tables can retain the +# hex format, which is the more familiar value +my $SBase_string = "0xAC00"; +my $SBase = CORE::hex $SBase_string; +my $LBase_string = "0x1100"; +my $LBase = CORE::hex $LBase_string; +my $VBase_string = "0x1161"; +my $VBase = CORE::hex $VBase_string; +my $TBase_string = "0x11A7"; +my $TBase = CORE::hex $TBase_string; my $SCount = 11172; my $LCount = 19; my $VCount = 21; @@ -1193,6 +1221,8 @@ my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms my $gc; my $perl; my $block; +my $perl_charname; +my $print; # Are there conflicting names because of beginning with 'In_', or 'Is_' my $has_In_conflicts = 0; @@ -1224,7 +1254,7 @@ sub objaddr($) { no overloading; # If overloaded, numifying below won't work. # Numifying a ref gives its address. - return 0 + $_[0]; + return pack 'J', $_[0]; } # Commented code below should work on Perl 5.8. @@ -1249,7 +1279,7 @@ sub objaddr($) { # bless $_[0], 'main::Fake'; # # # Numifying a ref gives its address. -# my $addr = 0 + $_[0]; +# my $addr = pack 'J', $_[0]; # # # Return to original class # bless $_[0], $pkg; @@ -1439,7 +1469,7 @@ package main; # Use typeglob to give the anonymous subroutine the name we want *$destroy_name = sub { my $self = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $self->$destroy_callback if $destroy_callback; foreach my $field (keys %{$package_fields{$package}}) { @@ -1485,7 +1515,7 @@ package main; # "protection" is only by convention. All that happens is that the # accessor functions' names begin with an underscore. So instead of # calling set_foo, the call is _set_foo. (Real protection could be - # accomplished by having a new subroutine, end_package called at the + # accomplished by having a new subroutine, end_package, called at the # end of each package, and then storing the __LINE__ ranges and # checking them on every accessor. But that is way overkill.) @@ -1538,7 +1568,7 @@ package main; return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; my $self = shift; my $value = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; Carp::carp_extra_args(\@_) if main::DEBUG && @_; if (ref $value) { return if grep { $value == $_ } @{$field->{$addr}}; @@ -1572,7 +1602,7 @@ package main; *$subname = sub { use strict "refs"; Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; - my $addr; { no overloading; $addr = 0+$_[0]; } + my $addr = do { no overloading; pack 'J', $_[0]; }; if (ref $field->{$addr} ne 'ARRAY') { my $type = ref $field->{$addr}; $type = 'scalar' unless $type; @@ -1595,7 +1625,7 @@ package main; use strict "refs"; Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; no overloading; - return $field->{0+$_[0]}; + return $field->{pack 'J', $_[0]}; } } } @@ -1610,7 +1640,7 @@ package main; } # $self is $_[0]; $value is $_[1] no overloading; - $field->{0+$_[0]} = $_[1]; + $field->{pack 'J', $_[0]} = $_[1]; return; } } @@ -1770,7 +1800,7 @@ sub trace { return main::trace(@_); } my $class = shift; my $self = bless \do{ my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Set defaults $handler{$addr} = \&main::process_generic_property_file; @@ -1861,7 +1891,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $file = $file{$addr}; @@ -2031,7 +2061,7 @@ END my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Here the file is open (or if the handle is not a ref, is an open # 'virtual' file). Get the next line; any inserted lines get priority @@ -2176,7 +2206,7 @@ END # # an each_line_handler() on the line. # # my $self = shift; -# my $addr; { no overloading; $addr = 0+$self; } +# my $addr = do { no overloading; pack 'J', $self; }; # # foreach my $inserted_ref (@{$added_lines{$addr}}) { # my ($adjusted, $line) = @{$inserted_ref}; @@ -2218,7 +2248,7 @@ END # indicate that this line hasn't been adjusted, and needs to be # processed. no overloading; - push @{$added_lines{0+$self}}, map { [ 0, $_ ] } @_; + push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; return; } @@ -2242,7 +2272,7 @@ END # Each inserted line is an array, with the first element being 1 to # indicate that this line has been adjusted no overloading; - push @{$added_lines{0+$self}}, map { [ 1, $_ ] } @_; + push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; return; } @@ -2255,7 +2285,7 @@ END my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # If not accepting a list return, just return the first one. return shift @{$missings{$addr}} unless wantarray; @@ -2269,7 +2299,7 @@ END # Add a property field to $_, if this file requires it. my $self = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $property = $property{$addr}; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -2288,7 +2318,7 @@ END my $message = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $message = 'Unexpected line' unless $message; @@ -2353,7 +2383,7 @@ package Multi_Default; my $class = shift; my $self = bless \do{my $anonymous_scalar}, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; while (@_ > 1) { my $default = shift; @@ -2371,7 +2401,7 @@ package Multi_Default; my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return each %{$class_defaults{$addr}}; } @@ -2418,7 +2448,7 @@ package Alias; my $class = shift; my $self = bless \do { my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $name{$addr} = shift; $loose_match{$addr} = shift; @@ -2480,7 +2510,7 @@ sub trace { return main::trace(@_); } my $class = shift; my $self = bless \do { my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $start{$addr} = shift; $end{$addr} = shift; @@ -2510,7 +2540,7 @@ sub trace { return main::trace(@_); } sub _operator_stringify { my $self = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Output it like '0041..0065 (value)' my $return = sprintf("%04X", $start{$addr}) @@ -2533,7 +2563,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return $standard_form{$addr} if defined $standard_form{$addr}; return $value{$addr}; @@ -2546,7 +2576,7 @@ sub trace { return main::trace(@_); } my $indent = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $return = $indent . sprintf("%04X", $start{$addr}) @@ -2628,7 +2658,7 @@ sub trace { return main::trace(@_); } return _union($class, $initialize, %args) if defined $initialize; $self = bless \do { my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Optional parent object, only for debug info. $owner_name_of{$addr} = delete $args{'Owner'}; @@ -2660,7 +2690,7 @@ sub trace { return main::trace(@_); } sub _operator_stringify { my $self = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return "Range_List attached to '$owner_name_of{$addr}'" if $owner_name_of{$addr}; @@ -2719,7 +2749,7 @@ sub trace { return main::trace(@_); } my $message = ""; if (defined $self) { no overloading; - $message .= $owner_name_of{0+$self}; + $message .= $owner_name_of{pack 'J', $self}; } Carp::my_carp_bug($message .= "Undefined argument to _union. No union done."); return; @@ -2741,7 +2771,7 @@ sub trace { return main::trace(@_); } my $message = ""; if (defined $self) { no overloading; - $message .= $owner_name_of{0+$self}; + $message .= $owner_name_of{pack 'J', $self}; } Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); return; @@ -2782,7 +2812,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\@_) if main::DEBUG && @_; no overloading; - return scalar @{$ranges{0+$self}}; + return scalar @{$ranges{pack 'J', $self}}; } sub min { @@ -2795,7 +2825,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # If the range list is empty, return a large value that isn't adjacent # to any that could be in the range list, for simpler tests @@ -2820,12 +2850,12 @@ sub trace { return main::trace(@_); } # So is in the table if and only iff it is at least the start position # of range $i. no overloading; - return 0 if $ranges{0+$self}->[$i]->start > $codepoint; + return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; return $i + 1; } - sub value_of { - # Returns the value associated with the code point, undef if none + sub containing_range { + # Returns the range object that contains the code point, undef if none my $self = shift; my $codepoint = shift; @@ -2836,7 +2866,34 @@ sub trace { return main::trace(@_); } # contains() returns 1 beyond where we should look no overloading; - return $ranges{0+$self}->[$i-1]->value; + return $ranges{pack 'J', $self}->[$i-1]; + } + + sub value_of { + # Returns the value associated with the code point, undef if none + + my $self = shift; + my $codepoint = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $range = $self->containing_range($codepoint); + return unless defined $range; + + return $range->value; + } + + sub type_of { + # Returns the type of the range containing the code point, undef if + # the code point is not in the table + + my $self = shift; + my $codepoint = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $range = $self->containing_range($codepoint); + return unless defined $range; + + return $range->type; } sub _search_ranges { @@ -2850,7 +2907,7 @@ sub trace { return main::trace(@_); } my $code_point = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return if $code_point > $max{$addr}; my $r = $ranges{$addr}; # The current list of ranges @@ -2955,10 +3012,10 @@ sub trace { return main::trace(@_); } # # The range list is kept sorted so that the range with the lowest # starting position is first in the list, and generally, adjacent - # ranges with the same values are merged into single larger one (see + # ranges with the same values are merged into a single larger one (see # exceptions below). # - # There are more parameters, all are key => value pairs: + # There are more parameters; all are key => value pairs: # Type gives the type of the value. It is only valid for '+'. # All ranges have types; if this parameter is omitted, 0 is # assumed. Ranges with type 0 are assumed to obey the @@ -2982,7 +3039,7 @@ sub trace { return main::trace(@_); } # => $IF_NOT_EQUIVALENT means to replace the existing values # with this one if they are not equivalent. # Ranges are equivalent if their types are the - # same, and they are the same string, or if + # same, and they are the same string; or if # both are type 0 ranges, if their Unicode # standard forms are identical. In this last # case, the routine chooses the more "modern" @@ -3001,8 +3058,8 @@ sub trace { return main::trace(@_); } # multiple times. # => anything else is the same as => $IF_NOT_EQUIVALENT # - # "same value" means identical for type-0 ranges, and it means having - # the same standard forms for non-type-0 ranges. + # "same value" means identical for non-type-0 ranges, and it means + # having the same standard forms for type-0 ranges. return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; @@ -3024,7 +3081,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\%args) if main::DEBUG && %args; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; if ($operation ne '+' && $operation ne '-') { Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); @@ -3609,7 +3666,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\@_) if main::DEBUG && @_; no overloading; - undef $each_range_iterator{0+$self}; + undef $each_range_iterator{pack 'J', $self}; return; } @@ -3620,7 +3677,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return if $self->is_empty; @@ -3637,7 +3694,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $count = 0; foreach my $range (@{$ranges{$addr}}) { @@ -3661,7 +3718,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\@_) if main::DEBUG && @_; no overloading; - return scalar @{$ranges{0+$self}} == 0; + return scalar @{$ranges{pack 'J', $self}} == 0; } sub hash { @@ -3672,7 +3729,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # These are quickly computable. Return looks like 'min..max;count' return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; @@ -3944,8 +4001,6 @@ sub trace { return main::trace(@_); } return $self->_add_delete('+', $start, $end, ""); } - my $non_ASCII = (ord('A') != 65); # Assumes test on same platform - sub is_code_point_usable { # This used only for making the test script. See if the input # proposed trial code point is one that Perl will handle. If second @@ -3958,15 +4013,6 @@ sub trace { return main::trace(@_); } return 0 if $code < 0; # Never use a negative - # For non-ASCII, we shun the characters that don't have Perl encoding- - # independent symbols for them. 'A' is such a symbol, so is "\n". - return $try_hard if $non_ASCII - && $code <= 0xFF - && ($code >= 0x7F - || ($code >= 0x0E && $code <= 0x1F) - || ($code >= 0x01 && $code <= 0x06) - || $code == 0x0B); - # shun null. I'm (khw) not sure why this was done, but NULL would be # the character very frequently used. return $try_hard if $code == 0x0000; @@ -3991,7 +4037,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # On first pass, don't choose less desirable code points; if no good # one is found, repeat, allowing a less desirable one to be selected. @@ -4183,7 +4229,7 @@ sub trace { return main::trace(@_); } my $class = shift; my $self = bless \do { my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my %args = @_; @@ -4199,6 +4245,7 @@ sub trace { return main::trace(@_); } $status{$addr} = delete $args{'Status'} || $NORMAL; $status_info{$addr} = delete $args{'_Status_Info'} || ""; $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; + $range_size_1{$addr} = 1 if $output_names; # Make sure 1 name per line my $description = delete $args{'Description'}; my $externally_ok = delete $args{'Externally_Ok'}; @@ -4307,10 +4354,10 @@ sub trace { return main::trace(@_); } # Here are the methods that are required to be defined by any derived # class - for my $sub qw( + for my $sub (qw( append_to_body pre_body - ) + )) # append_to_body and pre_body are called in the write() method # to add stuff after the main body of the table, but before # its close; and to prepend stuff before the beginning of the @@ -4336,7 +4383,7 @@ sub trace { return main::trace(@_); } # Returns the array of ranges associated with this table. no overloading; - return $range_list{0+shift}->ranges; + return $range_list{pack 'J', shift}->ranges; } sub add_alias { @@ -4372,7 +4419,7 @@ sub trace { return main::trace(@_); } # release $name = ucfirst($name) unless $name =~ /^k[A-Z]/; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Figure out if should be loosely matched if not already specified. if (! defined $loose_match) { @@ -4434,7 +4481,7 @@ sub trace { return main::trace(@_); } # This name may be shorter than any existing ones, so clear the cache # of the shortest, so will have to be recalculated. no overloading; - undef $short_name{0+$self}; + undef $short_name{pack 'J', $self}; return; } @@ -4457,7 +4504,7 @@ sub trace { return main::trace(@_); } my $nominal_length_ptr = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # For efficiency, don't recalculate, but this means that adding new # aliases could change what the shortest is, so the code that does @@ -4533,7 +4580,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\@_) if main::DEBUG && @_; no overloading; - push @{$description{0+$self}}, $description; + push @{$description{pack 'J', $self}}, $description; return; } @@ -4546,7 +4593,7 @@ sub trace { return main::trace(@_); } Carp::carp_extra_args(\@_) if main::DEBUG && @_; no overloading; - push @{$note{0+$self}}, $note; + push @{$note{pack 'J', $self}}, $note; return; } @@ -4560,7 +4607,7 @@ sub trace { return main::trace(@_); } chomp $comment; no overloading; - push @{$comment{0+$self}}, $comment; + push @{$comment{pack 'J', $self}}, $comment; return; } @@ -4573,7 +4620,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my @list = @{$comment{$addr}}; return @list if wantarray; my $return = ""; @@ -4591,7 +4638,7 @@ sub trace { return main::trace(@_); } # initialization for range lists. my $self = shift; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $initialization = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -4615,7 +4662,7 @@ sub trace { return main::trace(@_); } $return .= $DEVELOPMENT_ONLY if $compare_versions; $return .= $HEADER; no overloading; - $return .= $INTERNAL_ONLY if $internal_only{0+$self}; + $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self}; return $return; } @@ -4630,7 +4677,7 @@ sub trace { return main::trace(@_); } # the range Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Start with the header my @OUT = $self->header; @@ -4668,8 +4715,24 @@ sub trace { return main::trace(@_); } # If has or wants a single point range output if ($start == $end || $range_size_1) { - for my $i ($start .. $end) { - push @OUT, sprintf "%04X\t\t%s\n", $i, $value; + if (ref $range_size_1 eq 'CODE') { + for my $i ($start .. $end) { + push @OUT, &$range_size_1($i, $value); + } + } + else { + for my $i ($start .. $end) { + push @OUT, sprintf "%04X\t\t%s\n", $i, $value; + if ($output_names) { + if (! defined $viacode[$i]) { + $viacode[$i] = + Property::property_ref('Perl_Charnames') + ->value_of($i) + || ""; + } + $OUT[-1] =~ s/\n/\t# $viacode[$i]\n/; + } + } } } else { @@ -4718,7 +4781,7 @@ sub trace { return main::trace(@_); } my $info = shift; # Any message associated with it. Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $status{$addr} = $status; $status_info{$addr} = $info; @@ -4733,7 +4796,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $locked{$addr} = ""; @@ -4761,7 +4824,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return 0 if ! $locked{$addr}; Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); @@ -4773,13 +4836,14 @@ sub trace { return main::trace(@_); } # Rest of parameters passed on no overloading; - @{$file_path{0+$self}} = @_; + @{$file_path{pack 'J', $self}} = @_; return } # Accessors for the range list stored in this table. First for # unconditional - for my $sub qw( + for my $sub (qw( + containing_range contains count each_range @@ -4789,22 +4853,23 @@ sub trace { return main::trace(@_); } min range_count reset_each_range + type_of value_of - ) + )) { no strict "refs"; *$sub = sub { use strict "refs"; my $self = shift; no overloading; - return $range_list{0+$self}->$sub(@_); + return $range_list{pack 'J', $self}->$sub(@_); } } # Then for ones that should fail if locked - for my $sub qw( + for my $sub (qw( delete_range - ) + )) { no strict "refs"; *$sub = sub { @@ -4813,7 +4878,7 @@ sub trace { return main::trace(@_); } return if $self->carp_if_locked; no overloading; - return $range_list{0+$self}->$sub(@_); + return $range_list{pack 'J', $self}->$sub(@_); } } @@ -4919,7 +4984,7 @@ sub trace { return main::trace(@_); } _Range_List => $range_list, %args); - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $anomalous_entries{$addr} = []; $core_access{$addr} = $core_access; @@ -4971,7 +5036,7 @@ sub trace { return main::trace(@_); } # Can't change the table if locked. return if $self->carp_if_locked; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $has_specials{$addr} = 1 if $type; @@ -4989,7 +5054,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return "" unless @{$anomalous_entries{$addr}}; return join("\n", @{$anomalous_entries{$addr}}) . "\n"; @@ -5016,8 +5081,8 @@ sub trace { return main::trace(@_); } return; } - my $addr; { no overloading; $addr = 0+$self; } - my $other_addr; { no overloading; $other_addr = 0+$other; } + my $addr = do { no overloading; pack 'J', $self; }; + my $other_addr = do { no overloading; pack 'J', $other; }; local $to_trace = 0 if main::DEBUG; @@ -5050,7 +5115,7 @@ sub trace { return main::trace(@_); } my $map = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Convert the input to the standard equivalent, if any (won't have any # for $STRING properties) @@ -5095,7 +5160,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # If overridden, use that return $to_output_map{$addr} if defined $to_output_map{$addr}; @@ -5140,7 +5205,7 @@ sub trace { return main::trace(@_); } # No sense generating a comment if aren't going to write it out. return if ! $self->to_output_map; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $property = $self->property; @@ -5312,7 +5377,7 @@ END my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $name = $self->property->swash_name; @@ -5444,7 +5509,9 @@ END # multiple code points. These do not appear in the main body, but are defined # in the hash below. -# The key: UTF-8 _bytes_, the value: UTF-8 (speed hack) +# Each key is the string of N bytes that together make up the UTF-8 encoding +# for the code point. (i.e. the same as looking at the code point's UTF-8 +# under "use bytes"). Each value is the UTF-8 of the translation, for speed. %utf8::ToSpec$name = ( END $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n"; @@ -5561,14 +5628,14 @@ $jamo_t # These constants names and values were taken from the Unicode standard, # version 5.1, section 3.12. They are used in conjunction with Hangul # syllables - my \$SBase = 0xAC00; - my \$LBase = 0x1100; - my \$VBase = 0x1161; - my \$TBase = 0x11A7; - my \$SCount = 11172; - my \$LCount = 19; - my \$VCount = 21; - my \$TCount = 28; + my \$SBase = $SBase_string; + my \$LBase = $LBase_string; + my \$VBase = $VBase_string; + my \$TBase = $TBase_string; + my \$SCount = $SCount; + my \$LCount = $LCount; + my \$VCount = $VCount; + my \$TCount = $TCount; my \$NCount = \$VCount * \$TCount; END } # End of has Jamos @@ -5639,7 +5706,7 @@ END my $L = $LBase + $SIndex / $NCount; my $V = $VBase + ($SIndex % $NCount) / $TCount; my $T = $TBase + $SIndex % $TCount; - $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}"; + $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; $name .= $Jamo{$T} if $T != $TBase; return $name; } @@ -5755,7 +5822,7 @@ END my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return $self->SUPER::write( ($self->property == $block) @@ -5765,9 +5832,9 @@ END } # Accessors for the underlying list that should fail if locked. - for my $sub qw( + for my $sub (qw( add_duplicate - ) + )) { no strict "refs"; *$sub = sub { @@ -5902,7 +5969,7 @@ sub trace { return main::trace(@_); } _Property => $property, _Range_List => $range_list, ); - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $conflicting{$addr} = [ ]; $equivalents{$addr} = [ ]; @@ -5943,7 +6010,7 @@ sub trace { return main::trace(@_); } return if $self->carp_if_locked; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; if (ref $other) { @@ -6010,7 +6077,7 @@ sub trace { return main::trace(@_); } # be an optional parameter. Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Check if the conflicting name is exactly the same as any existing # alias in this table (as long as there is a real object there to @@ -6058,7 +6125,7 @@ sub trace { return main::trace(@_); } # Two tables are equivalent if they have the same leader. no overloading; - return $leader{0+$self} == $leader{0+$other}; + return $leader{pack 'J', $self} == $leader{pack 'J', $other}; return; } @@ -6132,7 +6199,7 @@ sub trace { return main::trace(@_); } my $are_equivalent = $self->is_equivalent_to($other); return if ! defined $are_equivalent || $are_equivalent; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; if ($related && @@ -6143,8 +6210,8 @@ sub trace { return main::trace(@_); } $related = 0; } - my $leader; { no overloading; $leader = 0+$current_leader; } - my $other_addr; { no overloading; $other_addr = 0+$other; } + my $leader = do { no overloading; pack 'J', $current_leader; }; + my $other_addr = do { no overloading; pack 'J', $other; }; # Any tables that are equivalent to or children of this table must now # instead be equivalent to or (children) to the new leader (parent), @@ -6159,7 +6226,7 @@ sub trace { return main::trace(@_); } next if $table == $other; trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; - my $table_addr; { no overloading; $table_addr = 0+$table; } + my $table_addr = do { no overloading; pack 'J', $table; }; $leader{$table_addr} = $other; $matches_all{$table_addr} = $matches_all; $self->_set_range_list($other->_range_list); @@ -6213,7 +6280,7 @@ sub trace { return main::trace(@_); } # an equivalent group Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$leader; } + my $addr = do { no overloading; pack 'J', $leader; }; if ($leader{$addr} != $leader) { Carp::my_carp_bug(<table('N') && defined (my $yes = $property->table('Y'))) { - my $yes_addr; { no overloading; $yes_addr = 0+$yes; } + my $yes_addr = do { no overloading; pack 'J', $yes; }; @yes_perl_synonyms = grep { $_->property == $perl } main::uniques($yes, @@ -6284,12 +6351,12 @@ END my @conflicting; # Will hold the table conflicts. # Look at the parent, any yes synonyms, and all the children - my $parent_addr; { no overloading; $parent_addr = 0+$parent; } + my $parent_addr = do { no overloading; pack 'J', $parent; }; for my $table ($parent, @yes_perl_synonyms, @{$children{$parent_addr}}) { - my $table_addr; { no overloading; $table_addr = 0+$table; } + my $table_addr = do { no overloading; pack 'J', $table; }; my $table_property = $table->property; # Tables are separated by a blank line to create a grouping. @@ -6535,10 +6602,10 @@ END } # Accessors for the underlying list - for my $sub qw( + for my $sub (qw( get_valid_code_point get_invalid_code_point - ) + )) { no strict "refs"; *$sub = sub { @@ -6706,7 +6773,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my %args = @_; $self = bless \do { my $anonymous_scalar }, $class; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; $directory{$addr} = delete $args{'Directory'}; $file{$addr} = delete $args{'File'}; @@ -6767,7 +6834,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } } else { no overloading; - $map{0+$self}->delete_range($other, $other); + $map{pack 'J', $self}->delete_range($other, $other); } return $self; } @@ -6780,7 +6847,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $name = shift; my %args = @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $table = $table_ref{$addr}{$name}; my $standard_name = main::standardize($name); @@ -6848,7 +6915,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $name = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; @@ -6867,7 +6934,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # property no overloading; - return main::uniques(values %{$table_ref{0+shift}}); + return main::uniques(values %{$table_ref{pack 'J', shift}}); } sub directory { @@ -6876,7 +6943,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # priority; 'undef' is returned if the type isn't defined; # or $map_directory for everything else. - my $addr; { no overloading; $addr = 0+shift; } + my $addr = do { no overloading; pack 'J', shift; }; return $directory{$addr} if defined $directory{$addr}; return undef if $type{$addr} == $UNKNOWN; @@ -6897,7 +6964,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; return $file{$addr} if defined $file{$addr}; return $map{$addr}->external_name; @@ -6913,7 +6980,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # The whole point of this pseudo property is match tables. return 1 if $self == $perl; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # Don't generate tables of code points that match the property values # of a string property. Such a list would most likely have many @@ -6948,7 +7015,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } } no overloading; - return $map{0+$self}->map_add_or_replace_non_nulls($map{0+$other}); + return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); } sub set_type { @@ -6967,7 +7034,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } return; } - { no overloading; $type{0+$self} = $type; } + { no overloading; $type{pack 'J', $self} = $type; } return if $type != $BINARY; my $yes = $self->table('Y'); @@ -6997,7 +7064,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $map = shift; # What the range maps to. # Rest of parameters passed on. - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; # If haven't the type of the property, gather information to figure it # out. @@ -7049,7 +7116,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$self; } + my $addr = do { no overloading; pack 'J', $self; }; my $type = $type{$addr}; @@ -7103,7 +7170,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # Most of the accessors for a property actually apply to its map table. # Setup up accessor functions for those, referring to %map - for my $sub qw( + for my $sub (qw( add_alias add_anomalous_entry add_comment @@ -7114,6 +7181,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } aliases comment complete_name + containing_range core_access count default_map @@ -7146,9 +7214,10 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } status status_info to_output_map + type_of value_of write - ) + )) # 'property' above is for symmetry, so that one can take # the property of a property and get itself, and so don't # have to distinguish between properties and tables in @@ -7159,7 +7228,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } use strict "refs"; my $self = shift; no overloading; - return $map{0+$self}->$sub(@_); + return $map{pack 'J', $self}->$sub(@_); } } @@ -7447,12 +7516,7 @@ sub write ($\@) { push @files_actually_output, $file; - my $text; - if (@$lines_ref) { - $text = join "", @$lines_ref; - } - else { - $text = ""; + unless (@$lines_ref) { Carp::my_carp("Output file '$file' is empty; writing it anyway;"); } @@ -7463,10 +7527,12 @@ sub write ($\@) { Carp::my_carp("can't open $file for output. Skipping this file: $!"); return; } + + print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); + close $OUT or die Carp::my_carp("close '$file' failed: $!"); + print "$file written.\n" if $verbosity >= $VERBOSE; - print $OUT $text; - close $OUT; return; } @@ -7567,7 +7633,7 @@ sub standardize ($) { else { # Keep track of cycles in the input, and refuse to infinitely loop - my $addr; { no overloading; $addr = 0+$item; } + my $addr = do { no overloading; pack 'J', $item; }; if (defined $already_output{$addr}) { return "${indent}ALREADY OUTPUT: $item\n"; } @@ -7688,7 +7754,7 @@ sub dump_inside_out { my $fields_ref = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $addr; { no overloading; $addr = 0+$object; } + my $addr = do { no overloading; pack 'J', $object; }; my %hash; foreach my $key (keys %$fields_ref) { @@ -7716,7 +7782,7 @@ sub _operator_dot { } else { my $ref = ref $$which; - my $addr; { no overloading; $addr = 0+$$which; } + my $addr = do { no overloading; pack 'J', $$which; }; $$which = "$ref ($addr)"; } } @@ -7735,7 +7801,7 @@ sub _operator_equal { return 0 unless defined $other; return 0 unless ref $other; no overloading; - return 0+$self == 0+$other; + return $self == $other; } sub _operator_not_equal { @@ -7886,7 +7952,7 @@ sub finish_property_setup { ; # The defaults apply only to unassigned characters - $default_R .= '$gc->table("Cn") & $default;'; + $default_R .= '$gc->table("Unassigned") & $default;'; if ($v_version lt v3.0.0) { $default = Multi_Default->new(R => $default_R, 'L'); @@ -7906,7 +7972,7 @@ sub finish_property_setup { if ($v_version ge 3.1.0) { $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; } - $default_AL .= '$gc->table("Cn") & $default'; + $default_AL .= '$gc->table("Unassigned") & $default'; $default = Multi_Default->new(AL => $default_AL, R => $default_R, 'L'); @@ -8521,6 +8587,17 @@ END return @return; } +sub output_perl_charnames_line ($$) { + + # Output the entries in Perl_charnames specially, using 5 digits instead + # of four. This makes the entries a constant length, and simplifies + # charnames.pm which this table is for. Unicode can have 6 digit + # ordinals, but they are all private use or noncharacters which do not + # have names, so won't be in this table. + + return sprintf "%05X\t%s\n", $_[0], $_[1]; +} + { # Closure # This is used to store the range list of all the code points usable when # the little used $compare_versions feature is enabled. @@ -8696,7 +8773,7 @@ END $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); next LINE; } - { no overloading; $property_addr = 0+($property_object); } + { no overloading; $property_addr = pack 'J', $property_object; } # Defer changing names until have a line that is acceptable # (the 'next' statement above means is unacceptable) @@ -8748,7 +8825,7 @@ END if $file->has_missings_defaults; foreach my $default_ref (@missings_list) { my $default = $default_ref->[0]; - my $addr; { no overloading; $addr = 0+property_ref($default_ref->[1]); } + my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; # For string properties, the default is just what the # file says, but non-string properties should already @@ -8963,23 +9040,6 @@ END } } -# XXX Unused until revise charnames; -#sub check_and_handle_compound_name { -# This looks at Name properties for parenthesized components and splits -# them off. Thus it finds FF as an equivalent to Form Feed. -# my $code_point = shift; -# my $name = shift; -# if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) { -# #local $to_trace = 1 if main::DEBUG; -# trace $1, $2, $3, $4 if main::DEBUG && $to_trace; -# push @more_Names, "$code_point; $1"; -# push @more_Names, "$code_point; $3"; -# Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " "; -# Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne ""; -# } -# return; -#} - { # Closure for UnicodeData.txt handling # This file was the first one in the UCD; its design leads to some @@ -9115,14 +9175,14 @@ END # Name_Alias properties. (The final duplicates elements of the # first.) A comment for it will later be constructed based on the # actual properties present and used - Property->new('Perl_Charnames', + $perl_charname = Property->new('Perl_Charnames', Core_Access => '\N{...} and "use charnames"', Default_Map => "", Directory => File::Spec->curdir(), File => 'Name', Internal_Only_Warning => 1, Perl_Extension => 1, - Range_Size_1 => 1, + Range_Size_1 => \&output_perl_charnames_line, Type => $STRING, ); @@ -9309,7 +9369,7 @@ END # Certain fields just haven't been empty so far in any Unicode # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, # $CATEGORY. This leaves just the two fields, and so we hard-code in - # the defaults; which are verly unlikely to ever change. + # the defaults; which are very unlikely to ever change. $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; @@ -9347,32 +9407,21 @@ END $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); $in_range = 0; } - # XXX until charnames catches up. -# if ($fields[$CHARNAME] =~ s/- $cp $//x) { -# -# # These are code points whose names end in their code points, -# # which means the names are algorithmically derivable from the -# # code points. To shorten the output Name file, the algorithm -# # for deriving these is placed in the file instead of each -# # code point, so they have map type $CP_IN_NAME -# $fields[$CHARNAME] = $CMD_DELIM -# . $MAP_TYPE_CMD -# . '=' -# . $CP_IN_NAME -# . $CMD_DELIM -# . $fields[$CHARNAME]; -# } - $fields[$NAME] = $fields[$CHARNAME]; + if ($fields[$CHARNAME] =~ s/- $cp $//x) { - # Some official names are really two alternate names with one in - # parentheses. What we do here is use the full official one for - # the standard property (stored just above), but for the charnames - # table, we add two more entries, one for each of the alternate - # ones. - # elsif name ne "" - #check_and_handle_compound_name($cp, $fields[$CHARNAME]); - #check_and_handle_compound_name($cp, $unicode_1_name); - # XXX until charnames catches up. + # These are code points whose names end in their code points, + # which means the names are algorithmically derivable from the + # code points. To shorten the output Name file, the algorithm + # for deriving these is placed in the file instead of each + # code point, so they have map type $CP_IN_NAME + $fields[$CHARNAME] = $CMD_DELIM + . $MAP_TYPE_CMD + . '=' + . $CP_IN_NAME + . $CMD_DELIM + . $fields[$CHARNAME]; + } + $fields[$NAME] = $fields[$CHARNAME]; } elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { $fields[$CHARNAME] = $fields[$NAME] = $1; @@ -9500,6 +9549,7 @@ END # code in this subroutine that does the same thing, but doesn't # know about these ranges. $_ = ""; + return; } @@ -9575,7 +9625,6 @@ END # essentially be this code.) This uses the algorithm published by # Unicode. if (property_ref('Decomposition_Mapping')->to_output_map) { - local $to_trace = 1 if main::DEBUG; for (my $S = $SBase; $S < $SBase + $SCount; $S++) { use integer; my $SIndex = $S - $SBase; @@ -9744,7 +9793,6 @@ sub process_NamedSequences { # # This just adds the sequence to an array for later handling - return; # XXX Until charnames catches up my $file = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -9755,7 +9803,12 @@ sub process_NamedSequences { "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); next; } - push @named_sequences, "$sequence\t\t$name"; + + # Note single \t in keeping with special output format of + # Perl_charnames. But it turns out that the code points don't have to + # be 5 digits long, like the rest, based on the internal workings of + # charnames.pm. This could be easily changed for consistency. + push @named_sequences, "$sequence\t$name"; } return; } @@ -9858,7 +9911,7 @@ sub setup_special_casing { # The simple version's name in each mapping merely has an 's' in front # of the full one's my $simple = property_ref('s' . $case); - $simple->initialize($case) if $simple->to_output_map(); + $simple->initialize($full) if $simple->to_output_map(); } return; @@ -10981,14 +11034,14 @@ sub compile_perl() { Initialize => $Graph & $ASCII, ); - my $Print = $perl->add_match_table('Print', + $print = $perl->add_match_table('Print', Description => 'Characters that are graphical plus space characters (but no controls)', Initialize => $Blank + $Graph - $gc->table('Control'), ); $perl->add_match_table("PosixPrint", - Description => + Description => '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', - Initialize => $Print & $ASCII, + Initialize => $print & $ASCII, ); my $Punct = $perl->add_match_table('Punct'); @@ -11128,7 +11181,6 @@ sub compile_perl() { $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); } - my $perl_charname = property_ref('Perl_Charnames'); # Was previously constructed to contain both Name and Unicode_1_Name my @composition = ('Name', 'Unicode_1_Name'); @@ -11166,27 +11218,6 @@ END $comment .= ", and $composition[-1]"; } - # Wait for charnames to catch up -# foreach my $entry (@more_Names, -# split "\n", <<"END" -#000A; LF -#000C; FF -#000D; CR -#0085; NEL -#200C; ZWNJ -#200D; ZWJ -#FEFF; BOM -#FEFF; BYTE ORDER MARK -#END -# ) { -# #local $to_trace = 1 if main::DEBUG; -# trace $entry if main::DEBUG && $to_trace; -# my ($code_point, $name) = split /\s*;\s*/, $entry; -# $code_point = hex $code_point; -# trace $code_point, $name if main::DEBUG && $to_trace; -# $perl_charname->add_duplicate($code_point, $name); -# } -# #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence"); $perl_charname->add_comment(join_lines( < to be specified, which also contains related functions viacode() -and vianame(). +charnames;> to be specified, which also contains related functions viacode(), +vianame(), and string_vianame(). =head1 Unicode regular expression properties that are NOT accepted by Perl @@ -12647,8 +12678,8 @@ accessible through the Perl core, although some may be accessed indirectly. For example, the uc() function implements the Uppercase_Mapping property and uses the F file found in this directory. -The available files with their properties (short names in parentheses), -and any flags or comments about them, are: +The available files in the current installation, with their properties (short +names in parentheses), and any flags or comments about them, are: @map_tables_actually_output @@ -13133,12 +13164,11 @@ sub generate_separator($) { . $spaces_after; } -sub generate_tests($$$$$$) { +sub generate_tests($$$$$) { # This used only for making the test script. It generates test cases that # are expected to compile successfully in perl. Note that the lhs and # rhs are assumed to already be as randomized as the caller wants. - my $file_handle = shift; # Where to output the tests my $lhs = shift; # The property: what's to the left of the colon # or equals separator my $rhs = shift; # The property value; what's to the right @@ -13155,35 +13185,31 @@ sub generate_tests($$$$$$) { # The whole 'property=value' my $name = "$lhs$separator$rhs"; + my @output; # Create a complete set of tests, with complements. if (defined $valid_code) { - printf $file_handle - qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/; - printf $file_handle - qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/; - printf $file_handle - qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/; - printf $file_handle - qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/; + push @output, <<"EOC" +Expect(1, $valid_code, '\\p{$name}', $warning); +Expect(0, $valid_code, '\\p{^$name}', $warning); +Expect(0, $valid_code, '\\P{$name}', $warning); +Expect(1, $valid_code, '\\P{^$name}', $warning); +EOC } if (defined $invalid_code) { - printf $file_handle - qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/; - printf $file_handle - qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/; - printf $file_handle - qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/; - printf $file_handle - qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/; - } - return; + push @output, <<"EOC" +Expect(0, $invalid_code, '\\p{$name}', $warning); +Expect(1, $invalid_code, '\\p{^$name}', $warning); +Expect(1, $invalid_code, '\\P{$name}', $warning); +Expect(0, $invalid_code, '\\P{^$name}', $warning); +EOC + } + return @output; } -sub generate_error($$$$) { +sub generate_error($$$) { # This used only for making the test script. It generates test cases that # are expected to not only not match, but to be syntax or similar errors - my $file_handle = shift; # Where to output to. my $lhs = shift; # The property: what's to the left of the # colon or equals separator my $rhs = shift; # The property value; what's to the right @@ -13200,9 +13226,10 @@ sub generate_error($$$$) { my $property = $lhs . $separator . $rhs; - print $file_handle qq/Error('\\p{$property}');\n/; - print $file_handle qq/Error('\\P{$property}');\n/; - return; + return <<"EOC"; +Error('\\p{$property}'); +Error('\\P{$property}'); +EOC } # These are used only for making the test script @@ -13368,14 +13395,6 @@ sub make_property_test_script() { $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name - force_unlink ($t_path); - push @files_actually_output, $t_path; - my $OUT; - if (not open $OUT, "> $t_path") { - Carp::my_carp("Can't open $t_path. Skipping: $!"); - return; - } - # Keep going down an order of magnitude # until find that adding this quantity to # 1 remains 1; but put an upper limit on @@ -13392,7 +13411,10 @@ sub make_property_test_script() { # use previous one $min_floating_slop = $next; } - print $OUT $HEADER, ; + + # It doesn't matter whether the elements of this array contain single lines + # or multiple lines. main::write doesn't count the lines. + my @output; foreach my $property (property_ref('*')) { foreach my $table ($property->tables) { @@ -13427,10 +13449,9 @@ sub make_property_test_script() { my $already_error = ! $table->file_path; # Generate error cases for this alias. - generate_error($OUT, - $property_name, - $table_name, - $already_error); + push @output, generate_error($property_name, + $table_name, + $already_error); # If the table is guaranteed to always generate an error, # quit now without generating success cases. @@ -13451,13 +13472,12 @@ sub make_property_test_script() { # Don't output duplicate test cases. if (! exists $test_generated{$test_name}) { $test_generated{$test_name} = 1; - generate_tests($OUT, - $property_name, - $standard, - $valid, - $invalid, - $warning, - ); + push @output, generate_tests($property_name, + $standard, + $valid, + $invalid, + $warning, + ); } $random = randomize_loose_name($table_name) } @@ -13469,13 +13489,12 @@ sub make_property_test_script() { my $test_name = "$property_name=$random"; if (! exists $test_generated{$test_name}) { $test_generated{$test_name} = 1; - generate_tests($OUT, - $property_name, - $random, - $valid, - $invalid, - $warning, - ); + push @output, generate_tests($property_name, + $random, + $valid, + $invalid, + $warning, + ); # If the name is a rational number, add tests for the # floating point equivalent. @@ -13517,24 +13536,22 @@ sub make_property_test_script() { if abs($table_name - $existing) < $MAX_FLOATING_SLOP; } - generate_error($OUT, - $property_name, - $table_name, - 1 # 1 => already an error - ); + push @output, generate_error($property_name, + $table_name, + 1 # 1 => already an error + ); } else { # Here the number of digits exceeds the # minimum we think is needed. So generate a # success test case for it. - generate_tests($OUT, - $property_name, - $table_name, - $valid, - $invalid, - $warning, - ); + push @output, generate_tests($property_name, + $table_name, + $valid, + $invalid, + $warning, + ); } } } @@ -13543,12 +13560,10 @@ sub make_property_test_script() { } } - foreach my $test (@backslash_X_tests) { - print $OUT "Test_X('$test');\n"; - } - - print $OUT "Finished();\n"; - close $OUT; + &write($t_path, [, + @output, + (map {"Test_X('$_');\n"} @backslash_X_tests), + "Finished();\n"]); return; } @@ -13631,7 +13646,6 @@ my @input_file_objects = ( Each_Line_Handler => \&filter_jamo_line, ), Input_file->new('UnicodeData.txt', v1.1.5, -non_skip => 1, Pre_Handler => \&setup_UnicodeData, # We clean up this file for some early versions. @@ -13824,15 +13838,19 @@ File::Find::find({ }, File::Spec->curdir()); my @mktables_list_output_files; +my $old_start_time = 0; -if ($write_unchanged_files) { +if (! -e $file_list) { + print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; + $write_unchanged_files = 1; +} elsif ($write_unchanged_files) { print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; } else { print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; my $file_handle; if (! open $file_handle, "<", $file_list) { - Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!"); + Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); $glob_list = 1; } else { @@ -13843,6 +13861,9 @@ else { for my $list ( \@input, \@mktables_list_output_files ) { while (<$file_handle>) { s/^ \s+ | \s+ $//xg; + if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) { + $old_start_time = $1; + } next if /^ \s* (?: \# .* )? $/x; last if /^ =+ $/x; my ( $file ) = split /\t/; @@ -13950,12 +13971,12 @@ if ( $verbosity >= $VERBOSE ) { "Checking ".scalar( @mktables_list_output_files )." output files.\n"; } -# We set $youngest to be the most recently changed input file, including this -# program itself (done much earlier in this file) +# We set $most_recent to be the most recently changed input file, including +# this program itself (done much earlier in this file) foreach my $in (@input_files) { - my $age = -M $in; - next unless defined $age; # Keep going even if missing a file - $youngest = $age if $age < $youngest; + next unless -e $in; # Keep going even if missing a file + my $mod_time = (stat $in)[9]; + $most_recent = $mod_time if $mod_time > $most_recent; # See that the input files have distinct names, to warn someone if they # are adding a new one @@ -13968,30 +13989,31 @@ foreach my $in (@input_files) { } } -my $ok = ! $write_unchanged_files - && scalar @mktables_list_output_files; # If none known, rebuild +my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild + || ! scalar @mktables_list_output_files # or if no outputs known + || $old_start_time < $most_recent; # or out-of-date # Now we check to see if any output files are older than youngest, if # they are, we need to continue on, otherwise we can presumably bail. -if ($ok) { +if (! $rebuild) { foreach my $out (@mktables_list_output_files) { if ( ! file_exists($out)) { print "'$out' is missing.\n" if $verbosity >= $VERBOSE; - $ok = 0; + $rebuild = 1; last; } #local $to_trace = 1 if main::DEBUG; - trace $youngest, -M $out if main::DEBUG && $to_trace; - if ( -M $out > $youngest ) { - #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace; + trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; + if ( (stat $out)[9] <= $most_recent ) { + #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; print "'$out' is too old.\n" if $verbosity >= $VERBOSE; - $ok = 0; + $rebuild = 1; last; } } } -if ($ok) { - print "Files seem to be ok, not bothering to rebuild.\n"; +if (! $rebuild) { + print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; exit(0); } print "Must rebuild tables.\n" if $verbosity >= $VERBOSE; @@ -14034,11 +14056,12 @@ if ( $file_list and $make_list ) { return } else { + my $localtime = localtime $start_time; print $ofh <<"END"; # # $file_list -- File list for $0. # -# Autogenerated on @{[scalar localtime]} +# Autogenerated starting on $start_time ($localtime) # # - First section is input files # ($0 itself is not listed but is automatically considered an input) @@ -14091,6 +14114,11 @@ __DATA__ use strict; use warnings; +# If run outside the normal test suite on an ASCII platform, you can +# just create a latin1_to_native() function that just returns its +# inputs, because that's the only function used from test.pl +require "test.pl"; + # Test qr/\X/ and the \p{} regular expression constructs. This file is # constructed by mktables from the tables it generates, so if mktables is # buggy, this won't necessarily catch those bugs. Tests are generated for all @@ -14103,42 +14131,6 @@ use warnings; my $Tests = 0; my $Fails = 0; -my $non_ASCII = (ord('A') != 65); - -# The 256 8-bit characters in ASCII ordinal order, with the ones that don't -# have Perl names replaced by -1 -my @ascii_ordered_chars = ( - "\0", - (-1) x 6, - "\a", "\b", "\t", "\n", - -1, # No Vt - "\f", "\r", - (-1) x 18, - " ", "!", "\"", "#", '$', "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", - ":", ";", "<", "=", ">", "?", "@", - "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", - "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", - "[", "\\", "]", "^", "_", "`", - "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", - "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", - "{", "|", "}", "~", - (-1) x 129 -); - -sub ASCII_ord_to_native ($) { - # Converts input ordinal number to the native one, if can be done easily. - # Returns -1 otherwise. - - my $ord = shift; - - return $ord if $ord > 255 || ! $non_ASCII; - my $result = $ascii_ordered_chars[$ord]; - return $result if $result eq '-1'; - return ord($result); -} - sub Expect($$$$) { my $expected = shift; my $ord = shift; @@ -14146,17 +14138,7 @@ sub Expect($$$$) { my $warning_type = shift; # Type of warning message, like 'deprecated' # or empty if none my $line = (caller)[2]; - - # Convert the non-ASCII code points expressible as characters to their - # ASCII equivalents, and skip the others. - $ord = ASCII_ord_to_native($ord); - if ($ord < 0) { - $Tests++; - print "ok $Tests - " - . sprintf("\"\\x{%04X}\"", $ord) - . " =~ $regex # Skipped: non-ASCII\n"; - return; - } + $ord = ord(latin1_to_native(chr($ord))); # Convert the code point to hex form my $string = sprintf "\"\\x{%04X}\"", $ord; @@ -14287,13 +14269,7 @@ sub Test_X($) { my $this_string = ""; my $this_display = ""; foreach my $code_point (@code_points) { - my $ord = ASCII_ord_to_native(hex $code_point); - if ($ord < 0) { - $Tests++; - print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n"; - return; - } - $this_string .= chr $ord; + $this_string .= latin1_to_native(chr(hex $code_point)); $this_display .= "\\x{$code_point}"; }