# changed 0+$self to pack 'J', $self.)
my $start_time;
-BEGIN { # Get the time the script started running; do it at compiliation to
+BEGIN { # Get the time the script started running; do it at compilation to
# get it as close as possible
$start_time= time;
}
# their rational equivalent
my %loose_property_name_of; # Loosely maps property names to standard form
+# Most properties are immune to caseless matching, otherwise you would get
+# nonsensical results, as properties are a function of a code point, not
+# everything that is caselessly equivalent to that code point. For example,
+# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
+# be true because 's' and 'S' are equivalent caselessly. However,
+# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
+# extend that concept to those very few properties that are like this. Each
+# such property will match the full range caselessly. They are hard-coded in
+# the program; it's not worth trying to make it general as it's extremely
+# unlikely that they will ever change.
+my %caseless_equivalent_to;
+
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables. The '_string' versions are so generated tables can retain the
my @annotate_char_type; # Contains a type of those characters, specifically
# for the purposes of annotation.
my $annotate_ranges; # A map of ranges of code points that have the same
- # name for the purposes of annoation. They map to the
+ # name for the purposes of annotation. They map to the
# upper edge of the range, so that the end point can
# be immediately found. This is used to skip ahead to
# the end of a range, and avoid processing each
trace "i =[", $i, "]", $r->[$i];
trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
- trace "removed @return";
+ trace "removed ", @return if @return;
}
# An actual deletion could have changed the maximum in the list.
# the character very frequently used.
return $try_hard if $code == 0x0000;
- return 0 if $try_hard; # XXX Temporary until fix utf8.c
-
# shun non-character code points.
return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
# A comment about its being obsolete, or whatever non normal status it has
main::set_access('status_info', \%status_info, 'r');
+ my %caseless_equivalent;
+ # The table this is equivalent to under /i matching, if any.
+ main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
+
my %range_size_1;
# Is the table to be output with each range only a single code point?
# This is done to avoid breaking existing code that may have come to rely
$status{$addr} = delete $args{'Status'} || $NORMAL;
$status_info{$addr} = delete $args{'_Status_Info'} || "";
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
+ $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
my $description = delete $args{'Description'};
my $externally_ok = delete $args{'Externally_Ok'};
# not, is normal. The lists are prioritized so the most serious
# ones are checked first
if (exists $why_suppressed{$complete_name}
- # Don't suppress if overriden
+ # Don't suppress if overridden
&& ! grep { $_ eq $complete_name{$addr} }
@output_mapped_properties)
{
# The pack() below can't cope with surrogates.
if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
- Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
+ Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
next;
}
# not quite so many.
# If they are related, one must be a perl extension. This is because
# we can't guarantee that Unicode won't change one or the other in a
- # later release even if they are idential now.
+ # later release even if they are identical now.
my $self = shift;
my $other = shift;
my $status = $other->status;
my $status_info = $other->status_info;
my $matches_all = $matches_all{other_addr};
+ my $caseless_equivalent = $other->caseless_equivalent;
foreach my $table ($current_leader, @{$equivalents{$leader}}) {
next if $table == $other;
trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
$parent{$table_addr} = $other;
push @{$children{$other_addr}}, $table;
$table->set_status($status, $status_info);
+ $self->set_caseless_equivalent($caseless_equivalent);
}
}
# each of them is stored in %alias_to_property_of as they are defined.
# But it's possible that this subroutine will be called with some
# variant, so if the initial lookup fails, it is repeated with the
- # standarized form of the input name. If found, besides returning the
+ # standardized form of the input name. If found, besides returning the
# result, the input name is added to the list so future calls won't
# have to do the conversion again.
. " argument to '-='. Subtraction ignored.");
return $self;
}
- elsif ($reversed) { # Shouldnt happen in a -=, but just in case
+ elsif ($reversed) { # Shouldn't happen in a -=, but just in case
Carp::my_carp_bug("Can't cope with a "
. __PACKAGE__
. " being the first parameter in a '-='. Subtraction ignored.");
# A blank separates the joined lines except if there is a break; an extra
# blank is inserted after a period ending a line.
- # Intialize the return with the first line.
+ # Initialize the return with the first line.
my ($return, @lines) = split "\n", shift;
# If the first line is null, it was an empty line, add the \n back in
$name =~ s/^\s+//g;
$name =~ s/\s+$//g;
- # Convert interior white space and hypens into underscores.
+ # Convert interior white space and hyphens into underscores.
$name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
# Capitalize the letter following an underscore, and convert a sequence of
return lc $name;
}
+sub utf8_heavy_name ($$) {
+ # Returns the name that utf8_heavy.pl will use to find a table. XXX
+ # perhaps this function should be placed somewhere, like Heavy.pl so that
+ # utf8_heavy can use it directly without duplicating code that can get
+ # out-of sync.
+
+ my $table = shift;
+ my $alias = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $property = $table->property;
+ $property = ($property == $perl)
+ ? "" # 'perl' is never explicitly stated
+ : standardize($property->name) . '=';
+ if ($alias->loose_match) {
+ return $property . standardize($alias->name);
+ }
+ else {
+ return lc ($property . $alias->name);
+ }
+
+ return;
+}
+
{ # Closure
my $indent_increment = " " x 2;
my $fold = property_ref('Case_Folding');
$fold->set_file('Fold') if defined $fold;
- # utf8.c can't currently cope with non range-size-1 for these, and even if
- # it were changed to do so, someone else may be using them, expecting the
- # old style
+ # utf8.c has a different meaning for non range-size-1 for map properties
+ # that this program doesn't currently handle; and even if it were changed
+ # to do so, some other code may be using them expecting range size 1.
foreach my $property (qw {
Case_Folding
Lowercase_Mapping
#
# meaning the codepoints in the range all have the value 'map' under
# 'property'.
- # Beginning and trailing white space in each field are not signficant.
+ # Beginning and trailing white space in each field are not significant.
# Note there is not a trailing semi-colon in the above. A trailing
# semi-colon means the map is a null-string. An omitted map, as
# opposed to a null-string, is assumed to be 'Y', based on Unicode
# file, in any order, interspersed in any way. The first time a
# property is seen, it gets information about that property and
# caches it for quick retrieval later. It also normalizes the maps
- # so that only one of many synonym is stored. The Unicode input files
- # do use some multiple synonyms.
+ # so that only one of many synonyms is stored. The Unicode input
+ # files do use some multiple synonyms.
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# If the map begins with a special command to us (enclosed in
# delimiters), extract the command(s).
- if (substr($map, 0, 1) eq $CMD_DELIM) {
- while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
- my $command = $1;
- if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
- $replace = $1;
- }
- elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
- $map_type = $1;
- }
- else {
- $file->carp_bad_line("Unknown command line: '$1'");
- next LINE;
- }
+ while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
+ my $command = $1;
+ if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
+ $replace = $1;
+ }
+ elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
+ $map_type = $1;
+ }
+ else {
+ $file->carp_bad_line("Unknown command line: '$1'");
+ next LINE;
}
}
}
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorthimically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
# one.
# Titlecase duplicates UnicodeData.txt: gc=lt
# Unassigned Code Value duplicates UnicodeData.txt: gc=cc
- # Zero-width never made into offical property;
+ # Zero-width never made into official property;
# subset of gc=cf
# Most of the properties have the same names in this file as in later
# versions, but a couple do not.
}
# Add any remaining code points to the mapping, using the default for
- # missing code points
+ # missing code points.
if (defined (my $default_map = $property->default_map)) {
- foreach my $range ($property->inverse_list->ranges) {
- $property->add_map($range->start, $range->end, $default_map);
- }
+
+ # This fills in any missing values with the default.
+ $property->add_map(0, $LAST_UNICODE_CODEPOINT,
+ $default_map, Replace => $NO);
# Make sure there is a match table for the default
if (! defined $property->table($default_map)) {
$LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
my $Cs = $gc->table('Cs');
- if (defined $Cs) {
- $Cs->add_note('Mostly not usable in Perl.');
- $Cs->add_comment(join_lines(<<END
-Surrogates are used exclusively for I/O in UTF-16, and should not appear in
-Unicode text, and hence their use will generate (usually fatal) messages
-END
- ));
- }
# Folding information was introduced later into Unicode data. To get
my $description_start = "Code point's usage introduced in version ";
$first_age->add_description($description_start . $first_age->name);
- # To construct the accumlated values, for each of the age tables
+ # To construct the accumulated values, for each of the age tables
# starting with the 2nd earliest, merge the earliest with it, to get
# all those code points existing in the 2nd earliest. Repeat merging
# the new 2nd earliest with the 3rd earliest to get all those existing
sub register_file_for_name($$$) {
# Given info about a table and a datafile that it should be associated
- # with, register that assocation
+ # with, register that association
my $table = shift;
my $directory_ref = shift; # Array of the directory path for the file
my $deprecated = ($table->status eq $DEPRECATED)
? $table->status_info
: "";
+ my $caseless_equivalent = $table->caseless_equivalent;
# And for each of the table's aliases... This inner loop eventually
# goes through all aliases in the UCD that we generate regex match
# files for
foreach my $alias ($table->aliases) {
- my $name = $alias->name;
+ my $standard = utf8_heavy_name($table, $alias);
# Generate an entry in either the loose or strict hashes, which
# will translate the property and alias names combination into the
# file where the table for them is stored.
- my $standard;
if ($alias->loose_match) {
- $standard = $property . standardize($alias->name);
if (exists $loose_to_file_of{$standard}) {
Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
}
}
}
else {
- $standard = lc ($property . $name);
if (exists $stricter_to_file_of{$standard}) {
Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
}
# will work. Also note that this assumes that such a
# number is matched strictly; so if that were to change,
# this would be wrong.
- if ((my $integer_name = $name)
+ if ((my $integer_name = $alias->name)
=~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
{
$stricter_to_file_of{$property . $integer_name}
if ($deprecated) {
$utf8::why_deprecated{$sub_filename} = $deprecated;
}
+
+ # And a substitute table, if any, for case-insensitive matching
+ if ($caseless_equivalent != 0) {
+ $caseless_equivalent_to{$standard} = $caseless_equivalent;
+ }
}
}
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
- $parenthesized .= '(';
$parenthesized .= "Short: $short_name" if $short_name;
if ($short_name && $single_form) {
$parenthesized .= ', ';
# to go on every entry.
my $conflicting = join " NOR ", $table->conflicting;
if ($conflicting) {
- $parenthesized .= '(' if ! $parenthesized;
- $parenthesized .= '; ' if $parenthesized ne '(';
+ $parenthesized .= '; ' if $parenthesized ne "";
$parenthesized .= "NOT $conflicting";
}
- $parenthesized .= ')' if $parenthesized;
- push @info, $parenthesized if $parenthesized;
+ push @info, "($parenthesized)" if $parenthesized;
if ($table_property != $perl && $table->perl_extension) {
push @info, '(Perl extension)';
=back
Some properties are considered obsolete, but still available. There are
-several varieties of obsolesence:
+several varieties of obsolescence:
=over 4
@block_warning
The table below has two columns. The left column contains the \\p{}
-constructs to look up, possibly preceeded by the flags mentioned above; and
+constructs to look up, possibly preceded by the flags mentioned above; and
the right column contains information about them, like a description, or
synonyms. It shows both the single and compound forms for each property that
has them. If the left column is a short name for a property, the right column
push @heavy, <<END;
);
+# A few properties have different behavior under /i matching. This maps the
+# those to substitute files to use under /i.
+\%utf8::caseless_equivalent = (
+END
+
+
+ # We set the key to the file when we associated files with tables, but we
+ # couldn't do the same for the value then, as we might not have the file
+ # for the alternate table figured out at that time.
+ foreach my $cased (keys %caseless_equivalent_to) {
+ my @path = $caseless_equivalent_to{$cased}->file_path;
+ my $path = join '/', @path[1, -1];
+ $path =~ s/\.pl//;
+ $utf8::caseless_equivalent_to{$cased} = $path;
+ }
+ push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
+ push @heavy, <<END;
+);
+
1;
END
$filename = $table->file;
}
- # Use specified filename if avaliable, or default to property's
+ # Use specified filename if available, or default to property's
# shortest name. We need an 8.3 safe filename (which means "an 8
# safe" filename, since after the dot is only 'pl', which is < 3)
# The 2nd parameter is if the filename shouldn't be changed, and
#
# - First section is input files
# ($0 itself is not listed but is automatically considered an input)
-# - Section seperator is /^=+\$/
+# - Section separator is /^=+\$/
# - Second section is a list of output files.
# - Lines matching /^\\s*#/ are treated as comments
# which along with blank lines are ignored.