}
}
+my @warnings;
+local $SIG{__WARN__} = sub { push @warnings, @_ };
+
use strict;
use Unicode::UCD;
use Test::More;
use Unicode::UCD 'charinfo';
+my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by
+$/ = $input_record_separator; # setting this.
+
my $charinfo;
is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef");
# If this fails, then maybe one should look at the Unicode changes to see
# what else might need to be updated.
-is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion');
+is(Unicode::UCD::UnicodeVersion, '7.0.0', 'UnicodeVersion');
use Unicode::UCD qw(compexcl);
is($casefold->{simple}, "", 'casefold 0xDF simple');
is($casefold->{turkic}, "", 'casefold 0xDF turkic');
-# Do different tests depending on if version <= 3.1, or not.
-(my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/;
-if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) {
+# Do different tests depending on if version < 3.2, or not.
+my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion();
+if ($v_unicode_version lt v3.2.0) {
$casefold = casefold(0x130);
is($casefold->{code}, '0130', 'casefold 0x130 code');
{
my $r1 = charscript('Latin');
- my $n1 = @$r1;
- is($n1, 30, "number of ranges in Latin script (Unicode 6.0.0)");
- shift @$r1 while @$r1;
- my $r2 = charscript('Latin');
- is(@$r2, $n1, "modifying results should not mess up internal caches");
+ if (ok(defined $r1, "Found Latin script")) {
+ my $n1 = @$r1;
+ is($n1, 33, "number of ranges in Latin script (Unicode 7.0.0)");
+ shift @$r1 while @$r1;
+ my $r2 = charscript('Latin');
+ is(@$r2, $n1, "modifying results should not mess up internal caches");
+ }
}
{
is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');
+is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000');
# Create a user-defined property
sub InKana {<<'END'}
"prop_aliases('isgc') returns <undef> since is not covered Perl extension");
is(prop_aliases("Is_Is_Any"), undef,
"prop_aliases('Is_Is_Any') returns <undef> since two is's");
+is(prop_aliases("ccc=vr"), undef,
+ "prop_aliases('ccc=vr') doesn't generate a warning");
require 'utf8_heavy.pl';
require "unicore/Heavy.pl";
], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names");
# Get the official Unicode property name synonyms and test them.
+
+SKIP: {
+skip "PropertyAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0;
open my $props, "<", "../lib/unicore/PropertyAliases.txt"
or die "Can't open Unicode PropertyAliases.txt";
+local $/ = "\n";
while (<$props>) {
s/\s*#.*//; # Remove comments
next if /^\s* $/x; # Ignore empty and comment lines
chomp;
+ local $/ = $input_record_separator;
my $count = 0; # 0th field in line is short name; 1th is long name
my $short_name;
my $full_name;
# matching, which the tested function does on all inputs.
my $mod_name = "$extra_chars$alias";
- my $loose = utf8::_loose_name(lc $alias);
+ my $loose = &utf8::_loose_name(lc $alias);
# Indicate we have tested this.
$props{$loose} = 1;
$count++;
}
}
+} # End of SKIP block
# Now test anything we can find that wasn't covered by the tests of the
# official properties. We have no way of knowing if mktables omitted a Perl
# extension or not, but we do the best we can from its generated lists
-foreach my $alias (keys %utf8::loose_to_file_of) {
+foreach my $alias (sort keys %utf8::loose_to_file_of) {
next if $alias =~ /=/;
my $lc_name = lc $alias;
- my $loose = utf8::_loose_name($lc_name);
+ my $loose = &utf8::_loose_name($lc_name);
next if exists $props{$loose}; # Skip if already tested
$props{$loose} = 1;
my $mod_name = "$extra_chars$alias"; # Tests loose matching
my @aliases = prop_aliases($mod_name);
- my $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+ my $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases;
if ($found_it) {
pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'");
}
# returned as an alias, so having successfully stripped it off above,
# try again.
if ($stripped) {
- $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+ $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases;
}
# If that didn't work, it could be that it's a block, which is always
# returned with a leading 'In_' to avoid ambiguity. Try comparing
# with that stripped off.
if (! $found_it) {
- $found_it = grep { utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name }
+ $found_it = grep { &utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name }
@aliases;
# Could check that is a real block, but tests for invmap will
# likely pick up any errors, since this will be tested there.
# correct.
my %pva_tested; # List of things already tested.
+
+SKIP: {
+skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0;
open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt"
or die "Can't open Unicode PropValueAliases.txt";
+local $/ = "\n";
while (<$propvalues>) {
s/\s*#.*//; # Remove comments
next if /^\s* $/x; # Ignore empty and comment lines
chomp;
+ local $/ = $input_record_separator;
+
+ # Fix typo in official input file
+ s/CCC133/CCC132/g if $v_unicode_version eq v6.1.0;
my @fields = split /\s*;\s*/; # Fields are separated by semi-colons
my $prop = shift @fields; # 0th field is the property,
$fields[0] = $fields[1];
}
elsif ($fields[0] ne $fields[1]
- && utf8::_loose_name(lc $fields[0])
- eq utf8::_loose_name(lc $fields[1])
+ && &utf8::_loose_name(lc $fields[0])
+ eq &utf8::_loose_name(lc $fields[1])
&& $fields[1] !~ /[[:upper:]]/)
{
# Also, there is a bug in the file in which "n/a" is omitted, and
# the short and full names, respectively. See comments in input file.
splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc';
- my $loose_prop = utf8::_loose_name(lc $prop);
+ my $loose_prop = &utf8::_loose_name(lc $prop);
my $suppressed = grep { $_ eq $loose_prop }
@Unicode::UCD::suppressed_properties;
foreach my $value (@fields) {
is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop");
next;
}
- elsif (grep { $_ eq ("$loose_prop=" . utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) {
+ elsif (grep { $_ eq ("$loose_prop=" . &utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) {
is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value");
next;
}
else {
my @all_names = prop_value_aliases($mod_prop, $mod_value);
is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'");
- ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')");
+ ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')");
}
- $pva_tested{utf8::_loose_name(lc $prop) . "=" . utf8::_loose_name(lc $value)} = 1;
+ $pva_tested{&utf8::_loose_name(lc $prop) . "=" . &utf8::_loose_name(lc $value)} = 1;
$count++;
}
}
+} # End of SKIP block
# And test as best we can, the non-official pva's that mktables generates.
foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) {
- foreach my $test (keys %$hash) {
+ foreach my $test (sort keys %$hash) {
next if exists $pva_tested{$test}; # Skip if already tested
my ($prop, $value) = split "=", $test;
is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')");
}
else {
- ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) }
+ ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) }
prop_value_aliases($mod_prop, $mod_value)),
"'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')");
}
# whole thing.
my $prop = "uc";
my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41");
+# 'upper' is expected to report format 's' with 'N' as its missing value;
+# the test names below state exactly the values being checked.
$prop = "upper";
($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
$prop = "lower";
($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
$prop = "lc";
($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
require File::Temp;
my $off = File::Temp->new();
+ local $/ = "\n";
chomp $official;
print $off $official, "\n";
close $off || die "Can't close official";
# strict
foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of)
{
- foreach my $table (keys %$set_of_tables) {
+ foreach my $table (sort keys %$set_of_tables) {
my $mod_table;
my ($prop_only, $value) = split "=", $table;
}
else { # Single-form.
- # Like above, use looose if required, and insert underscores
+ # Like above, use loose if required, and insert underscores
# between digits if strict.
if ($set_of_tables == \%utf8::loose_to_file_of) {
$mod_table = "$extra_chars$table";
}
$tested_invlist{$file} = dclone \@tested;
- # A leading '!' in the file name means that it is to be inverted.
- my $invert = $file =~ s/^!//;
- my $official = do "unicore/lib/$file.pl";
+ # A '!' in the file name means that it is to be inverted.
+ my $invert = $file =~ s/!//;
+ my $official;
+
+ # If the file's directory is '#', it is a special case where the
+ # contents are in-lined with semi-colons meaning new-lines, instead of
+ # it being an actual file to read. The file is an index into the
+ # array of the definitions
+ if ($file =~ s!^#/!!) {
+ $official = $utf8::inline_definitions[$file];
+ }
+ else {
+ $official = do "unicore/lib/$file.pl";
+ }
# Get rid of any trailing space and comments in the file.
$official =~ s/\s*(#.*)?$//mg;
+ local $/ = "\n";
chomp $official;
+ $/ = $input_record_separator;
# If we are to test against an inverted file, it is easier to invert
# our array than the file.
- # The file only is valid for Unicode code points, while the inversion
- # list is valid for all possible code points. Therefore, we must test
- # just the Unicode part against the file. Later we will test for
- # the non-Unicode part.
-
- my $before_invert; # Saves the pre-inverted table.
if ($invert) {
- $before_invert = dclone \@tested;
if (@tested && $tested[0] == 0) {
shift @tested;
} else {
unshift @tested, 0;
}
- if (@tested && $tested[-1] == 0x110000) {
- pop @tested;
- }
- else {
- push @tested, 0x110000;
- }
}
# Now construct a string from the list that should match the file.
- # The file gives ranges of code points with starting and ending values
- # in hex, like this:
- # 0041\t005A
- # 0061\t007A
- # 00AA
- # Our list has even numbered elements start ranges that are in the
- # list, and odd ones that aren't in the list. Therefore the odd
- # numbered ones are one beyond the end of the previous range, but
- # otherwise don't get reflected in the file.
- my $tested = "";
- my $i = 0;
- for (; $i < @tested - 1; $i += 2) {
- my $start = $tested[$i];
- my $end = $tested[$i+1] - 1;
- if ($start == $end) {
- $tested .= sprintf("%04X\n", $start);
- }
- else {
- $tested .= sprintf "%04X\t%04X\n", $start, $end;
- }
- }
-
- # As mentioned earlier, the disk files only go up through Unicode,
- # whereas the prop_invlist() ones go as high as necessary. The
- # comparison is only valid through max Unicode.
- if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) {
- $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]);
- }
+ # The file is inversion list format code points, like this:
+ # V1216
+ # 65 # [26]
+ # 91
+ # 192 # [23]
+ # ...
+ # The V indicates it's an inversion list, and is followed immediately
+ # by the number of elements (lines) that follow giving its contents.
+ # The list has even numbered elements (0th, 2nd, ...) start ranges
+ # that are in the list, and odd ones that aren't in the list.
+ # Therefore the odd numbered ones are one beyond the end of the
+ # previous range, but otherwise don't get reflected in the file.
+ my $tested = join "\n", ("V" . scalar @tested), @tested;
+ local $/ = "\n";
chomp $tested;
+ $/ = $input_record_separator;
if ($tested ne $official) {
fail_with_diff($mod_table, $official, $tested, "prop_invlist");
next;
}
- # Here, it matched the table. Now need to check for if it is correct
- # for beyond Unicode. First, calculate if is the default table or
- # not. This is the same algorithm as used internally in
- # prop_invlist(), so if it is wrong there, this test won't catch it.
- my $prop = lc $table;
- ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
- if (defined $table) {
-
- # May have optional prefixed 'is'
- $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
- $prop = $utf8::loose_property_name_of{$prop};
- $prop .= "=" . utf8::_loose_name($table);
- }
- else {
- $prop = utf8::_loose_name($prop);
- }
- my $is_default = exists $Unicode::UCD::loose_defaults{$prop};
-
- @tested = @$before_invert if $invert; # Use the original
- if (@tested % 2 == 0) {
-
- # If there are an even number of elements, the final one starts a
- # range (going to infinity) of code points that are not in the
- # list.
- if ($is_default) {
- fail("prop_invlist('$mod_table')");
- diag("default table doesn't goto infinity");
- use Data::Dumper;
- diag Dumper \@tested;
- next;
- }
- }
- else {
- # An odd number of elements means the final one starts a range
- # (going to infinity of code points that are in the list.
- if (! $is_default) {
- fail("prop_invlist('$mod_table')");
- diag("non-default table needs to stop in the Unicode range");
- use Data::Dumper;
- diag Dumper \@tested;
- next;
- }
- }
-
pass("prop_invlist('$mod_table')");
}
}
@list = prop_invmap("Is_Is_Any");
is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's");
+# The files for these properties are not used by Perl, but are retained for
+# backwards compatibility with applications that read them directly, with
+# comments in them that their use is deprecated. Until such time as we remove
+# them completely, we test that they exist, are correct, and that their
+# formats haven't changed. This hash contains the info needed to test them as
+# if they were regular properties. 'replaced_by' gives the equivalent
+# property now used by Perl.
+my %legacy_props = (
+ Legacy_Case_Folding => { replaced_by => 'cf',
+ file => 'To/Fold',
+ swash_name => 'ToFold'
+ },
+ Legacy_Lowercase_Mapping => { replaced_by => 'lc',
+ file => 'To/Lower',
+ swash_name => 'ToLower'
+ },
+ Legacy_Titlecase_Mapping => { replaced_by => 'tc',
+ file => 'To/Title',
+ swash_name => 'ToTitle'
+ },
+ Legacy_Uppercase_Mapping => { replaced_by => 'uc',
+ file => 'To/Upper',
+ swash_name => 'ToUpper'
+ },
+ Legacy_Perl_Decimal_Digit => { replaced_by => 'Perl_Decimal_Digit',
+ file => 'To/Digit',
+ swash_name => 'ToDigit'
+ },
+ );
+
+foreach my $legacy_prop (keys %legacy_props) {
+ @list = prop_invmap($legacy_prop);
+ is(@list, 0, "'$legacy_prop' is unknown to prop_invmap");
+}
+
+# The files for these properties shouldn't have their formats changed in case
+# applications use them (though such use is deprecated).
+my @legacy_file_format = (keys %legacy_props,
+ qw( Bidi_Mirroring_Glyph
+ NFKC_Casefold
+ )
+ );
+
# The set of properties to test on has already been compiled into %props by
# the prop_aliases() tests.
# lists returned by prop_invlist(), which has already been tested.
PROPERTY:
-foreach my $prop (keys %props) {
- my $loose_prop = utf8::_loose_name(lc $prop);
+foreach my $prop (sort(keys %props), sort keys %legacy_props) {
+ my $is_legacy = 0;
+ my $loose_prop = &utf8::_loose_name(lc $prop);
my $suppressed = grep { $_ eq $loose_prop }
@Unicode::UCD::suppressed_properties;
+ my $actual_lookup_prop;
+ my $display_prop; # The property name that is displayed, as opposed
+ # to the one that is actually used.
+
# Find the short and full names that this property goes by
my ($name, $full_name) = prop_aliases($prop);
if (! $name) {
- if (! $suppressed) {
- fail("prop_invmap('$prop')");
- diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+
+ # Here, Perl doesn't know about this property. It could be a
+ # suppressed one, or a legacy one.
+ if (grep { $prop eq $_ } keys %legacy_props) {
+
+ # For legacy properties, we look up the modern equivalent
+ # property instead; later massaging the results to look like the
+ # known format of the legacy property. We add info about the
+ # legacy property to the data structures for the rest of the
+ # properties; this is to avoid more special cases for the legacies
+ # in the code below
+ $full_name = $name = $prop;
+ $actual_lookup_prop = $legacy_props{$prop}->{'replaced_by'};
+ my $base_file = $legacy_props{$prop}->{'file'};
+
+ # This legacy property is otherwise unknown to Perl; so shouldn't
+ # have any information about it already.
+ ok(! exists $utf8::loose_property_to_file_of{$loose_prop},
+ "There isn't a hash entry for file lookup of $prop");
+ $utf8::loose_property_to_file_of{$loose_prop} = $base_file;
+
+ ok(! exists $utf8::file_to_swash_name{$loose_prop},
+ "There isn't a hash entry for swash lookup of $prop");
+ $utf8::file_to_swash_name{$base_file}
+ = $legacy_props{$prop}->{'swash_name'};
+ $display_prop = $prop;
+ $is_legacy = 1;
}
+ else {
+ if (! $suppressed) {
+ fail("prop_invmap('$prop')");
+ diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+ }
next PROPERTY;
+ }
}
# Normalize the short name, as it is stored in the hashes under the
# normalized version.
- $name = utf8::_loose_name(lc $name);
+ $name = &utf8::_loose_name(lc $name);
# Add in the characters that are supposed to be ignored to test loose
# matching, which the tested function applies to all properties
- my $mod_prop = "$extra_chars$prop";
+ $display_prop = "$extra_chars$prop" unless $display_prop;
+ $actual_lookup_prop = $display_prop unless $actual_lookup_prop;
- my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop);
+ my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($actual_lookup_prop);
my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ];
+
+ # The legacy property files all are expanded out so that each range is 1
+ # element long. That isn't true of the modern equivalent we use to check
+ # those files for correctness against. So take the output of the proxy
+ # and expand it to match the legacy file.
+ if ($is_legacy) {
+ my @expanded_list;
+ my @expanded_map;
+ for my $i (0 .. @$invlist_ref - 1 - 1) {
+ if (ref $invmap_ref->[$i] || $invmap_ref->[$i] eq $missing) {
+
+ # No adjustments should be done for the default mapping and
+ # the multi-char ones.
+ push @expanded_list, $invlist_ref->[$i];
+ push @expanded_map, $invmap_ref->[$i];
+ }
+ else {
+
+ # Expand the range into separate elements for each item.
+ my $offset = 0;
+ for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
+ push @expanded_list, $j;
+ push @expanded_map, $invmap_ref->[$i] + $offset;
+
+ # The 'ae' format is for Legacy_Perl_Decimal_Digit; the
+ # other 4 are kept with leading zeros in the file, so
+ # convert to that.
+ $expanded_map[-1] = sprintf("%04X", $expanded_map[-1])
+ if $format ne 'ae';
+ $offset++;
+ }
+ }
+ }
+
+ # Final element is taken as is. The map should always be to the
+ # default value, so don't do a sprintf like we did above.
+ push @expanded_list, $invlist_ref->[-1];
+ push @expanded_map, $invmap_ref->[-1];
+
+ $invlist_ref = \@expanded_list;
+ $invmap_ref = \@expanded_map;
+ }
+
# If have already tested this property under a different name, merely
# compare the return from now with the saved one from before.
if (exists $tested_invmaps{$name}) {
- is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'");
+ is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$display_prop') gave same results as its synonym, '$name'");
next PROPERTY;
}
$tested_invmaps{$name} = dclone $return_ref;
# not generated.
if ($suppressed) {
if (defined $format) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("did not return undef for suppressed property $prop");
}
next PROPERTY;
}
elsif (!defined $format) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("'$prop' is unknown to prop_invmap()");
next PROPERTY;
}
# The two parallel arrays must have the same number of elements.
if (@$invlist_ref != @$invmap_ref) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("invlist has "
. scalar @$invlist_ref
. " while invmap has "
# The last element must be for the above-Unicode code points, and must be
# for the default value.
if ($invlist_ref->[-1] != 0x110000) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The last inversion list element is not 0x110000");
next PROPERTY;
}
- if ($invmap_ref->[-1] ne $missing) {
- fail("prop_invmap('$mod_prop')");
+
+ my $upper_limit_subtract;
+
+ # prop_invmap() adds an extra element not present in the disk files for
+ # the above-Unicode code points. For almost all properties, that will be
+ # to $missing. In that case we don't look further at it when comparing
+ # with the disk files.
+ if ($invmap_ref->[-1] eq $missing) {
+ $upper_limit_subtract = 1;
+ }
+ elsif ($invmap_ref->[-1] eq 'Y' && ! grep { $_ !~ /[YN]/ } @$invmap_ref) {
+
+ # But that's not true for a few binary properties like 'Unassigned'
+ # that are Perl extensions (in this case for Gc=Unassigned) which
+ # match above-Unicode code points (hence the 'Y' in the test above).
+ # For properties where it isn't $missing, we're going to want to look
+ # at the whole thing when comparing with the disk file.
+ $upper_limit_subtract = 0;
+
+ # In those properties like 'Unassigned', the final element should be
+ # just a repetition of the next-to-last element, and won't be in the
+ # disk file, so remove it for the comparison. Otherwise, we will
+ # compare the whole of the array with the whole of the disk file.
+ if ($invlist_ref->[-2] <= 0x10FFFF && $invmap_ref->[-2] eq 'Y') {
+ pop @$invlist_ref;
+ pop @$invmap_ref;
+ }
+ }
+ else {
+ fail("prop_invmap('$display_prop')");
diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'");
next PROPERTY;
}
if ($name eq 'bmg') { # This one has an atypical $missing
if ($missing ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should be \"\"; got '$missing'");
next PROPERTY;
}
}
- elsif ($format =~ /^ [cd] /x) {
- if ($missing ne "<code point>") {
- fail("prop_invmap('$mod_prop')");
- diag("The missings should be '<code point>'; got '$missing'");
+ elsif ($format =~ /^ a (?!r) /x) {
+ if ($full_name eq 'Perl_Decimal_Digit') {
+ if ($missing ne "") {
+ fail("prop_invmap('$display_prop')");
+ diag("The missings should be \"\"; got '$missing'");
+ next PROPERTY;
+ }
+ }
+ elsif ($missing ne "0" && ! grep { $prop eq $_ } keys %legacy_props) {
+ fail("prop_invmap('$display_prop')");
+ diag("The missings should be '0'; got '$missing'");
next PROPERTY;
}
}
elsif ($missing =~ /[<>]/) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should NOT be something with <...>'");
next PROPERTY;
# We captured the U, L, or T, leading to uc, lc, or tc.
$proxy_prop = lc $1 . "c";
}
- if ($format ne "c") {
- fail("prop_invmap('$mod_prop')");
- diag("The format should be 'c'; got '$format'");
+ if ($format ne "a") {
+ fail("prop_invmap('$display_prop')");
+ diag("The format should be 'a'; got '$format'");
next PROPERTY;
}
}
+ if ($format !~ / ^ (?: a [der]? | ale? | n | sl? ) $ /x) {
+ fail("prop_invmap('$display_prop')");
+ diag("Unknown format '$format'");
+ next PROPERTY;
+ }
+
my $base_file;
my $official;
{
# Translate the charblocks() data structure to what the file
# would like.
- $official .= sprintf"%04X\t%04X\t%s\n",
+ $official .= sprintf"%X\t%X\t%s\n",
$range->[0][0],
$range->[0][1],
$range->[0][2];
}
}
else {
- $base_file = "Decomposition" if $format eq 'd';
+ $base_file = "Decomposition" if $format eq 'ad';
# Above leaves $base_file undefined only if it came from the hash
# below. This should happen only when it is a binary property
# property comes along without these characteristics
if (!defined $base_file) {
$base_file = $utf8::loose_to_file_of{$proxy_prop};
- $is_binary = ($base_file =~ s/^!//) ? -1 : 1;
- $base_file = "lib/$base_file";
+ $is_binary = ($base_file =~ s/!//) ? -1 : 1;
+ $base_file = "lib/$base_file" unless $base_file =~ m!^#/!;
}
- # Read in the file
- $file = "unicore/$base_file.pl";
- $official = do $file;
+ # Read in the file. If the file's directory is '#', it is a
+ # special case where the contents are in-lined with semi-colons
+ # meaning new-lines, instead of it being an actual file to read.
+ if ($base_file =~ s!^#/!!) {
+ $official = $utf8::inline_definitions[$base_file];
+ }
+ else {
+ $official = do "unicore/$base_file.pl";
+ }
# Get rid of any trailing space and comments in the file.
$official =~ s/\s*(#.*)?$//mg;
- # Decomposition.pl also has the <compatible> types in it, which
- # should be removed.
- $official =~ s/<.*?> //mg if $format eq 'd';
+ if ($format eq 'ad') {
+ my @official = split /\n/, $official;
+ $official = "";
+ foreach my $line (@official) {
+ my ($start, $end, $value)
+ = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
+ \s* ( \# .* )? $ /x;
+ # Decomposition.pl also has the <compatible> types in it,
+ # which should be removed.
+ $value =~ s/<.*?> //;
+ $official .= "$start\t\t$value\n";
+
+ # If this is a multi-char range, we turn it into as many
+ # single character ranges as necessary. This makes things
+ # easier below.
+ if ($end ne "") {
+ for my $i (hex($start) + 1 .. hex $end) {
+ $official .= sprintf "%X\t\t%s\n", $i, $value;
+ }
+ }
+ }
+ }
}
+ local $/ = "\n";
chomp $official;
-
- # If there are any special elements, get a reference to them.
- my $specials_ref = $utf8::file_to_swash_name{$base_file};
- if ($specials_ref) {
- $specials_ref = $utf8::SwashInfo{$specials_ref}{'specials_name'};
+ $/ = $input_record_separator;
+
+ # Get the format for the file, and if there are any special elements,
+ # get a reference to them.
+ my $swash_name = $utf8::file_to_swash_name{$base_file};
+ my $specials_ref;
+ my $file_format; # The 'format' given inside the file
+ if ($swash_name) {
+ $specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'};
if ($specials_ref) {
# Convert from the name to the actual reference.
no strict 'refs';
$specials_ref = \%{$specials_ref};
}
+
+ $file_format = $utf8::SwashInfo{$swash_name}{'format'};
}
+ # Leading zeros used to be used with the values in the files that give
+ # ranges, but these have been mostly stripped off, except for some
+ # files whose formats should not change in any way.
+ my $file_range_format = (grep { $full_name eq $_ } @legacy_file_format)
+ ? "%04X"
+ : "%X";
+ # Currently this property still has leading zeroes in the mapped-to
+ # values, but otherwise, those values follow the same rules as the
+ # ranges.
+ my $file_map_format = ($full_name eq 'Decomposition_Mapping')
+ ? "%04X"
+ : $file_range_format;
+
# Certain of the proxy properties have to be adjusted to match the
# real ones.
- if (($proxy_prop ne $name && $full_name =~ 'Mapping')
- || $full_name eq 'Case_Folding')
+ if ($full_name
+ =~ /^(Legacy_)?(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
{
# Here we have either
my @list;
for (split "\n", $official) {
my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
- \s* ( \# .* )?
- $ /x;
+ \s* ( \# .* )? $ /x;
$end = $start if $end eq "";
- if ($end ne $start) {
- fail("prop_invmap('$mod_prop')");
- diag("This test is expecting only single code point ranges in $file.pl");
- next PROPERTY;
- }
- push @list, [ hex $start, $value ];
+ push @list, [ hex $start, hex $end, hex $value ];
}
- # For Case_Folding, the file contains all the simple mappings,
+ # For these mappings, the file contains all the simple mappings,
# including the ones that are overridden by the specials. These
- # need to be removed as the list is for just the full ones. For
- # the other files, the proxy is missing the simple mappings that
- # are overridden by the specials, so we need to add them.
-
- # For the missing simples, we get the correct values by calling
- # charinfo(). Set up which element of the hash returned by
- # charinfo to look at
- my $charinfo_element;
- if ($full_name =~ / ^ Simple_ (Lower | Upper | Title) case_Mapping/x)
- {
- $charinfo_element = lc $1; # e.g. Upper is referred to by the
- # key 'upper' in the charinfo()
- # returned hash
- }
+ # need to be removed as the list is for just the full ones.
# Go through any special mappings one by one. They are packed.
my $i = 0;
foreach my $utf8_cp (sort keys %$specials_ref) {
my $cp = unpack("C0U", $utf8_cp);
- # Get what the simple value for this should be; either nothing
- # for Case_Folding, or what charinfo returns for the others.
- my $simple = ($full_name eq "Case_Folding")
- ? ""
- : charinfo($cp)->{$charinfo_element};
-
- # And create an entry to add to the list, if appropriate
- my $replacement;
- $replacement = [ $cp, $simple ] if $simple ne "";
-
# Find the spot in the @list of simple mappings that this
# special applies to; uses a linear search.
while ($i < @list -1 ) {
- last if $cp <= $list[$i][0];
+ last if $cp <= $list[$i][1];
$i++;
}
- #note $i-1 . ": " . join " => ", @{$list[$i-1]};
- #note $i-0 . ": " . join " => ", @{$list[$i-0]};
- #note $i+1 . ": " . join " => ", @{$list[$i+1]};
+ # Here $i is such that it points to the first range which ends
+ # at or above cp, and hence is the only range that could
+ # possibly contain it.
- if (! defined $replacement) {
+ # If not in this range, no range contains it: nothing to
+ # remove.
+ next if $cp < $list[$i][0];
- # Here, are to remove any existing entry for this code
- # point.
- next if $cp != $list[$i][0];
- splice @list, $i, 1;
- }
- elsif ($cp == $list[$i][0]) {
+ # Otherwise, remove the existing entry. If it is the first
+ # element of the range...
+ if ($cp == $list[$i][0]) {
- # Here, are to add something, but there is an existing
- # entry, so this just replaces it.
- $list[$i] = $replacement;
- }
- else {
+ # ... and there are other elements in the range, just
+ # shorten the range to exclude this code point.
+ if ($list[$i][1] > $list[$i][0]) {
+ $list[$i][0]++;
+ }
- # Here, are to add something, and there isn't an existing
- # entry.
- splice @list, $i, 0, $replacement;
+ # ... but if it is the only element in the range, remove
+ # it entirely.
+ else {
+ splice @list, $i, 1;
+ }
+ }
+ else { # Is somewhere in the middle of the range
+ # Split the range into two, excluding this one in the
+ # middle
+ splice @list, $i, 1,
+ [ $list[$i][0], $cp - 1, $list[$i][2] ],
+ [ $cp + 1, $list[$i][1], $list[$i][2] ];
}
-
- #note __LINE__ . ": $cp";
- #note $i-1 . ": " . join " => ", @{$list[$i-1]};
- #note $i-0 . ": " . join " => ", @{$list[$i-0]};
- #note $i+1 . ": " . join " => ", @{$list[$i+1]};
}
# Here, have gone through all the specials, modifying @list as
# needed. Turn it back into what the file should look like.
- $official = join "\n", map { sprintf "%04X\t\t%s", @$_ } @list;
-
- # And, no longer need the specials for the simple mappings, as are
- # all incorporated into $official
- undef $specials_ref if $full_name ne 'Case_Folding';
+ $official = "";
+ for my $element (@list) {
+ $official .= "\n" if $official;
+ if ($element->[1] == $element->[0]) {
+ $official
+ .= sprintf "$file_range_format\t\t$file_map_format",
+ $element->[0], $element->[2];
+ }
+ else {
+ $official .= sprintf "$file_range_format\t$file_range_format\t$file_map_format",
+ $element->[0],
+ $element->[1],
+ $element->[2];
+ }
+ }
}
- elsif ($full_name eq 'Simple_Case_Folding') {
+ elsif ($full_name
+ =~ / ^ Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping) $ /x)
+ {
- # This property has everything in the regular array, and the
+ # These properties have everything in the regular array, and the
# specials are superfluous.
- undef $specials_ref if $full_name ne 'Case_Folding';
+ undef $specials_ref;
+ }
+ elsif ($format !~ /^a/ && defined $file_format && $file_format eq 'x') {
+
+ # For these properties the file is output using hex notation for the
+ # map. Convert from hex to decimal.
+ my @lines = split "\n", $official;
+ foreach my $line (@lines) {
+ my ($lower, $upper, $map) = split "\t", $line;
+ $line = "$lower\t$upper\t" . hex $map;
+ }
+ $official = join "\n", @lines;
}
- # Here, in $official, we have what the file looks like, or should like
+ # Here, in $official, we have what the file looks like, or should look like
# appends the next line to the running string.
my $tested_map = "";
+ # For use with files for binary properties only, which are stored in
+ # inversion list format. This counts the number of data lines in the
+ # file.
+ my $binary_count = 0;
+
# Create a copy of the file's specials hash. (It has been undef'd if
# we know it isn't relevant to this property, so if it exists, it's an
# error or is relevant). As we go along, we delete from that copy.
# it's an error
my %specials = %$specials_ref if $specials_ref;
- # The extra -1 is because the final element has been tested above to
- # be for anything above Unicode. The file doesn't go that high.
- for my $i (0 .. @$invlist_ref - 1 - 1) {
+ # The extra -$upper_limit_subtract is because the final element may
+ # have been tested above to be for anything above Unicode, in which
+ # case the file may not go that high.
+ for (my $i = 0; $i < @$invlist_ref - $upper_limit_subtract; $i++) {
# If the map element is a reference, have to stringify it (but
# don't do so if the format doesn't allow references, so that an
# improper format will generate an error.
if (ref $invmap_ref->[$i]
- && ($format eq 'd' || $format =~ /^ . l /x))
+ && ($format eq 'ad' || $format =~ /^ . l /x))
{
- # The stringification depends on the format. At the time of
- # this writing, all 'sl' formats are space separated.
+ # The stringification depends on the format.
if ($format eq 'sl') {
- $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]};
+
+ # At the time of this writing, there are two types of 'sl'
+ # format.  One, in Name_Alias, has multiple separate
+ # entries for each code point; the other, in
+ # Script_Extension, is space separated. Assume the latter
+ # for non-Name_Alias.
+ if ($full_name ne 'Name_Alias') {
+ $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]};
+ }
+ else {
+ # For Name_Alias, we emulate the file. Entries with
+ # just one value don't need any changes, but we
+ # convert the list entries into a series of lines for
+ # the file, starting with the first name. The
+ # succeeding entries are on separate lines, with the
+ # code point repeated for each one and then two tabs,
+ # then the value. Code at the end of the loop will
+ # set up the first line with its code point and two
+ # tabs before the value, just as it does for every
+ # other property; thus the special handling of the
+ # first line.
+ if (ref $invmap_ref->[$i]) {
+ my $hex_cp = sprintf("%X", $invlist_ref->[$i]);
+ my $concatenated = $invmap_ref->[$i][0];
+ for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) {
+ $concatenated .= "\n$hex_cp\t\t"
+ . $invmap_ref->[$i][$j];
+ }
+ $invmap_ref->[$i] = $concatenated;
+ }
+ }
}
- elsif ($format =~ / ^ cl e? $/x) {
+ elsif ($format =~ / ^ al e? $/x) {
- # For a cl property, the stringified result should be in
+ # For an al property, the stringified result should be in
# the specials hash. The key is the packed code point,
# and the value is the packed map.
my $value;
- if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
- fail("prop_invmap('$mod_prop')");
+ if (! defined ($value = delete $specials{pack("C0U",
+ $invlist_ref->[$i]) }))
+ {
+ fail("prop_invmap('$display_prop')");
diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
next PROPERTY;
}
my $packed = pack "U*", @{$invmap_ref->[$i]};
if ($value ne $packed) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
- diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'");
+ # Pass the code point for %04X, and feed the two maps through %s so
+ # that a '%' inside either one can't be taken as a conversion.
+ diag(sprintf "For %04X, expected the mapping to be '%s', but got '%s'", $invlist_ref->[$i], $packed, $value);
next PROPERTY;
}
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
next;
}
- elsif ($format eq 'd') {
+ elsif ($format eq 'ad') {
# The decomposition mapping file has the code points as
# a string of space-separated hex constants.
- $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]};
+ $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ }
+ @{$invmap_ref->[$i]};
}
else {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Can't handle format '$format'");
next PROPERTY;
}
+ } # Otherwise, the map is to a simple scalar
+ elsif (defined $file_format && $file_format eq 'ax') {
+ # These maps are in hex
+ $invmap_ref->[$i] = sprintf("%X", $invmap_ref->[$i]);
}
- elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") {
-
- # cle properties have maps to the empty string that also
- # should be in the specials hash, with the key the packed code
- # point, and the map just empty.
- my $value;
- if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
- fail("prop_invmap('$mod_prop')");
- diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
- next PROPERTY;
- }
- if ($value ne "") {
- fail("prop_invmap('$mod_prop')");
- diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]);
- next PROPERTY;
+ elsif ($format eq 'ad' || $format eq 'ale') {
+
+ # The numerics in the returned map are stored as adjusted
+ # decimal integers. The defaults are 0, and don't appear in
+ # $official, and are excluded later, but the elements must be
+ # converted back to their hex values before comparing with
+ # $official, as these files, for backwards compatibility, are
+ # not stored as adjusted. (There currently is only one ale
+ # property, nfkccf. If that changed this would also have to.)
+ if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
+ && $invmap_ref->[$i] != 0)
+ {
+ my $next = $invmap_ref->[$i] + 1;
+ $invmap_ref->[$i] = sprintf($file_map_format,
+ $invmap_ref->[$i]);
+
+ # If there are other elements in this range they need to
+ # be adjusted; they must individually be re-mapped. Do
+ # this by splicing in a new element into the list and the
+ # map containing the remainder of the range. Next time
+ # through we will look at that (possibly splicing again
+ # until the whole range is processed).
+ if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) {
+ splice @$invlist_ref, $i+1, 0,
+ $invlist_ref->[$i] + 1;
+ splice @$invmap_ref, $i+1, 0, $next;
+ }
}
+ if ($format eq 'ale' && $invmap_ref->[$i] eq "") {
- # As this doesn't get tested when we later compare with
- # the actual file, it could be out of order and we
- # wouldn't know it.
- if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
- || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
- {
- fail("prop_invmap('$mod_prop')");
- diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
- next PROPERTY;
+ # ale properties have maps to the empty string that also
+ # should be in the specials hash, with the key the packed
+ # code point, and the map just empty.
+ my $value;
+ if (! defined ($value = delete $specials{pack("C0U",
+ $invlist_ref->[$i]) }))
+ {
+ fail("prop_invmap('$display_prop')");
+ diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ if ($value ne "") {
+ fail("prop_invmap('$display_prop')");
+ diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+
+ # As this doesn't get tested when we later compare with
+ # the actual file, it could be out of order and we
+ # wouldn't know it.
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+ {
+ fail("prop_invmap('$display_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
+ }
+ next;
}
- next;
}
elsif ($is_binary) { # These binary files don't have an explicit Y
$invmap_ref->[$i] =~ s/Y//;
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
next;
}
- # 'c'-type and 'd' properties have the mapping expressed in hex in
- # the file
- if ($format =~ /^ [cd] /x) {
-
- # The d property has one entry which isn't in the file.
- # Ignore it, but make sure it is in order.
- if ($format eq 'd'
- && $invmap_ref->[$i] eq '<hangul syllable>'
- && $invlist_ref->[$i] == 0xAC00)
+ # The ad property has one entry which isn't in the file.
+ # Ignore it, but make sure it is in order.
+ if ($format eq 'ad'
+ && $invmap_ref->[$i] eq '<hangul syllable>'
+ && $invlist_ref->[$i] == 0xAC00)
+ {
+ if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+ || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
- || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
- {
- fail("prop_invmap('$mod_prop')");
- diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
- next PROPERTY;
- }
- next;
+ fail("prop_invmap('$display_prop')");
+ diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+ next PROPERTY;
}
- $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i])
- if $invmap_ref->[$i] =~ / ^ [A-Fa-f0-9]+ $/x;
+ next;
}
# Finally have figured out what the map column in the file should
# be. Append the line to the running string.
my $start = $invlist_ref->[$i];
- my $end = $invlist_ref->[$i+1] - 1;
- $end = ($start == $end) ? "" : sprintf("%04X", $end);
- if ($invmap_ref->[$i] ne "") {
- $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i];
- }
- elsif ($end ne "") {
- $tested_map .= sprintf "%04X\t%s\n", $start, $end;
+ my $end = (defined $invlist_ref->[$i+1])
+ ? $invlist_ref->[$i+1] - 1
+ : $Unicode::UCD::MAX_CP;
+ if ($is_binary) {
+
+ # Files for binary properties are in inversion list format,
+ # without ranges.
+ $tested_map .= "$start\n";
+ $binary_count++;
+
+ # If the final value is infinity, no line for it exists.
+ if ($end < $Unicode::UCD::MAX_CP) {
+ $tested_map .= ($end + 1) . "\n";
+ $binary_count++;
+ }
}
else {
- $tested_map .= sprintf "%04X\n", $start;
+ $end = ($start == $end) ? "" : sprintf($file_range_format, $end);
+ if ($invmap_ref->[$i] ne "") {
+ $tested_map .= sprintf "$file_range_format\t%s\t%s\n",
+ $start, $end, $invmap_ref->[$i];
+ }
+ elsif ($end ne "") {
+ $tested_map .= sprintf "$file_range_format\t%s\n",
+ $start, $end;
+ }
+ else {
+ $tested_map .= sprintf "$file_range_format\n", $start;
+ }
}
} # End of looping over all elements.
+ # Binary property files begin with a line count line.
+ $tested_map = "V$binary_count\n$tested_map" if $binary_count;
+
# Here are done with generating what the file should look like
+ local $/ = "\n";
chomp $tested_map;
+ $/ = $input_record_separator;
# And compare.
if ($tested_map ne $official) {
- fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ fail_with_diff($display_prop, $official, $tested_map, "prop_invmap");
next PROPERTY;
}
# There shouldn't be any specials unaccounted for.
if (keys %specials) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected specials: " . join ", ", keys %specials);
next PROPERTY;
}
# Handle the Name property similar to the above. But the file is
# sufficiently different that it is more convenient to make a special
- # case for it.
+ # case for it. It is a combination of the Name, Unicode1_Name, and
+ # Name_Alias properties, and named sequences. We need to remove all
+ # but the Name in order to do the comparison.
if ($missing ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
- diag("The missings should be \"\"; got \"missing\"");
+ # Show the unexpected default that was actually returned, rather than
+ # the literal word "missing".
+ diag("The missings should be \"\"; got \"$missing\"");
next PROPERTY;
}
$official =~ s/ ^ [^\t]+ \ .*? \n //xmg;
# And get rid of the controls. These are named in the file, but
- # shouldn't be in the property.
+ # shouldn't be in the property. This gets rid of the two ranges in
+ # one fell swoop, and also all the Unicode1_Name values that may not
+ # be in Name_Alias.
$official =~ s/ 00000 \t .* 0001F .*? \n//xs;
$official =~ s/ 0007F \t .* 0009F .*? \n//xs;
- # This is slow; it gets rid of the aliases. We look for lines that
- # are for the same code point as the previous line. The previous line
- # will be a name_alias; and the current line will be the name. Get
- # rid of the name_alias line. This won't work if there are multiple
- # aliases for a given name.
- my @temp_names = split "\n", $official;
- my $previous_cp = "";
- for (my $i = 0; $i < @temp_names - 1; $i++) {
- $temp_names[$i] =~ /^ (.*)? \t /x;
- my $current_cp = $1;
- if ($current_cp eq $previous_cp) {
- splice @temp_names, $i - 1, 1;
- redo;
- }
- else {
- $previous_cp = $current_cp;
+ # And remove the aliases. We read in the Name_Alias property, and go
+ # through them one by one.
+ my ($aliases_code_points, $aliases_maps, undef, undef)
+ = &prop_invmap('Name_Alias');
+ for (my $i = 0; $i < @$aliases_code_points; $i++) {
+ my $code_point = $aliases_code_points->[$i];
+
+ # Already removed these above.
+ next if $code_point <= 0x1F
+ || ($code_point >= 0x7F && $code_point <= 0x9F);
+
+ my $hex_code_point = sprintf "%05X", $code_point;
+
+ # Convert to a list if not already to make the following loop
+ # control uniform.
+ $aliases_maps->[$i] = [ $aliases_maps->[$i] ]
+ if ! ref $aliases_maps->[$i];
+
+ # Remove each alias for this code point from the file
+ foreach my $alias (@{$aliases_maps->[$i]}) {
+
+ # Remove the alias type from the entry, retaining just the name.
+ $alias =~ s/:.*//;
+
+ $alias = quotemeta($alias);
+ $official =~ s/$hex_code_point \t $alias \n //x;
}
}
- $official = join "\n", @temp_names;
- undef @temp_names;
+ local $/ = "\n";
chomp $official;
+ $/ = $input_record_separator;
# Here have adjusted the file. We also have to adjust the returned
# inversion map by checking and deleting all the lines in it that
my @code_point_in_names =
@Unicode::UCD::code_points_ending_in_code_point;
- for my $i (0 .. @$invlist_ref - 1 - 1) {
+ for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
my $start = $invlist_ref->[$i];
my $end = $invlist_ref->[$i+1] - 1;
if ($invmap_ref->[$i] eq $missing) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
if ($type eq "<hangul syllable>") {
if ($name ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected text in $invmap_ref->[$i]");
next PROPERTY;
}
if ($start != 0xAC00) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start));
next PROPERTY;
}
if ($end != $start + 11172 - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end));
next PROPERTY;
}
}
elsif ($type ne "<code point>") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected text '$type' in $invmap_ref->[$i]");
next PROPERTY;
}
last;
}
else {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'");
next PROPERTY;
}
# Finished creating the string from the inversion map. Can compare
# with what the file is.
+ local $/ = "\n";
chomp $tested_map;
+ $/ = $input_record_separator;
if ($tested_map ne $official) {
- fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ fail_with_diff($display_prop, $official, $tested_map, "prop_invmap");
next PROPERTY;
}
if (@code_point_in_names) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
use Data::Dumper;
diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names);
next PROPERTY;
}
}
- elsif ($format eq 's' || $format eq 'r') {
+ elsif ($format eq 's') {
# Here the map is not more or less directly from a file stored on
# disk. We try a different tack. These should all be properties that
my %maps;
my $previous_map;
- # (The extra -1 is to not look at the final element in the loop, which
- # we know is the one that starts just beyond Unicode and goes to
- # infinity.)
- for my $i (0 .. @$invlist_ref - 1 - 1) {
+ for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
my $range_start = $invlist_ref->[$i];
# Because we are sorting into buckets, things could be
if (($i > 0 && $range_start <= $invlist_ref->[$i-1])
|| $range_start >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
# through each and verify that matches what prop_invlist() returns.
# We could use is_deeply() for the comparison, but would get multiple
# messages for each $prop.
- foreach my $map (keys %maps) {
+ foreach my $map (sort keys %maps) {
my @off_invlist = prop_invlist("$prop = $map");
my $min = (@off_invlist >= @{$maps{$map}})
? @off_invlist
: @{$maps{$map}};
for my $i (0 .. $min- 1) {
if ($i > @off_invlist - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'");
next PROPERTY;
}
elsif ($i > @{$maps{$map}} - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'");
next PROPERTY;
}
elsif ($maps{$map}[$i] ne $off_invlist[$i]) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'");
next PROPERTY;
}
}
else { # Don't know this property nor format.
- fail("prop_invmap('$mod_prop')");
- diag("Unknown format '$format'");
+ fail("prop_invmap('$display_prop')");
+ diag("Unknown property '$display_prop' or format '$format'");
+ next PROPERTY;
}
- pass("prop_invmap('$mod_prop')");
+ pass("prop_invmap('$display_prop')");
+}
+
+# Spot checks of search_invlist(): one lookup that must land in a known
+# script range, and one for a code point below the start of the list.
+use Unicode::UCD qw(search_invlist);
+
+my ($script_starts, $script_names) = prop_invmap("Script");
+my $greek_slot = search_invlist($script_starts, 0x390);
+is($script_names->[$greek_slot], "Greek", "U+0390 is Greek");
+my @alphabetic_list = prop_invlist("Alpha");
+is(search_invlist(\@alphabetic_list, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list");
+
+# Nothing exercised above may have clobbered the caller's $/ ...
+ok($input_record_separator eq $/, "The record separator didn't get overridden");
+
+# ... nor raised any warning; list them all if some were captured.
+unless (ok(@warnings == 0, "No warnings were generated")) {
+ diag(join("\n", "The warnings are:", @warnings));
}
done_testing();