From 34132297113975a3522f23d745e0ccf336803994 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 30 Jan 2012 18:17:11 -0700 Subject: [PATCH 1/1] Unicode::UCD::prop_invmap(): Make the NFKCCF property return deltas The file for this property is stored in the old-style format for backward compatibility with any applications that might be reading it directly. But the values should be returned through the Unicode::UCD API as deltas for consistency with other, similar properties. --- lib/Unicode/UCD.pm | 46 +++++++++++++++++++++++++++++++++++++--------- lib/Unicode/UCD.t | 27 +++++++++++---------------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index de62e50..3473ecb 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -2398,20 +2398,18 @@ that are lists, and the addition is extra work. =item B> -is like C except that, for the time being, as an interim measure, the map -returned for simple scalars is the correct value and the code point should NOT -be added to it. Also, some of the map array elements have the forms given by C, and +means that some of the map array elements have the forms given by C, and the rest are the empty string. The property C has this form. An example slice is: @$ranges_ref @$maps_ref Note ... - 0x00AA 0x0061 FEMININE ORDINAL INDICATOR => 'a' - 0x00AB + 0x00AA -73 FEMININE ORDINAL INDICATOR => 'a' + 0x00AB 0 0x00AD SOFT HYPHEN => "" - 0x00AE + 0x00AE 0 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON - 0x00B0 + 0x00B0 0 ... =item B> @@ -2576,8 +2574,9 @@ RETRY: # new-style, and this routine is supposed to return old-style block names. # The Name table is valid, but we need to execute the special code below # to add in the algorithmic-defined name entries. + # And NFKCCF needs conversion, so handle that here too. if (ref $swash eq "" - || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na) $ /x) + || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x) { # Get the short name of the input property, in standard form @@ -2798,6 +2797,35 @@ RETRY: } $swash = \%decomps; } + elsif ($second_try eq 'nfkccf') { + + # This property is stored in the old format for backwards + # compatibility for any applications that read its file directly. + # So here we convert it to delta format for compatibility with the + # other properties similar to it. + my %nfkccf; + + # Create a new LIST with deltas instead of code points. + my $list = ""; + foreach my $range (split "\n", $swash->{'LIST'}) { + my ($hex_begin, $hex_end, $map) = split "\t", $range; + my $begin = hex $hex_begin; + my $end = (defined $hex_end && $hex_end ne "") + ? hex $hex_end + : $begin; + my $decimal_map = hex $map; + foreach my $code_point ($begin .. $end) { + $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point); + } + } + + $nfkccf{'LIST'} = $list; + $nfkccf{'TYPE'} = "ToNFKCCF"; + $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'}; + $swash = \%nfkccf; + $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0; + $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i'; + } else { # Don't know this property. Fail. return; } @@ -2809,7 +2837,7 @@ RETRY: } # Here, have a valid swash return. Examine it. - my $returned_prop = $swash->{TYPE}; + my $returned_prop = $swash->{'TYPE'}; # All properties but binary ones should have 'missing' and 'format' # entries diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 45573de..530c548 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -1259,13 +1259,6 @@ foreach my $prop (keys %props) { next PROPERTY; } } - elsif ($name eq 'nfkccf') { # This one has an atypical $missing - if ($missing ne "") { - fail("prop_invmap('$mod_prop')"); - diag("The missings should be \"\"; got '$missing'"); - next PROPERTY; - } - } elsif ($format =~ /^ c /x) { if ($missing ne "0") { fail("prop_invmap('$mod_prop')"); @@ -1619,13 +1612,15 @@ foreach my $prop (keys %props) { next PROPERTY; } } - elsif ($format eq 'd') { - - # The numerics in the map are stored as deltas. The defaults - # are 0, and don't appear in $official, and are excluded - # later, but the elements must be converted back to their real - # code point values before comparing with $official, as that - # file, for backwards compatibility, is not stored as deltas + elsif ($format eq 'd' || $format eq 'cle') { + + # The numerics in the returned map are stored as deltas. The + # defaults are 0, and don't appear in $official, and are + # excluded later, but the elements must be converted back to + # their real code point values before comparing with + # $official, as these files, for backwards compatibility, are + # not stored as deltas. (There currently is only one cle + # property, nfkccf. If that changed this would also have to.) if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x && $invmap_ref->[$i] != 0) { @@ -1644,8 +1639,7 @@ foreach my $prop (keys %props) { splice @$invmap_ref, $i+1, 0, $delta; } } - } - elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") { + if ($format eq 'cle' && $invmap_ref->[$i] eq "") { # cle properties have maps to the empty string that also # should be in the specials hash, with the key the packed code @@ -1673,6 +1667,7 @@ foreach my $prop (keys %props) { next PROPERTY; } next; + } } elsif ($is_binary) { # These binary files don't have an explicit Y $invmap_ref->[$i] =~ s/Y//; -- 1.8.3.1