From 06c8fc8f09dc8f7e52006b1a902e84e1587b786f Mon Sep 17 00:00:00 2001 From: Rafael Garcia-Suarez Date: Sun, 7 Sep 2003 19:12:05 +0000 Subject: [PATCH] Upgrade to Unicode::Collate 0.28 p4raw-id: //depot/perl@21064 --- MANIFEST | 1 + lib/Unicode/Collate.pm | 84 +++++++++++++++------- lib/Unicode/Collate/Changes | 10 ++- lib/Unicode/Collate/README | 2 +- lib/Unicode/Collate/t/contract.t | 146 +++++++++++++++++++++++++++++++++++++++ lib/Unicode/Collate/t/test.t | 12 ++-- 6 files changed, 221 insertions(+), 34 deletions(-) create mode 100644 lib/Unicode/Collate/t/contract.t diff --git a/MANIFEST b/MANIFEST index eecaacd..e55b15b 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1729,6 +1729,7 @@ lib/Unicode/Collate/Changes Unicode::Collate lib/Unicode/Collate/keys.txt Unicode::Collate lib/Unicode/Collate.pm Unicode::Collate lib/Unicode/Collate/README Unicode::Collate +lib/Unicode/Collate/t/contract.t Unicode::Collate lib/Unicode/Collate/t/hangul.t Unicode::Collate lib/Unicode/Collate/t/index.t Unicode::Collate lib/Unicode/Collate/t/test.t Unicode::Collate diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 2bcc315..18ed446 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; require Exporter; -our $VERSION = '0.27'; +our $VERSION = '0.28'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -53,7 +53,7 @@ use constant NOMATCHPOS => -1; # A coderef to get combining class imported from Unicode::Normalize # (i.e. \&Unicode::Normalize::getCombinClass). # This is also used as a HAS_UNICODE_NORMALIZE flag. -our $getCombinClass; +our $CVgetCombinClass; # Supported Levels use constant MinLevel => 1; @@ -225,17 +225,16 @@ sub checkCollator { croak "Unicode/Normalize.pm is required to normalize strings: $@" if $@; - $getCombinClass = \&Unicode::Normalize::getCombinClass - if ! $getCombinClass; + $CVgetCombinClass = \&Unicode::Normalize::getCombinClass + if ! $CVgetCombinClass; - my $norm = $self->{normalization}; - $self->{normCode} = sub { + if ($self->{normalization} ne 'prenormalized') { + my $norm = $self->{normalization}; + $self->{normCode} = sub { Unicode::Normalize::normalize($norm, shift); }; - - eval { $self->{normCode}->("") }; # try - if ($@) { - croak "$PACKAGE unknown normalization form name: $norm"; + eval { $self->{normCode}->("") }; # try + $@ and croak "$PACKAGE unknown normalization form name: $norm"; } } return; @@ -261,7 +260,7 @@ sub new if ! exists $self->{overrideHangul}; $self->{overrideCJK} = '' if ! exists $self->{overrideCJK}; - $self->{normalization} = 'D' + $self->{normalization} = 'NFD' if ! exists $self->{normalization}; $self->{alternate} = $self->{alternateTable} || 'shifted' if ! exists $self->{alternate}; @@ -490,19 +489,31 @@ sub splitCE $i = $p; } } - } - # with Combining Char (UTS#10, 4.2.1). - # requires Unicode::Normalize. - # Not be $wLen, as not croaked due to $norm. - if ($getCombinClass) { - for (my $p = $i + 1; $p < @src; $p++) { - next if ! defined $src[$p]; - last unless $getCombinClass->($src[$p]); - my $tail = CODE_SEP . $src[$p]; - if ($ent->{$ce.$tail}) { - $ce .= $tail; - $src[$p] = undef; + # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). + # This process requires Unicode::Normalize. + # If "normalize" is undef, here should be skipped *always* + # (in spite of bool value of $CVgetCombinClass), + # since canonical ordering cannot be expected. + # Blocked combining character should not be contracted. + + if ($self->{normalization}) + # $self->{normCode} is false in the case of "prenormalized". + { + my $preCC = 0; + my $curCC = 0; + + for (my $p = $i + 1; $p < @src; $p++) { + next if ! defined $src[$p]; + $curCC = $CVgetCombinClass->($src[$p]); + last unless $curCC; + my $tail = CODE_SEP . $src[$p]; + if ($preCC != $curCC && $ent->{$ce.$tail}) { + $ce .= $tail; + $src[$p] = undef; + } else { + $preCC = $curCC; + } } } } @@ -1128,16 +1139,37 @@ If specified, strings are normalized before preparation of sort keys A form name C accepts will be applied as C<$normalization_form>. +Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. See C for detail. If omitted, C<'NFD'> is used. L is performed after L (if defined). -If C is passed explicitly as the value for this key, +Furthermore, special values, C and C<"prenormalized">, can be used, +though they are not concerned with C. + +If C (not a string C<"undef">) is passed explicitly +as the value for this key, any normalization is not carried out (this may make tailoring easier if any normalization is not desired). - -see B. +Under C<(normalization =E undef)>, only contiguous contractions +are resolved; e.g. C would be primary equal to C, +even if C (and C) is ordered after C. +In this point, +C<(normalization =E undef, preprocess =E sub { NFD(shift) })> +B equivalent to C<(normalization =E 'NFD')>. + +In the case of C<(normalization =E "prenormalized")>, +any normalization is not performed, but +non-contiguous contractions with combining characters are performed. +Therefore +C<(normalization =E 'prenormalized', preprocess =E sub { NFD(shift) })> +B equivalent to C<(normalization =E 'NFD')>. +If source strings are finely prenormalized, +C<(normalization =E 'prenormalized')> may save time for normalization. + +Except C<(normalization =E undef)>, +B is required (see also B). =item overrideCJK diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 4f61b83..3d39bbe 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,8 +1,16 @@ Revision history for Perl module Unicode::Collate. +0.28 Sat Sep 06 20:16:01 2003 + - Fixed another inconsistency under (normalization => undef): + Non-contiguous contraction is always neglected. + - Fixed: according to S2.1 in UTS #10, a blocked combining character + should not be contracted. One test in test.t was wrong, then removed. + - Added contract.t. + - (normalization => "prenormalized") is able to be used. + 0.27 Sun Aug 31 22:23:17 2003 some improvements: - - The maximum length of contracted CE was not checked. + - The maximum length of contracted CE was not checked (v0.22 to v0.26). Collation of a large string including a first letter of a contraction that is not a part of that contraction (say, 'c' of 'ca' where 'ch' is defined) was too slow, inefficient. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 21e1ff8..3c86573 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.27 +Unicode/Collate version 0.28 =============================== NAME diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t new file mode 100644 index 0000000..c2aaecf --- /dev/null +++ b/lib/Unicode/Collate/t/contract.t @@ -0,0 +1,146 @@ +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 40 }; + +use strict; +use warnings; +use Unicode::Collate; + +use vars qw($IsEBCDIC); +$IsEBCDIC = ord("A") != 0x41; + +our $kjeEntry = <<'ENTRIES'; +0301 ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT +0334 ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY +043A ; [.0D31.0020.0002.043A] # CYRILLIC SMALL LETTER KA +041A ; [.0D31.0020.0008.041A] # CYRILLIC CAPITAL LETTER KA +045C ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE +043A 0301 ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE +040C ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE +041A 0301 ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE +ENTRIES + +our $aaEntry = <<'ENTRIES'; +0304 ; [.0000.005A.0002.0304] # COMBINING MACRON (cc = 230) +030A ; [.0000.0043.0002.030A] # COMBINING RING ABOVE (cc = 230) +0327 ; [.0000.0055.0002.0327] # COMBINING CEDILLA (cc = 202) +031A ; [.0000.006B.0002.031A] # COMBINING LEFT ANGLE ABOVE (cc = 232) +0061 ; [.0A15.0020.0002.0061] # LATIN SMALL LETTER A +0041 ; [.0A15.0020.0008.0041] # LATIN CAPITAL LETTER A +007A ; [.0C13.0020.0002.007A] # LATIN SMALL LETTER Z +005A ; [.0C13.0020.0008.005A] # LATIN CAPITAL LETTER Z +00E5 ; [.0C25.0020.0002.00E5] # LATIN SMALL LETTER A WITH RING ABOVE; QQCM +00C5 ; [.0C25.0020.0008.00C5] # LATIN CAPITAL LETTER A WITH RING ABOVE; QQCM +0061 030A ; [.0C25.0020.0002.0061] # LATIN SMALL LETTER A WITH RING ABOVE +0041 030A ; [.0C25.0020.0008.0041] # LATIN CAPITAL LETTER A WITH RING ABOVE +ENTRIES + +######################### + +ok(1); # If we made it this far, we're ok. + +my $kjeNoN = Unicode::Collate->new( + level => 1, + table => undef, + normalization => undef, + entry => $kjeEntry, +); + +ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}")); +ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}")); + +our %sortkeys; + +$sortkeys{'KAac'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}"); +$sortkeys{'KAta'} = $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}"); +$sortkeys{'KAat'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}"); + +eval { require Unicode::Normalize }; +if (!$@ && !$IsEBCDIC) { + my $kjeNFD = Unicode::Collate->new( + level => 1, + table => undef, + entry => $kjeEntry, + ); +ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0301}")); +ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}")); + + my $aaNFD = Unicode::Collate->new( + level => 1, + table => undef, + entry => $aaEntry, + ); + +ok($aaNFD->lt("Z", "A\x{30A}\x{304}")); +ok($aaNFD->eq("A", "A\x{304}\x{30A}")); +ok($aaNFD->eq(pack('U', 0xE5), "A\x{30A}\x{304}")); +ok($aaNFD->eq("A\x{304}", "A\x{304}\x{30A}")); +ok($aaNFD->lt("Z", "A\x{327}\x{30A}")); +ok($aaNFD->lt("Z", "A\x{30A}\x{327}")); +ok($aaNFD->lt("Z", "A\x{31A}\x{30A}")); +ok($aaNFD->lt("Z", "A\x{30A}\x{31A}")); + + my $aaPre = Unicode::Collate->new( + level => 1, + normalization => "prenormalized", + table => undef, + entry => $aaEntry, + ); + +ok($aaPre->lt("Z", "A\x{30A}\x{304}")); +ok($aaPre->eq("A", "A\x{304}\x{30A}")); +ok($aaPre->eq(pack('U', 0xE5), "A\x{30A}\x{304}")); +ok($aaPre->eq("A\x{304}", "A\x{304}\x{30A}")); +ok($aaPre->lt("Z", "A\x{327}\x{30A}")); +ok($aaPre->lt("Z", "A\x{30A}\x{327}")); +ok($aaPre->lt("Z", "A\x{31A}\x{30A}")); +ok($aaPre->lt("Z", "A\x{30A}\x{31A}")); +} +else { + ok(1) for 1..20; +} + +# again: loading Unicode::Normalize should not affect $kjeNoN. +ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}")); +ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}")); +ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}")); + +ok($sortkeys{'KAac'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}")); +ok($sortkeys{'KAta'}, $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}")); +ok($sortkeys{'KAat'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}")); + +my $aaNoN = Unicode::Collate->new( + level => 1, + table => undef, + entry => $aaEntry, + normalization => undef, +); + +ok($aaNoN->lt("Z", "A\x{30A}\x{304}")); +ok($aaNoN->eq("A", "A\x{304}\x{30A}")); +ok($aaNoN->eq(pack('U', 0xE5), "A\x{30A}\x{304}")); +ok($aaNoN->eq("A\x{304}", "A\x{304}\x{30A}")); +ok($aaNoN->eq("A", "A\x{327}\x{30A}")); +ok($aaNoN->lt("Z", "A\x{30A}\x{327}")); +ok($aaNoN->eq("A", "A\x{31A}\x{30A}")); +ok($aaNoN->lt("Z", "A\x{30A}\x{31A}")); + diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t index de4ca59..0c170e4 100644 --- a/lib/Unicode/Collate/t/test.t +++ b/lib/Unicode/Collate/t/test.t @@ -15,7 +15,7 @@ BEGIN { } use Test; -BEGIN { plan tests => 199 }; +BEGIN { plan tests => 200 }; use strict; use warnings; @@ -86,7 +86,8 @@ eval { require Unicode::Normalize }; if (!$@ && !$IsEBCDIC) { my $NFD = Unicode::Collate->new( - table => undef, + table => 'keys.txt', + level => 1, entry => <<'ENTRIES', 0430 ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A 0410 ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A @@ -101,14 +102,11 @@ ENTRIES ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}")); ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B")); ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A")); - ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}", - "\x{0430}\x{309A}\x{3099}\x{0308}") ); } else { ok(1); ok(1); ok(1); - ok(1); } ############## @@ -117,7 +115,7 @@ my $trad = Unicode::Collate->new( table => 'keys.txt', normalization => undef, ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/, - level => 4, + level => 3, entry => << 'ENTRIES', 0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish 0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish @@ -138,6 +136,8 @@ ok( ); ok($trad->eq("ocho", "oc\cAho")); # UCA v9 ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 +ok($trad->eq("-", "")); # also UCA v8 +ok($trad->lt("oc-ho", "ocho")); # also UCA v8 my $hiragana = "\x{3042}\x{3044}"; my $katakana = "\x{30A2}\x{30A4}"; -- 1.8.3.1