Upgrade to Unicode::Collate 0.28

author Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)

committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)
author Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)
diff --git a/MANIFEST b/MANIFEST

index eecaacd..e55b15b 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -1729,6 +1729,7 @@ lib/Unicode/Collate/Changes       Unicode::Collate
  lib/Unicode/Collate/keys.txt   Unicode::Collate
  lib/Unicode/Collate.pm         Unicode::Collate
  lib/Unicode/Collate/README     Unicode::Collate
+lib/Unicode/Collate/t/contract.t       Unicode::Collate
  lib/Unicode/Collate/t/hangul.t Unicode::Collate
  lib/Unicode/Collate/t/index.t  Unicode::Collate
  lib/Unicode/Collate/t/test.t   Unicode::Collate
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm

index 2bcc315..18ed446 100644 (file)
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
  
  require Exporter;
  
-our $VERSION = '0.27';
+our $VERSION = '0.28';
  our $PACKAGE = __PACKAGE__;
  
  our @ISA = qw(Exporter);
@@ -53,7 +53,7 @@ use constant NOMATCHPOS => -1;
  # A coderef to get combining class imported from Unicode::Normalize
  # (i.e. \&Unicode::Normalize::getCombinClass).
  # This is also used as a HAS_UNICODE_NORMALIZE flag.
-our $getCombinClass;
+our $CVgetCombinClass;
  
  # Supported Levels
  use constant MinLevel => 1;
@@ -225,17 +225,16 @@ sub checkCollator {
         croak "Unicode/Normalize.pm is required to normalize strings: $@"
             if $@;
  
-       $getCombinClass = \&Unicode::Normalize::getCombinClass
-           if ! $getCombinClass;
+       $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
+           if ! $CVgetCombinClass;
  
-       my $norm = $self->{normalization};
-       $self->{normCode} = sub {
+       if ($self->{normalization} ne 'prenormalized') {
+           my $norm = $self->{normalization};
+           $self->{normCode} = sub {
                 Unicode::Normalize::normalize($norm, shift);
             };
-
-       eval { $self->{normCode}->("") }; # try
-       if ($@) {
-           croak "$PACKAGE unknown normalization form name: $norm";
+           eval { $self->{normCode}->("") }; # try
+           $@ and croak "$PACKAGE unknown normalization form name: $norm";
         }
      }
      return;
@@ -261,7 +260,7 @@ sub new
         if ! exists $self->{overrideHangul};
      $self->{overrideCJK} = ''
         if ! exists $self->{overrideCJK};
-    $self->{normalization} = 'D'
+    $self->{normalization} = 'NFD'
         if ! exists $self->{normalization};
      $self->{alternate} = $self->{alternateTable} || 'shifted'
         if ! exists $self->{alternate};
@@ -490,19 +489,31 @@ sub splitCE
                     $i = $p;
                 }
             }
-       }
  
-       # with Combining Char (UTS#10, 4.2.1).
-       # requires Unicode::Normalize.
-       # Not be $wLen, as not croaked due to $norm.
-       if ($getCombinClass) {
-           for (my $p = $i + 1; $p < @src; $p++) {
-               next if ! defined $src[$p];
-               last unless $getCombinClass->($src[$p]);
-               my $tail = CODE_SEP . $src[$p];
-               if ($ent->{$ce.$tail}) {
-                   $ce .= $tail;
-                   $src[$p] = undef;
+       # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
+       # This process requires Unicode::Normalize.
+       # If "normalize" is undef, here should be skipped *always*
+       # (in spite of bool value of $CVgetCombinClass),
+       # since canonical ordering cannot be expected.
+       # Blocked combining character should not be contracted.
+
+           if ($self->{normalization})
+           # $self->{normCode} is false in the case of "prenormalized".
+           {
+               my $preCC = 0;
+               my $curCC = 0;
+
+               for (my $p = $i + 1; $p < @src; $p++) {
+                   next if ! defined $src[$p];
+                   $curCC = $CVgetCombinClass->($src[$p]);
+                   last unless $curCC;
+                   my $tail = CODE_SEP . $src[$p];
+                   if ($preCC != $curCC && $ent->{$ce.$tail}) {
+                       $ce .= $tail;
+                       $src[$p] = undef;
+                   } else {
+                       $preCC = $curCC;
+                   }
                 }
             }
         }
@@ -1128,16 +1139,37 @@ If specified, strings are normalized before preparation of sort keys
  
  A form name C<Unicode::Normalize::normalize()> accepts will be applied
  as C<$normalization_form>.
+Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
  See C<Unicode::Normalize::normalize()> for detail.
  If omitted, C<'NFD'> is used.
  
  L<normalization> is performed after L<preprocess> (if defined).
  
-If C<undef> is passed explicitly as the value for this key,
+Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
+though they are not concerned with C<Unicode::Normalize::normalize()>.
+
+If C<undef> (not a string C<"undef">) is passed explicitly
+as the value for this key,
  any normalization is not carried out (this may make tailoring easier
  if any normalization is not desired).
-
-see B<CAVEAT>.
+Under C<(normalization =E<gt> undef)>, only contiguous contractions
+are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
+even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
+In this point,
+C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
+B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
+
+In the case of C<(normalization =E<gt> "prenormalized")>,
+any normalization is not performed, but
+non-contiguous contractions with combining characters are performed.
+Therefore
+C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
+B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
+If source strings are finely prenormalized,
+C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
+
+Except C<(normalization =E<gt> undef)>,
+B<Unicode::Normalize> is required (see also B<CAVEAT>).
  
  =item overrideCJK
  
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes

index 4f61b83..3d39bbe 100644 (file)
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,8 +1,16 @@
  Revision history for Perl module Unicode::Collate.
  
+0.28  Sat Sep 06 20:16:01 2003
+    - Fixed another inconsistency under (normalization => undef):
+      Non-contiguous contraction is always neglected.
+    - Fixed: according to S2.1 in UTS #10, a blocked combining character
+      should not be contracted. One test in test.t was wrong, then removed.
+    - Added contract.t.
+    - (normalization => "prenormalized") is able to be used.
+
  0.27  Sun Aug 31 22:23:17 2003
        some improvements:
-    - The maximum length of contracted CE was not checked.
+    - The maximum length of contracted CE was not checked (v0.22 to v0.26).
        Collation of a large string including a first letter of a contraction
        that is not a part of that contraction (say, 'c' of 'ca'
        where 'ch' is defined) was too slow, inefficient.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README

index 21e1ff8..3c86573 100644 (file)
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.27
+Unicode/Collate version 0.28
  ===============================
  
  NAME
diff --git a/lib/Unicode/Collate/t/contract.t b/lib/Unicode/Collate/t/contract.t

new file mode 100644 (file)

index 0000000..c2aaecf
--- /dev/null
+++ b/lib/Unicode/Collate/t/contract.t
@@ -0,0 +1,146 @@
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+       print "1..0 # Unicode::Collate " .
+           "cannot stringify a Unicode code point\n";
+       exit 0;
+    }
+}
+
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+use Test;
+BEGIN { plan tests => 40 };
+
+use strict;
+use warnings;
+use Unicode::Collate;
+
+use vars qw($IsEBCDIC);
+$IsEBCDIC = ord("A") != 0x41;
+
+our $kjeEntry = <<'ENTRIES';
+0301  ; [.0000.0032.0002.0301] # COMBINING ACUTE ACCENT
+0334  ; [.0000.008B.0002.0334] # COMBINING TILDE OVERLAY
+043A  ; [.0D31.0020.0002.043A] # CYRILLIC SMALL LETTER KA
+041A  ; [.0D31.0020.0008.041A] # CYRILLIC CAPITAL LETTER KA
+045C  ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+043A 0301 ; [.0DA1.0020.0002.045C] # CYRILLIC SMALL LETTER KJE
+040C  ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+041A 0301 ; [.0DA1.0020.0008.040C] # CYRILLIC CAPITAL LETTER KJE
+ENTRIES
+
+our $aaEntry = <<'ENTRIES';
+0304  ; [.0000.005A.0002.0304] # COMBINING MACRON (cc = 230)
+030A  ; [.0000.0043.0002.030A] # COMBINING RING ABOVE (cc = 230)
+0327  ; [.0000.0055.0002.0327] # COMBINING CEDILLA (cc = 202)
+031A  ; [.0000.006B.0002.031A] # COMBINING LEFT ANGLE ABOVE (cc = 232)
+0061  ; [.0A15.0020.0002.0061] # LATIN SMALL LETTER A
+0041  ; [.0A15.0020.0008.0041] # LATIN CAPITAL LETTER A
+007A  ; [.0C13.0020.0002.007A] # LATIN SMALL LETTER Z
+005A  ; [.0C13.0020.0008.005A] # LATIN CAPITAL LETTER Z
+00E5  ; [.0C25.0020.0002.00E5] # LATIN SMALL LETTER A WITH RING ABOVE; QQCM
+00C5  ; [.0C25.0020.0008.00C5] # LATIN CAPITAL LETTER A WITH RING ABOVE; QQCM
+0061 030A ; [.0C25.0020.0002.0061] # LATIN SMALL LETTER A WITH RING ABOVE
+0041 030A ; [.0C25.0020.0008.0041] # LATIN CAPITAL LETTER A WITH RING ABOVE
+ENTRIES
+
+#########################
+
+ok(1); # If we made it this far, we're ok.
+
+my $kjeNoN = Unicode::Collate->new(
+    level => 1,
+    table => undef,
+    normalization => undef,
+    entry => $kjeEntry,
+);
+
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+our %sortkeys;
+
+$sortkeys{'KAac'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}");
+$sortkeys{'KAta'} = $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}");
+$sortkeys{'KAat'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}");
+
+eval { require Unicode::Normalize };
+if (!$@ && !$IsEBCDIC) {
+    my $kjeNFD = Unicode::Collate->new(
+       level => 1,
+       table => undef,
+       entry => $kjeEntry,
+    );
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->lt("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNFD->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+    my $aaNFD = Unicode::Collate->new(
+       level => 1,
+       table => undef,
+       entry => $aaEntry,
+    );
+
+ok($aaNFD->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A", "A\x{304}\x{30A}"));
+ok($aaNFD->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNFD->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{327}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNFD->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaNFD->lt("Z", "A\x{30A}\x{31A}"));
+
+    my $aaPre = Unicode::Collate->new(
+       level => 1,
+       normalization => "prenormalized",
+       table => undef,
+       entry => $aaEntry,
+    );
+
+ok($aaPre->lt("Z", "A\x{30A}\x{304}"));
+ok($aaPre->eq("A", "A\x{304}\x{30A}"));
+ok($aaPre->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaPre->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{327}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{327}"));
+ok($aaPre->lt("Z", "A\x{31A}\x{30A}"));
+ok($aaPre->lt("Z", "A\x{30A}\x{31A}"));
+}
+else {
+  ok(1) for 1..20;
+}
+
+# again: loading Unicode::Normalize should not affect $kjeNoN.
+ok($kjeNoN->lt("\x{043A}", "\x{043A}\x{0301}"));
+ok($kjeNoN->gt("\x{045C}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{043A}", "\x{043A}\x{0334}\x{0301}"));
+ok($kjeNoN->eq("\x{045C}", "\x{043A}\x{0301}\x{0334}"));
+
+ok($sortkeys{'KAac'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}"));
+ok($sortkeys{'KAta'}, $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}"));
+ok($sortkeys{'KAat'}, $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}"));
+
+my $aaNoN = Unicode::Collate->new(
+    level => 1,
+    table => undef,
+    entry => $aaEntry,
+    normalization => undef,
+);
+
+ok($aaNoN->lt("Z", "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A", "A\x{304}\x{30A}"));
+ok($aaNoN->eq(pack('U', 0xE5), "A\x{30A}\x{304}"));
+ok($aaNoN->eq("A\x{304}", "A\x{304}\x{30A}"));
+ok($aaNoN->eq("A", "A\x{327}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{327}"));
+ok($aaNoN->eq("A", "A\x{31A}\x{30A}"));
+ok($aaNoN->lt("Z", "A\x{30A}\x{31A}"));
+
diff --git a/lib/Unicode/Collate/t/test.t b/lib/Unicode/Collate/t/test.t

index de4ca59..0c170e4 100644 (file)
--- a/lib/Unicode/Collate/t/test.t
+++ b/lib/Unicode/Collate/t/test.t
@@ -15,7 +15,7 @@ BEGIN {
  }
  
  use Test;
-BEGIN { plan tests => 199 };
+BEGIN { plan tests => 200 };
  
  use strict;
  use warnings;
@@ -86,7 +86,8 @@ eval { require Unicode::Normalize };
  
  if (!$@ && !$IsEBCDIC) {
    my $NFD = Unicode::Collate->new(
-    table => undef,
+    table => 'keys.txt',
+    level => 1,
      entry => <<'ENTRIES',
  0430  ; [.0CB5.0020.0002.0430] # CYRILLIC SMALL LETTER A
  0410  ; [.0CB5.0020.0008.0410] # CYRILLIC CAPITAL LETTER A
@@ -101,14 +102,11 @@ ENTRIES
    ok($NFD->eq("\x{4D3}\x{325}", "\x{430}\x{308}\x{325}"));
    ok($NFD->lt("\x{430}\x{308}A", "\x{430}\x{308}B"));
    ok($NFD->lt("\x{430}\x{3099}B", "\x{430}\x{308}\x{3099}A"));
-  ok($NFD->eq("\x{0430}\x{3099}\x{309A}\x{0308}",
-              "\x{0430}\x{309A}\x{3099}\x{0308}") );
  }
  else {
    ok(1);
    ok(1);
    ok(1);
-  ok(1);
  }
  
  ##############
@@ -117,7 +115,7 @@ my $trad = Unicode::Collate->new(
    table => 'keys.txt',
    normalization => undef,
    ignoreName => qr/HANGUL|HIRAGANA|KATAKANA|BOPOMOFO/,
-  level => 4,
+  level => 3,
    entry => << 'ENTRIES',
   0063 0068 ; [.0A3F.0020.0002.0063] % "ch" in traditional Spanish
   0043 0068 ; [.0A3F.0020.0008.0043] # "Ch" in traditional Spanish
@@ -138,6 +136,8 @@ ok(
  );
  ok($trad->eq("ocho", "oc\cAho")); # UCA v9
  ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9
+ok($trad->eq("-", "")); # also UCA v8
+ok($trad->lt("oc-ho", "ocho")); # also UCA v8
  
  my $hiragana = "\x{3042}\x{3044}";
  my $katakana = "\x{30A2}\x{30A4}";
author	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Sun, 7 Sep 2003 19:12:05 +0000 (19:12 +0000)
MANIFEST		patch \| blob \| blame \| history
lib/Unicode/Collate.pm		patch \| blob \| blame \| history
lib/Unicode/Collate/Changes		patch \| blob \| blame \| history
lib/Unicode/Collate/README		patch \| blob \| blame \| history
lib/Unicode/Collate/t/contract.t	[new file with mode: 0644]	patch \| blob
lib/Unicode/Collate/t/test.t		patch \| blob \| blame \| history