Update Unicode-Collate to CPAN version 0.71

[perl5.git] / cpan / Unicode-Collate / Collate / Locale.pm
diff --git a/cpan/Unicode-Collate/Collate/Locale.pm b/cpan/Unicode-Collate/Collate/Locale.pm

index bbd6e1f..c589144 100644 (file)
--- a/cpan/Unicode-Collate/Collate/Locale.pm
+++ b/cpan/Unicode-Collate/Collate/Locale.pm
@@ -4,26 +4,29 @@ use strict;
  use Carp;
  use base qw(Unicode::Collate);
  
-our $VERSION = '0.63';
+our $VERSION = '0.71';
  
  use File::Spec;
  
  (my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//;
-my $KeyPath = File::Spec->catfile('allkeys.txt');
  my $PL_EXT  = '.pl';
  
  my %LocaleFile = map { ($_, $_) } qw(
     af ar az ca cs cy da eo es et fi fil fo fr ha haw
-   hr hu hy ig is kk kl lt lv mt nb nn nso om pl ro ru
-   se sk sl sq sv sw tn to tr uk vi wo yo
+   hr hu hy ig is ja kk kl ko lt lv mt nb nn nso om pl ro ru
+   se sk sl sq sv sw tn to tr uk vi wo yo zh
  );
     $LocaleFile{'default'}         = '';
     $LocaleFile{'de__phonebook'}   = 'de_phone';
     $LocaleFile{'es__traditional'} = 'es_trad';
-   $LocaleFile{'be'} = "ru";
-   $LocaleFile{'bg'} = "ru";
-   $LocaleFile{'mk'} = "ru";
-   $LocaleFile{'sr'} = "ru";
+   $LocaleFile{'be'} = 'ru';
+   $LocaleFile{'bg'} = 'ru';
+   $LocaleFile{'mk'} = 'ru';
+   $LocaleFile{'sr'} = 'ru';
+   $LocaleFile{'zh__big5han'}   = 'zh_big5';
+   $LocaleFile{'zh__gb2312han'} = 'zh_gb';
+   $LocaleFile{'zh__pinyin'}    = 'zh_pin';
+   $LocaleFile{'zh__stroke'}    = 'zh_strk';
  
  sub _locale {
      my $locale = shift;
@@ -32,6 +35,8 @@ sub _locale {
         $locale =~ tr/\-\ \./_/;
         $locale =~ s/_phone(?:bk)?\z/_phonebook/;
         $locale =~ s/_trad\z/_traditional/;
+       $locale =~ s/_big5\z/_big5han/;
+       $locale =~ s/_gb2312\z/_gb2312han/;
         $LocaleFile{$locale} and return $locale;
  
         my ($l,$t,$v) = split(/_/, $locale.'__');
@@ -65,7 +70,6 @@ sub new {
      if (exists $hash{table}) {
         croak "your table can't be used with Unicode::Collate::Locale";
      }
-    $hash{table} = $KeyPath;
  
      my $href = _fetchpl($hash{accepted_locale});
      while (my($k,$v) = each %$href) {
@@ -88,11 +92,22 @@ Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
  
    use Unicode::Collate::Locale;
  
+  #construct
    $Collator = Unicode::Collate::Locale->
        new(locale => $locale_name, %tailoring);
  
+  #sort
    @sorted = $Collator->sort(@not_sorted);
  
+  #compare
+  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
+
+B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
+according to Perl's Unicode support. See L<perlunicode>,
+L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
+Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>)
+or decode them beforehand.
+
  =head1 DESCRIPTION
  
  This module provides linguistic tailoring for it
@@ -103,7 +118,7 @@ taking advantage of C<Unicode::Collate>.
  The C<new> method returns a collator object.
  
  A parameter list for the constructor is a hash, which can include
-a special key C<'locale'> and its value (case-insensitive) standing
+a special key C<locale> and its value (case-insensitive) standing
  for a two-letter language code (ISO-639) like C<'en'> for English.
  For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
  returns a collator tailored for French.
@@ -122,10 +137,9 @@ fallback is selected in the following order:
      4. language
      5. default
  
-Tailoring tags provided by C<Unicode::Collate> are allowed
-as long as they are not used for C<'locale'> support.
-Esp. the C<table> tag is always untailorable
-since it is reserved for DUCET.
+Tailoring tags provided by C<Unicode::Collate> are allowed as long as
+they are not used for C<locale> support.  Esp. the C<table> tag
+is always untailorable since it is reserved for DUCET.
  
  E.g. a collator for French, which ignores diacritics and case difference
  (i.e. level 1), with reversed case ordering and no normalization.
@@ -137,6 +151,21 @@ E.g. a collator for French, which ignores diacritics and case difference
          normalization => undef
      )
  
+Overriding a behavior already tailored by C<locale> is disallowed
+if such a tailoring is passed to C<new()>.
+
+    Unicode::Collate::Locale->new(
+        locale => 'da',
+        upper_before_lower => 0, # causes error as reserved by 'da'
+    )
+
+However, C<change()> inherited from C<Unicode::Collate> allows
+overriding a tailoring that is reserved by C<locale>. Examples:
+
+    new(locale => 'ca')->change(backwards => undef)
+    new(locale => 'da')->change(upper_before_lower => 0)
+    new(locale => 'ja')->change(overrideCJK => undef)
+
  =head2 Methods
  
  C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
@@ -184,8 +213,10 @@ this method returns a string C<'default'> meaning no special tailoring.
        hy                Armenian
        ig                Igbo
        is                Icelandic
+      ja                Japanese [1]
        kk                Kazakh
        kl                Kalaallisut
+      ko                Korean [2]
        lt                Lithuanian
        lv                Latvian
        mk                Macedonian
@@ -211,9 +242,14 @@ this method returns a string C<'default'> meaning no special tailoring.
        vi                Vietnamese
        wo                Wolof
        yo                Yoruba
+      zh                Chinese
+      zh__big5han       Chinese (ideographs: big5 order)
+      zh__gb2312han     Chinese (ideographs: GB-2312 order)
+      zh__pinyin        Chinese (ideographs: pinyin order)
+      zh__stroke        Chinese (ideographs: stroke order)
      ----------------------------------------------------------
  
-Locales according to default UCA rules include:
+Locales according to the default UCA rules include
  de (German),
  en (English),
  ga (Irish),
@@ -228,12 +264,25 @@ st (Southern Sotho),
  xh (Xhosa),
  zu (Zulu).
  
+B<Note>
+
+[1] ja: Ideographs are sorted in JIS X 0208 order.
+Fullwidth and halfwidth forms are identical to their normal form.
+The difference between hiragana and katakana is at the 4th level;
+comparing them also requires C<(variable =E<gt> 'Non-ignorable')>,
+and then C<katakana_before_hiragana> has no effect.
+
+[2] ko: Plenty of ideographs are sorted by their reading. Such
+an ideograph is primary (level 1) equal to, and secondary (level 2)
+greater than, the corresponding hangul syllable.
+
  =head1 INSTALL
  
-Installation of Unicode::Collate::Locale requires F<Collate/Locale.pm>,
-F<Collate/Locale/*.pm> and F<Collate/allkeys.txt>.  On building,
-Unicode::Collate::Locale doesn't require F<data/*.txt> and F<mklocale>.
-Tests for Unicode::Collate::Locale are named F<t/loc_*.t>.
+Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
+F<Collate/Locale/*.pm>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
+On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
+F<gendata/*>, and F<mklocale>.
+Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
  
  =head1 CAVEAT
  
@@ -241,12 +290,12 @@ Tests for Unicode::Collate::Locale are named F<t/loc_*.t>.
  
  =item tailoring is not maximum
  
-If a certain letter is tailored, its equivalents are not always
+Even if a certain letter is tailored, its equivalent would not always be
  tailored as well as it. For example, even though W is tailored,
  fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
  tailored. The result may depend on whether source strings are
  normalized or not, and whether decomposed or composed.
-Thus C<(normalization =E<gt> undef> is less preferred.
+Thus C<(normalization =E<gt> undef)> is less preferred.
  
  =back
  
@@ -254,7 +303,7 @@ Thus C<(normalization =E<gt> undef> is less preferred.
  
  The Unicode::Collate::Locale module for perl was written
  by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
-This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan.
+This module is Copyright(C) 2004-2011, SADAHIRO Tomoyuki. Japan.
  All rights reserved.
  
  This module is free software; you can redistribute it and/or