cpan/Unicode-Collate/Collate/Locale.pm

   1 package Unicode::Collate::Locale;
   2
   3 use strict;
   4 use Carp;
   5 use base qw(Unicode::Collate);
   6
   7 our $VERSION = '0.67';
   8
   9 use File::Spec;
  10
  11 (my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//;
  12 my $KeyPath = File::Spec->catfile('allkeys.txt');
  13 my $PL_EXT  = '.pl';
  14
  15 my %LocaleFile = map { ($_, $_) } qw(
  16    af ar az ca cs cy da eo es et fi fil fo fr ha haw
  17    hr hu hy ig is ja kk kl ko lt lv mt nb nn nso om pl ro ru
  18    se sk sl sq sv sw tn to tr uk vi wo yo zh
  19 );
  20    $LocaleFile{'default'}         = '';
  21    $LocaleFile{'de__phonebook'}   = 'de_phone';
  22    $LocaleFile{'es__traditional'} = 'es_trad';
  23    $LocaleFile{'be'} = 'ru';
  24    $LocaleFile{'bg'} = 'ru';
  25    $LocaleFile{'mk'} = 'ru';
  26    $LocaleFile{'sr'} = 'ru';
  27    $LocaleFile{'zh__big5han'}   = 'zh_big5';
  28    $LocaleFile{'zh__gb2312han'} = 'zh_gb';
  29    $LocaleFile{'zh__pinyin'}    = 'zh_pin';
  30    $LocaleFile{'zh__stroke'}    = 'zh_strk';
  31
  32 sub _locale {
  33     my $locale = shift;
  34     if ($locale) {
  35         $locale = lc $locale;
  36         $locale =~ tr/\-\ \./_/;
  37         $locale =~ s/_phone(?:bk)?\z/_phonebook/;
  38         $locale =~ s/_trad\z/_traditional/;
  39         $locale =~ s/_big5\z/_big5han/;
  40         $locale =~ s/_gb2312\z/_gb2312han/;
  41         $LocaleFile{$locale} and return $locale;
  42
  43         my ($l,$t,$v) = split(/_/, $locale.'__');
  44         for my $loc ("${l}_${t}_$v", "${l}_$t", "${l}__$v", "${l}__$t", $l) {
  45             $LocaleFile{$loc} and return $loc;
  46         }
  47     }
  48     return 'default';
  49 }
  50
  51 sub getlocale {
  52     return shift->{accepted_locale};
  53 }
  54
  55 sub _fetchpl {
  56     my $accepted = shift;
  57     my $f = $LocaleFile{$accepted};
  58     return if !$f;
  59     $f .= $PL_EXT;
  60     my $path = File::Spec->catfile($ModPath, $f);
  61     my $h = do $path;
  62     croak "Unicode/Collate/Locale/$f can't be found" if !$h;
  63     return $h;
  64 }
  65
  66 sub new {
  67     my $class = shift;
  68     my %hash = @_;
  69     $hash{accepted_locale} = _locale($hash{locale});
  70
  71     if (exists $hash{table}) {
  72         croak "your table can't be used with Unicode::Collate::Locale";
  73     }
  74     $hash{table} = $KeyPath;
  75
  76     my $href = _fetchpl($hash{accepted_locale});
  77     while (my($k,$v) = each %$href) {
  78         if (exists $hash{$k}) {
  79             croak "$k is reserved by $hash{locale}, can't be overwritten";
  80         }
  81         $hash{$k} = $v;
  82     }
  83     return $class->SUPER::new(%hash);
  84 }
  85
  86 1;
  87 __END__
  88
  89 =head1 NAME
  90
  91 Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
  92
  93 =head1 SYNOPSIS
  94
  95   use Unicode::Collate::Locale;
  96
  97   #construct
  98   $Collator = Unicode::Collate::Locale->
  99       new(locale => $locale_name, %tailoring);
 100
 101   #sort
 102   @sorted = $Collator->sort(@not_sorted);
 103
 104   #compare
 105   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 106
 107 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
 108 according to Perl's Unicode support. See L<perlunicode>,
 109 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
 110 Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>)
 111 or should decode them before.
 112
 113 =head1 DESCRIPTION
 114
 115 This module provides linguistic tailoring for DUCET,
 116 taking advantage of C<Unicode::Collate>.
 117
 118 =head2 Constructor
 119
 120 The C<new> method returns a collator object.
 121
 122 A parameter list for the constructor is a hash, which can include
 123 a special key C<'locale'> and its value (case-insensitive) standing
 124 for a language code (ISO-639, two or three letters) like C<'en'> for English.
 125 For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
 126 returns a collator tailored for French.
 127
 128 C<$locale_name> may be suffixed with a territory(country)
 129 code or a variant code, which are separated with C<'_'>.
 130 E.g. C<en_US> for English in USA,
 131 C<es_ES_traditional> for Spanish in Spain (Traditional).
 132
 133 If C<$locale_name> is not available,
 134 fallback is selected in the following order:
 135
 136     1. language_territory_variant
 137     2. language_territory
 138     3. language__variant
 139     4. language
 140     5. default
 141
 142 Tailoring tags provided by C<Unicode::Collate> are allowed
 143 as long as they are not used for C<'locale'> support.
 144 Esp. the C<table> tag is always untailorable
 145 since it is reserved for DUCET.
 146
 147 E.g. a collator for French, which ignores diacritics and case difference
 148 (i.e. level 1), with reversed case ordering and no normalization.
 149
 150     Unicode::Collate::Locale->new(
 151         level => 1,
 152         locale => 'fr',
 153         upper_before_lower => 1,
 154         normalization => undef
 155     )
 156
 157 =head2 Methods
 158
 159 C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
 160 and methods other than C<new> are inherited from C<Unicode::Collate>.
 161
 162 Here is a list of additional methods:
 163
 164 =over 4
 165
 166 =item C<$Collator-E<gt>getlocale>
 167
 168 Returns the locale name actually accepted and used for collation.
 169 If linguistic tailoring is not provided for a language code you passed
 170 (intentionally for some languages, or due to the incomplete implementation),
 171 this method returns a string C<'default'> meaning no special tailoring.
 172
 173 =back
 174
 175 =head2 A list of tailorable locales
 176
 177       locale name       description
 178     ----------------------------------------------------------
 179       af                Afrikaans
 180       ar                Arabic
 181       az                Azerbaijani (Azeri)
 182       be                Belarusian
 183       bg                Bulgarian
 184       ca                Catalan
 185       cs                Czech
 186       cy                Welsh
 187       da                Danish
 188       de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
 189       eo                Esperanto
 190       es                Spanish
 191       es__traditional   Spanish ('ch' and 'll' as a grapheme)
 192       et                Estonian
 193       fi                Finnish
 194       fil               Filipino
 195       fo                Faroese
 196       fr                French
 197       ha                Hausa
 198       haw               Hawaiian
 199       hr                Croatian
 200       hu                Hungarian
 201       hy                Armenian
 202       ig                Igbo
 203       is                Icelandic
 204       ja                Japanese [1]
 205       kk                Kazakh
 206       kl                Kalaallisut
 207       ko                Korean [2]
 208       lt                Lithuanian
 209       lv                Latvian
 210       mk                Macedonian
 211       mt                Maltese
 212       nb                Norwegian Bokmal
 213       nn                Norwegian Nynorsk
 214       nso               Northern Sotho
 215       om                Oromo
 216       pl                Polish
 217       ro                Romanian
 218       ru                Russian
 219       se                Northern Sami
 220       sk                Slovak
 221       sl                Slovenian
 222       sq                Albanian
 223       sr                Serbian
 224       sv                Swedish
 225       sw                Swahili
 226       tn                Tswana
 227       to                Tonga
 228       tr                Turkish
 229       uk                Ukrainian
 230       vi                Vietnamese
 231       wo                Wolof
 232       yo                Yoruba
 233       zh                Chinese
 234       zh__big5han       Chinese (ideographs: big5 order)
 235       zh__gb2312han     Chinese (ideographs: GB-2312 order)
 236       zh__pinyin        Chinese (ideographs: pinyin order)
 237       zh__stroke        Chinese (ideographs: stroke order)
 238     ----------------------------------------------------------
 239
 240 Locales according to the default UCA rules include
 241 de (German),
 242 en (English),
 243 ga (Irish),
 244 id (Indonesian),
 245 it (Italian),
 246 ka (Georgian),
 247 ln (Lingala),
 248 ms (Malay),
 249 nl (Dutch),
 250 pt (Portuguese),
 251 st (Southern Sotho),
 252 xh (Xhosa),
 253 zu (Zulu).
 254
 255 B<Note>
 256
 257 [1] ja: Ideographs are sorted in JIS X 0208 order.
 258 Fullwidth and halfwidth forms are identical to their normal form.
 259 The difference between hiragana and katakana is at the 4th level,
 260 the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
 261 and then C<katakana_before_hiragana> has no effect.
 262
 263 [2] ko: Plenty of ideographs are sorted by their reading. Such
 264 an ideograph is primary (level 1) equal to, and secondary (level 2)
 265 greater than, the corresponding hangul syllable.
 266
 267 =head1 INSTALL
 268
 269 Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
 270 F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
 271 On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
 272 F<gendata/*>, and F<mklocale>.
 273 Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
 274
 275 =head1 CAVEAT
 276
 277 =over 4
 278
 279 =item tailoring is not maximum
 280
 281 Even if a certain letter is tailored, its equivalents would not always be
 282 tailored as well. For example, even though W is tailored,
 283 fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
 284 tailored. The result may depend on whether source strings are
 285 normalized or not, and whether decomposed or composed.
 286 Thus C<(normalization =E<gt> undef)> is less preferred.
 287
 288 =back
 289
 290 =head1 AUTHOR
 291
 292 The Unicode::Collate::Locale module for perl was written
 293 by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
 294 This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan.
 295 All rights reserved.
 296
 297 This module is free software; you can redistribute it and/or
 298 modify it under the same terms as Perl itself.
 299
 300 =head1 SEE ALSO
 301
 302 =over 4
 303
 304 =item Unicode Collation Algorithm - UTS #10
 305
 306 L<http://www.unicode.org/reports/tr10/>
 307
 308 =item The Default Unicode Collation Element Table (DUCET)
 309
 310 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
 311
 312 =item Unicode Locale Data Markup Language (LDML) - UTS #35
 313
 314 L<http://www.unicode.org/reports/tr35/>
 315
 316 =item CLDR - Unicode Common Locale Data Repository
 317
 318 L<http://cldr.unicode.org/>
 319
 320 =item L<Unicode::Collate>
 321
 322 =item L<Unicode::Normalize>
 323
 324 =back
 325
 326 =cut