cpan/Unicode-Collate/Collate/Locale.pm

   1 package Unicode::Collate::Locale;
   2
   3 use strict;
   4 use warnings;
   5 use Carp;
   6 use base qw(Unicode::Collate);
   7
   8 our $VERSION = '1.21';
   9
  10 my $PL_EXT  = '.pl';
  11
  12 my %LocaleFile = map { ($_, $_) } qw(
  13    af ar as az be bn ca cs cy da dsb ee eo es et fa fi fil fo gu
  14    ha haw he hi hr hu hy ig is ja kk kl kn ko kok lkt ln lt lv
  15    mk ml mr mt nb nn nso om or pa pl ro sa se si sk sl sq sr sv
  16    ta te th tn to tr uk ur vi vo wae wo yo zh
  17 );
  18    $LocaleFile{'default'} = '';
  19 # aliases
  20    $LocaleFile{'bs'}      = 'hr';
  21    $LocaleFile{'bs_Cyrl'} = 'sr';
  22    $LocaleFile{'sr_Latn'} = 'hr';
  23 # short file names
  24    $LocaleFile{'de__phonebook'}   = 'de_phone';
  25    $LocaleFile{'de_AT_phonebook'} = 'de_at_ph';
  26    $LocaleFile{'es__traditional'} = 'es_trad';
  27    $LocaleFile{'fr_CA'}           = 'fr_ca';
  28    $LocaleFile{'fi__phonebook'}   = 'fi_phone';
  29    $LocaleFile{'si__dictionary'}  = 'si_dict';
  30    $LocaleFile{'sv__reformed'}    = 'sv_refo';
  31    $LocaleFile{'ug_Cyrl'}         = 'ug_cyrl';
  32    $LocaleFile{'zh__big5han'}     = 'zh_big5';
  33    $LocaleFile{'zh__gb2312han'}   = 'zh_gb';
  34    $LocaleFile{'zh__pinyin'}      = 'zh_pin';
  35    $LocaleFile{'zh__stroke'}      = 'zh_strk';
  36    $LocaleFile{'zh__zhuyin'}      = 'zh_zhu';
  37
  38 my %TypeAlias = qw(
  39     phone     phonebook
  40     phonebk   phonebook
  41     dict      dictionary
  42     reform    reformed
  43     trad      traditional
  44     big5      big5han
  45     gb2312    gb2312han
  46 );
  47
  48 sub _locale {
  49     my $locale = shift;
  50     if ($locale) {
  51         $locale = lc $locale;
  52         $locale =~ tr/\-\ \./_/;
  53         $locale =~ s/_([0-9a-z]+)\z/$TypeAlias{$1} ?
  54                                   "_$TypeAlias{$1}" : "_$1"/e;
  55         $LocaleFile{$locale} and return $locale;
  56
  57         my @code = split /_/, $locale;
  58         my $lan = shift @code;
  59         my $scr = @code && length $code[0] == 4 ? ucfirst shift @code : '';
  60         my $reg = @code && length $code[0] <  4 ? uc      shift @code : '';
  61         my $var = @code                         ?         shift @code : '';
  62
  63         my @list;
  64         push @list, (
  65             "${lan}_${scr}_${reg}_$var",
  66             "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
  67             "${lan}_${reg}_$var",  # empty $reg may be ${lan}__$var.
  68             "${lan}__$var",
  69         ) if $var ne '';
  70         push @list, (
  71             "${lan}_${scr}_${reg}",
  72             "${lan}_${scr}",
  73             "${lan}_${reg}",
  74              ${lan},
  75         );
  76         for my $loc (@list) {
  77             $LocaleFile{$loc} and return $loc;
  78         }
  79     }
  80     return 'default';
  81 }
  82
  83 sub getlocale {
  84     return shift->{accepted_locale};
  85 }
  86
  87 sub locale_version {
  88     return shift->{locale_version};
  89 }
  90
  91 sub _fetchpl {
  92     my $accepted = shift;
  93     my $f = $LocaleFile{$accepted};
  94     return if !$f;
  95     $f .= $PL_EXT;
  96
  97     # allow to search @INC
  98 #   use File::Spec;
  99 #   my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f);
 100     my $path = "Unicode/Collate/Locale/$f";
 101     my $h = do $path;
 102     croak "Unicode/Collate/Locale/$f can't be found" if !$h;
 103     return $h;
 104 }
 105
 106 sub new {
 107     my $class = shift;
 108     my %hash = @_;
 109     $hash{accepted_locale} = _locale($hash{locale});
 110
 111     if (exists $hash{table}) {
 112         croak "your table can't be used with Unicode::Collate::Locale";
 113     }
 114
 115     my $href = _fetchpl($hash{accepted_locale});
 116     while (my($k,$v) = each %$href) {
 117         if (!exists $hash{$k}) {
 118             $hash{$k} = $v;
 119         } elsif ($k eq 'entry') {
 120             $hash{$k} = $v.$hash{$k};
 121         } else {
 122             croak "$k is reserved by $hash{locale}, can't be overwritten";
 123         }
 124     }
 125     return $class->SUPER::new(%hash);
 126 }
 127
 128 1;
 129 __END__
 130
 131 =head1 NAME
 132
 133 Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
 134
 135 =head1 SYNOPSIS
 136
 137   use Unicode::Collate::Locale;
 138
 139   #construct
 140   $Collator = Unicode::Collate::Locale->
 141       new(locale => $locale_name, %tailoring);
 142
 143   #sort
 144   @sorted = $Collator->sort(@not_sorted);
 145
 146   #compare
 147   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
 148
 149 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
 150 according to Perl's Unicode support. See L<perlunicode>,
 151 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
  152 Otherwise, either use C<preprocess> (cf. C<Unicode::Collate>)
  153 or decode them beforehand.
 154
 155 =head1 DESCRIPTION
 156
  157 This module provides linguistic tailoring of collation,
  158 taking advantage of C<Unicode::Collate>.
 159
 160 =head2 Constructor
 161
 162 The C<new> method returns a collator object.
 163
 164 A parameter list for the constructor is a hash, which can include
 165 a special key C<locale> and its value (case-insensitive) standing
 166 for a Unicode base language code (two or three-letter).
 167 For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'ES')>
 168 returns a collator tailored for Spanish.
 169
 170 C<$locale_name> may be suffixed with a Unicode script code (four-letter),
 171 a Unicode region (territory) code, a Unicode language variant code.
 172 These codes are case-insensitive, and separated with C<'_'> or C<'-'>.
 173 E.g. C<en_US> for English in USA,
 174 C<az_Cyrl> for Azerbaijani in the Cyrillic script,
 175 C<es_ES_traditional> for Spanish in Spain (Traditional).
 176
 177 If C<$locale_name> is not available,
 178 fallback is selected in the following order:
 179
 180     1. language with a variant code
 181     2. language with a script code
 182     3. language with a region code
 183     4. language
 184     5. default
 185
 186 Tailoring tags provided by C<Unicode::Collate> are allowed as long as
 187 they are not used for C<locale> support.  Esp. the C<table> tag
 188 is always untailorable, since it is reserved for DUCET.
 189
 190 However C<entry> is allowed, even if it is used for C<locale> support,
 191 to add or override mappings.
 192
 193 E.g. a collator for Spanish, which ignores diacritics and case difference
 194 (i.e. level 1), with reversed case ordering and no normalization.
 195
 196     Unicode::Collate::Locale->new(
 197         level => 1,
 198         locale => 'es',
 199         upper_before_lower => 1,
 200         normalization => undef
 201     )
 202
 203 Overriding a behavior already tailored by C<locale> is disallowed
 204 if such a tailoring is passed to C<new()>.
 205
 206     Unicode::Collate::Locale->new(
 207         locale => 'da',
 208         upper_before_lower => 0, # causes error as reserved by 'da'
 209     )
 210
 211 However C<change()> inherited from C<Unicode::Collate> allows
 212 such a tailoring that is reserved by C<locale>. Examples:
 213
 214     new(locale => 'fr_ca')->change(backwards => undef)
 215     new(locale => 'da')->change(upper_before_lower => 0)
 216     new(locale => 'ja')->change(overrideCJK => undef)
 217
 218 =head2 Methods
 219
 220 C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
 221 and methods other than C<new> are inherited from C<Unicode::Collate>.
 222
 223 Here is a list of additional methods:
 224
 225 =over 4
 226
 227 =item C<$Collator-E<gt>getlocale>
 228
  229 Returns the language code accepted and actually used for collation.
  230 If linguistic tailoring is not provided for a language code you passed
  231 (intentionally for some languages, or due to the incomplete implementation),
  232 this method returns the string C<'default'>, meaning no special tailoring.
 233
 234 =item C<$Collator-E<gt>locale_version>
 235
 236 (Since Unicode::Collate::Locale 0.87)
  237 Returns the version number (perhaps C</\d\.\d\d/>) of the locale, i.e. that
  238 of the F<Locale/*.pl> file in use.
 239
 240 B<Note:> F<Locale/*.pl> that a collator uses should be identified by
 241 a combination of return values from C<getlocale> and C<locale_version>.
 242
 243 =back
 244
 245 =head2 A list of tailorable locales
 246
 247       locale name       description
 248     --------------------------------------------------------------
 249       af                Afrikaans
 250       ar                Arabic
 251       as                Assamese
 252       az                Azerbaijani (Azeri)
 253       be                Belarusian
 254       bn                Bengali
 255       bs                Bosnian (tailored as Croatian)
 256       bs_Cyrl           Bosnian in Cyrillic (tailored as Serbian)
 257       ca                Catalan
 258       cs                Czech
 259       cy                Welsh
 260       da                Danish
 261       de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
 262       de_AT_phonebook   Austrian German (umlaut primary greater)
 263       dsb               Lower Sorbian
 264       ee                Ewe
 265       eo                Esperanto
 266       es                Spanish
 267       es__traditional   Spanish ('ch' and 'll' as a grapheme)
 268       et                Estonian
 269       fa                Persian
 270       fi                Finnish (v and w are primary equal)
 271       fi__phonebook     Finnish (v and w as separate characters)
 272       fil               Filipino
 273       fo                Faroese
 274       fr_CA             Canadian French
 275       gu                Gujarati
 276       ha                Hausa
 277       haw               Hawaiian
 278       he                Hebrew
 279       hi                Hindi
 280       hr                Croatian
 281       hu                Hungarian
 282       hy                Armenian
 283       ig                Igbo
 284       is                Icelandic
 285       ja                Japanese [1]
 286       kk                Kazakh
 287       kl                Kalaallisut
 288       kn                Kannada
 289       ko                Korean [2]
 290       kok               Konkani
 291       lkt               Lakota
 292       ln                Lingala
 293       lt                Lithuanian
 294       lv                Latvian
 295       mk                Macedonian
 296       ml                Malayalam
 297       mr                Marathi
 298       mt                Maltese
 299       nb                Norwegian Bokmal
 300       nn                Norwegian Nynorsk
 301       nso               Northern Sotho
 302       om                Oromo
 303       or                Oriya
 304       pa                Punjabi
 305       pl                Polish
 306       ro                Romanian
 307       sa                Sanskrit
 308       se                Northern Sami
 309       si                Sinhala
 310       si__dictionary    Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
 311       sk                Slovak
 312       sl                Slovenian
 313       sq                Albanian
 314       sr                Serbian
 315       sr_Latn           Serbian in Latin (tailored as Croatian)
 316       sv                Swedish (v and w are primary equal)
 317       sv__reformed      Swedish (v and w as separate characters)
 318       ta                Tamil
 319       te                Telugu
 320       th                Thai
 321       tn                Tswana
 322       to                Tonga
 323       tr                Turkish
 324       ug_Cyrl           Uyghur in Cyrillic
 325       uk                Ukrainian
 326       ur                Urdu
 327       vi                Vietnamese
 328       vo                Volapu"k
 329       wae               Walser
 330       wo                Wolof
 331       yo                Yoruba
 332       zh                Chinese
 333       zh__big5han       Chinese (ideographs: big5 order)
 334       zh__gb2312han     Chinese (ideographs: GB-2312 order)
 335       zh__pinyin        Chinese (ideographs: pinyin order) [3]
 336       zh__stroke        Chinese (ideographs: stroke order) [3]
 337       zh__zhuyin        Chinese (ideographs: zhuyin order) [3]
 338     --------------------------------------------------------------
 339
 340 Locales according to the default UCA rules include
 341 am (Amharic) without C<[reorder Ethi]>,
 342 bg (Bulgarian) without C<[reorder Cyrl]>,
 343 chr (Cherokee) without C<[reorder Cher]>,
 344 de (German),
 345 en (English),
 346 fr (French),
 347 ga (Irish),
 348 id (Indonesian),
 349 it (Italian),
 350 ka (Georgian) without C<[reorder Geor]>,
 351 mn (Mongolian) without C<[reorder Cyrl Mong]>,
 352 ms (Malay),
 353 nl (Dutch),
 354 pt (Portuguese),
 355 ru (Russian) without C<[reorder Cyrl]>,
 356 sw (Swahili),
 357 zu (Zulu).
 358
 359 B<Note>
 360
 361 [1] ja: Ideographs are sorted in JIS X 0208 order.
 362 Fullwidth and halfwidth forms are identical to their regular form.
 363 The difference between hiragana and katakana is at the 4th level,
 364 the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
 365 and then C<katakana_before_hiragana> has no effect.
 366
 367 [2] ko: Plenty of ideographs are sorted by their reading. Such
 368 an ideograph is primary (level 1) equal to, and secondary (level 2)
 369 greater than, the corresponding hangul syllable.
 370
 371 [3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
 372 where a smaller number of ideographs are tailored.
 373
 374 =head2 A list of variant codes and their aliases
 375
 376       variant code       alias
 377     ------------------------------------------
 378       dictionary         dict
 379       phonebook          phone     phonebk
 380       reformed           reform
 381       traditional        trad
 382     ------------------------------------------
 383       big5han            big5
 384       gb2312han          gb2312
 385       pinyin
 386       stroke
 387       zhuyin
 388     ------------------------------------------
 389
 390 Note: 'pinyin' is Han in Latin, 'zhuyin' is Han in Bopomofo.
 391
 392 =head1 INSTALL
 393
 394 Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
  395 F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
 396 On building, C<Unicode::Collate::Locale> doesn't require
 397 any of F<data/*.txt>, F<gendata/*>, and F<mklocale>.
 398 Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
 399
 400 =head1 CAVEAT
 401
 402 =over 4
 403
 404 =item Tailoring is not maximum
 405
  406 Even if a certain letter is tailored, its equivalents are not always
  407 tailored along with it. For example, even though W is tailored,
 408 fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
 409 tailored. The result may depend on whether source strings are
 410 normalized or not, and whether decomposed or composed.
 411 Thus C<(normalization =E<gt> undef)> is less preferred.
 412
 413 =item Collation reordering is not supported
 414
 415 The order of any groups including scripts is not changed.
 416
 417 =back
 418
 419 =head2 Reference
 420
  421       locale            based on CLDR or other reference
 422     --------------------------------------------------------------------
 423       af                30 = 1.8.1
 424       ar                30 = 28 ("compat" wo [reorder Arab]) = 1.9.0
 425       as                30 = 28 (without [reorder Beng..]) = 23
 426       az                30 = 24 ("standard" wo [reorder Latn Cyrl])
 427       be                30 = 28 (without [reorder Cyrl])
 428       bn                30 = 28 ("standard" wo [reorder Beng..]) = 2.0.1
 429       bs                30 = 28 (type="standard": [import hr])
 430       bs_Cyrl           30 = 28 (type="standard": [import sr])
 431       ca                30 = 23 (alt="proposed" type="standard")
 432       cs                30 = 1.8.1 (type="standard")
 433       cy                30 = 1.8.1
 434       da                22.1 = 1.8.1 (type="standard")
 435       de__phonebook     30 = 2.0 (type="phonebook")
 436       de_AT_phonebook   30 = 27 (type="phonebook")
 437       dsb               30 = 26
 438       ee                30 = 21
 439       eo                30 = 1.8.1
 440       es                30 = 1.9.0 (type="standard")
 441       es__traditional   30 = 1.8.1 (type="traditional")
 442       et                30 = 26
 443       fa                22.1 = 1.8.1
 444       fi                22.1 = 1.8.1 (type="standard" alt="proposed")
 445       fi__phonebook     22.1 = 1.8.1 (type="phonebook")
 446       fil               30 = 1.9.0 (type="standard") = 1.8.1
 447       fo                22.1 = 1.8.1 (alt="proposed" type="standard")
 448       fr_CA             30 = 1.9.0
 449       gu                30 = 28 ("standard" wo [reorder Gujr..]) = 1.9.0
 450       ha                30 = 1.9.0
 451       haw               30 = 24
 452       he                30 = 28 (without [reorder Hebr]) = 23
 453       hi                30 = 28 (without [reorder Deva..]) = 1.9.0
 454       hr                30 = 28 ("standard" wo [reorder Latn Cyrl]) = 1.9.0
 455       hu                22.1 = 1.8.1 (alt="proposed" type="standard")
 456       hy                30 = 28 (without [reorder Armn]) = 1.8.1
 457       ig                30 = 1.8.1
 458       is                22.1 = 1.8.1 (type="standard")
 459       ja                22.1 = 1.8.1 (type="standard")
 460       kk                30 = 28 (without [reorder Cyrl])
 461       kl                22.1 = 1.8.1 (type="standard")
 462       kn                30 = 28 ("standard" wo [reorder Knda..]) = 1.9.0
 463       ko                22.1 = 1.8.1 (type="standard")
 464       kok               30 = 28 (without [reorder Deva..]) = 1.8.1
 465       lkt               30 = 25
 466       ln                30 = 2.0 (type="standard") = 1.8.1
 467       lt                22.1 = 1.9.0
 468       lv                22.1 = 1.9.0 (type="standard") = 1.8.1
 469       mk                30 = 28 (without [reorder Cyrl])
 470       ml                22.1 = 1.9.0
 471       mr                30 = 28 (without [reorder Deva..]) = 1.8.1
 472       mt                22.1 = 1.9.0
 473       nb                22.1 = 2.0   (type="standard")
 474       nn                22.1 = 2.0   (type="standard")
 475       nso           [*] 26 = 1.8.1
 476       om                22.1 = 1.8.1
 477       or                30 = 28 (without [reorder Orya..]) = 1.9.0
 478       pa                22.1 = 1.8.1
 479       pl                30 = 1.8.1
 480       ro                30 = 1.9.0 (type="standard")
 481       sa            [*] 1.9.1 = 1.8.1 (type="standard" alt="proposed")
 482       se                22.1 = 1.8.1 (type="standard")
 483       si                30 = 28 ("standard" wo [reorder Sinh..]) = 1.9.0
 484       si__dictionary    30 = 28 ("dictionary" wo [reorder Sinh..]) = 1.9.0
 485       sk                22.1 = 1.9.0 (type="standard")
 486       sl                22.1 = 1.8.1 (type="standard" alt="proposed")
 487       sq                22.1 = 1.8.1 (alt="proposed" type="standard")
 488       sr                30 = 28 (without [reorder Cyrl])
 489       sr_Latn           30 = 28 (type="standard": [import hr])
 490       sv                22.1 = 1.9.0 (type="standard")
 491       sv__reformed      22.1 = 1.8.1 (type="reformed")
 492       ta                22.1 = 1.9.0
 493       te                30 = 28 (without [reorder Telu..]) = 1.9.0
 494       th                22.1 = 22
 495       tn            [*] 26 = 1.8.1
 496       to                22.1 = 22
 497       tr                22.1 = 1.8.1 (type="standard")
 498       uk                30 = 28 (without [reorder Cyrl])
 499       ug_Cyrl           https://en.wikipedia.org/wiki/Uyghur_Cyrillic_alphabet
 500       ur                22.1 = 1.9.0
 501       vi                22.1 = 1.8.1
 502       vo                30 = 25
 503       wae               30 = 2.0
 504       wo            [*] 1.9.1 = 1.8.1
 505       yo                30 = 1.8.1
 506       zh                22.1 = 1.8.1 (type="standard")
 507       zh__big5han       22.1 = 1.8.1 (type="big5han")
 508       zh__gb2312han     22.1 = 1.8.1 (type="gb2312han")
 509       zh__pinyin        22.1 = 2.0   (type='pinyin' alt='short')
 510       zh__stroke        22.1 = 1.9.1 (type='stroke' alt='short')
 511       zh__zhuyin        22.1 = 22    (type='zhuyin' alt='short')
 512     --------------------------------------------------------------------
 513
 514 [*] http://www.unicode.org/repos/cldr/tags/latest/seed/collation/
 515
 516 =head1 AUTHOR
 517
 518 The Unicode::Collate::Locale module for perl was written
 519 by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
 520 This module is Copyright(C) 2004-2017, SADAHIRO Tomoyuki. Japan.
 521 All rights reserved.
 522
 523 This module is free software; you can redistribute it and/or
 524 modify it under the same terms as Perl itself.
 525
 526 =head1 SEE ALSO
 527
 528 =over 4
 529
 530 =item Unicode Collation Algorithm - UTS #10
 531
 532 L<http://www.unicode.org/reports/tr10/>
 533
 534 =item The Default Unicode Collation Element Table (DUCET)
 535
 536 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
 537
 538 =item Unicode Locale Data Markup Language (LDML) - UTS #35
 539
 540 L<http://www.unicode.org/reports/tr35/>
 541
 542 =item CLDR - Unicode Common Locale Data Repository
 543
 544 L<http://cldr.unicode.org/>
 545
 546 =item L<Unicode::Collate>
 547
 548 =item L<Unicode::Normalize>
 549
 550 =back
 551
 552 =cut