1 package Unicode::Collate::Locale;
6 use base qw(Unicode::Collate);
# %LocaleFile maps an accepted locale name to the name of its tailoring
# file, which is looked up under Unicode/Collate/Locale/.
# For the locales listed here the file name is the locale name itself.
12 my %LocaleFile = map { ($_, $_) } qw(
13 af ar as az be bn ca cs cy da dsb ee eo es et fa fi fil fo gu
14 ha haw he hi hr hu hy ig is ja kk kl kn ko kok lkt ln lt lv
15 mk ml mr mt nb nn nso om or pa pl ro sa se si sk sl sq sr sv
16 ta te th tn to tr uk ur vi vo wae wo yo zh
18 $LocaleFile{'default'} = ''; # empty file name; presumably no tailoring file is loaded -- confirm in the loader
# Locales that reuse the tailoring file of another locale.
20 $LocaleFile{'bs'} = 'hr';
21 $LocaleFile{'bs_Cyrl'} = 'sr';
22 $LocaleFile{'sr_Latn'} = 'hr';
# Locales whose file name differs from the locale name
# (lower-cased and/or with an abbreviated variant part).
24 $LocaleFile{'de__phonebook'} = 'de_phone';
25 $LocaleFile{'de_AT_phonebook'} = 'de_at_ph';
26 $LocaleFile{'es__traditional'} = 'es_trad';
27 $LocaleFile{'fr_CA'} = 'fr_ca';
28 $LocaleFile{'fi__phonebook'} = 'fi_phone';
29 $LocaleFile{'si__dictionary'} = 'si_dict';
30 $LocaleFile{'sv__reformed'} = 'sv_refo';
31 $LocaleFile{'ug_Cyrl'} = 'ug_cyrl';
32 $LocaleFile{'zh__big5han'} = 'zh_big5';
33 $LocaleFile{'zh__gb2312han'} = 'zh_gb';
34 $LocaleFile{'zh__pinyin'} = 'zh_pin';
35 $LocaleFile{'zh__stroke'} = 'zh_strk';
36 $LocaleFile{'zh__zhuyin'} = 'zh_zhu';
52 $locale =~ tr/\-\ \./_/;
53 $locale =~ s/_([0-9a-z]+)\z/$TypeAlias{$1} ?
54 "_$TypeAlias{$1}" : "_$1"/e;
55 $LocaleFile{$locale} and return $locale;
57 my @code = split /_/, $locale;
58 my $lan = shift @code;
59 my $scr = @code && length $code[0] == 4 ? ucfirst shift @code : '';
60 my $reg = @code && length $code[0] < 4 ? uc shift @code : '';
61 my $var = @code ? shift @code : '';
65 "${lan}_${scr}_${reg}_$var",
66 "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
67 "${lan}_${reg}_$var", # empty $reg may be ${lan}__$var.
71 "${lan}_${scr}_${reg}",
77 $LocaleFile{$loc} and return $loc;
84 return shift->{accepted_locale};
88 return shift->{locale_version};
93 my $f = $LocaleFile{$accepted};
97 # allow to search @INC
99 # my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f);
100 my $path = "Unicode/Collate/Locale/$f";
102 croak "Unicode/Collate/Locale/$f can't be found" if !$h;
109 $hash{accepted_locale} = _locale($hash{locale});
111 if (exists $hash{table}) {
112 croak "your table can't be used with Unicode::Collate::Locale";
115 my $href = _fetchpl($hash{accepted_locale});
116 while (my($k,$v) = each %$href) {
117 if (!exists $hash{$k}) {
119 } elsif ($k eq 'entry') {
120 $hash{$k} = $v.$hash{$k};
122 croak "$k is reserved by $hash{locale}, can't be overwritten";
125 return $class->SUPER::new(%hash);
133 Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
137 use Unicode::Collate::Locale;
140 $Collator = Unicode::Collate::Locale->
141 new(locale => $locale_name, %tailoring);
144 @sorted = $Collator->sort(@not_sorted);
147 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
149 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
150 according to Perl's Unicode support. See L<perlunicode>,
151 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
152 Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>),
153 or you should decode them beforehand.
157 This module provides linguistic tailoring for it
158 taking advantage of C<Unicode::Collate>.
162 The C<new> method returns a collator object.
164 A parameter list for the constructor is a hash, which can include
165 a special key C<locale> and its value (case-insensitive) standing
166 for a Unicode base language code (two or three-letter).
167 For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'ES')>
168 returns a collator tailored for Spanish.
170 C<$locale_name> may be suffixed with a Unicode script code (four-letter),
171 a Unicode region (territory) code, and/or a Unicode language variant code.
172 These codes are case-insensitive, and separated with C<'_'> or C<'-'>.
173 E.g. C<en_US> for English in USA,
174 C<az_Cyrl> for Azerbaijani in the Cyrillic script,
175 C<es_ES_traditional> for Spanish in Spain (Traditional).
177 If C<$locale_name> is not available,
178 fallback is selected in the following order:
180 1. language with a variant code
181 2. language with a script code
182 3. language with a region code
186 Tailoring tags provided by C<Unicode::Collate> are allowed as long as
187 they are not used for C<locale> support. Esp. the C<table> tag
188 is always untailorable, since it is reserved for DUCET.
190 However C<entry> is allowed, even if it is used for C<locale> support,
191 to add or override mappings.
193 E.g. a collator for Spanish, which ignores diacritics and case difference
194 (i.e. level 1), with reversed case ordering and no normalization.
196 Unicode::Collate::Locale->new(
199 upper_before_lower => 1,
200 normalization => undef
203 Overriding a behavior already tailored by C<locale> is disallowed
204 if such a tailoring is passed to C<new()>.
206 Unicode::Collate::Locale->new(
208 upper_before_lower => 0, # causes error as reserved by 'da'
211 However, C<change()>, inherited from C<Unicode::Collate>, does allow
212 overriding a tailoring that is reserved by C<locale>. Examples:
214 new(locale => 'fr_ca')->change(backwards => undef)
215 new(locale => 'da')->change(upper_before_lower => 0)
216 new(locale => 'ja')->change(overrideCJK => undef)
220 C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
221 and methods other than C<new> are inherited from C<Unicode::Collate>.
223 Here is a list of additional methods:
227 =item C<$Collator-E<gt>getlocale>
229 Returns the language code that was accepted and is actually used for collation.
230 If linguistic tailoring is not provided for a language code you passed
231 (intentionally for some languages, or due to the incomplete implementation),
232 this method returns a string C<'default'> meaning no special tailoring.
234 =item C<$Collator-E<gt>locale_version>
236 (Since Unicode::Collate::Locale 0.87)
237 Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
240 B<Note:> F<Locale/*.pl> that a collator uses should be identified by
241 a combination of return values from C<getlocale> and C<locale_version>.
245 =head2 A list of tailorable locales
247 locale name description
248 --------------------------------------------------------------
252 az Azerbaijani (Azeri)
255 bs Bosnian (tailored as Croatian)
256 bs_Cyrl Bosnian in Cyrillic (tailored as Serbian)
261 de__phonebook German (umlaut as 'ae', 'oe', 'ue')
262 de_AT_phonebook Austrian German (umlaut primary greater)
267 es__traditional Spanish ('ch' and 'll' as a grapheme)
270 fi Finnish (v and w are primary equal)
271 fi__phonebook Finnish (v and w as separate characters)
274 fr_CA Canadian French
310 si__dictionary Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
315 sr_Latn Serbian in Latin (tailored as Croatian)
316 sv Swedish (v and w are primary equal)
317 sv__reformed Swedish (v and w as separate characters)
324 ug_Cyrl Uyghur in Cyrillic
333 zh__big5han Chinese (ideographs: big5 order)
334 zh__gb2312han Chinese (ideographs: GB-2312 order)
335 zh__pinyin Chinese (ideographs: pinyin order) [3]
336 zh__stroke Chinese (ideographs: stroke order) [3]
337 zh__zhuyin Chinese (ideographs: zhuyin order) [3]
338 --------------------------------------------------------------
340 Locales according to the default UCA rules include
341 am (Amharic) without C<[reorder Ethi]>,
342 bg (Bulgarian) without C<[reorder Cyrl]>,
343 chr (Cherokee) without C<[reorder Cher]>,
350 ka (Georgian) without C<[reorder Geor]>,
351 mn (Mongolian) without C<[reorder Cyrl Mong]>,
355 ru (Russian) without C<[reorder Cyrl]>,
361 [1] ja: Ideographs are sorted in JIS X 0208 order.
362 Fullwidth and halfwidth forms are identical to their regular form.
363 The difference between hiragana and katakana is at the 4th level,
364 the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
365 and then C<katakana_before_hiragana> has no effect.
367 [2] ko: Plenty of ideographs are sorted by their reading. Such
368 an ideograph is primary (level 1) equal to, and secondary (level 2)
369 greater than, the corresponding hangul syllable.
371 [3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
372 where a smaller number of ideographs are tailored.
374 =head2 A list of variant codes and their aliases
377 ------------------------------------------
379 phonebook phone phonebk
382 ------------------------------------------
388 ------------------------------------------
390 Note: 'pinyin' is Han in Latin, 'zhuyin' is Han in Bopomofo.
394 Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
395 F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
396 On building, C<Unicode::Collate::Locale> doesn't require
397 any of F<data/*.txt>, F<gendata/*>, and F<mklocale>.
398 Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
404 =item Tailoring is not maximum
406 Even if a certain letter is tailored, its equivalents are not always
407 tailored along with it. For example, even though W is tailored,
408 fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
409 tailored. The result may depend on whether source strings are
410 normalized or not, and whether decomposed or composed.
411 Thus C<(normalization =E<gt> undef)> is less preferred.
413 =item Collation reordering is not supported
415 The order of any groups including scripts is not changed.
421 locale based CLDR or other reference
422 --------------------------------------------------------------------
424 ar 30 = 28 ("compat" wo [reorder Arab]) = 1.9.0
425 as 30 = 28 (without [reorder Beng..]) = 23
426 az 30 = 24 ("standard" wo [reorder Latn Cyrl])
427 be 30 = 28 (without [reorder Cyrl])
428 bn 30 = 28 ("standard" wo [reorder Beng..]) = 2.0.1
429 bs 30 = 28 (type="standard": [import hr])
430 bs_Cyrl 30 = 28 (type="standard": [import sr])
431 ca 30 = 23 (alt="proposed" type="standard")
432 cs 30 = 1.8.1 (type="standard")
434 da 22.1 = 1.8.1 (type="standard")
435 de__phonebook 30 = 2.0 (type="phonebook")
436 de_AT_phonebook 30 = 27 (type="phonebook")
440 es 30 = 1.9.0 (type="standard")
441 es__traditional 30 = 1.8.1 (type="traditional")
444 fi 22.1 = 1.8.1 (type="standard" alt="proposed")
445 fi__phonebook 22.1 = 1.8.1 (type="phonebook")
446 fil 30 = 1.9.0 (type="standard") = 1.8.1
447 fo 22.1 = 1.8.1 (alt="proposed" type="standard")
449 gu 30 = 28 ("standard" wo [reorder Gujr..]) = 1.9.0
452 he 30 = 28 (without [reorder Hebr]) = 23
453 hi 30 = 28 (without [reorder Deva..]) = 1.9.0
454 hr 30 = 28 ("standard" wo [reorder Latn Cyrl]) = 1.9.0
455 hu 22.1 = 1.8.1 (alt="proposed" type="standard")
456 hy 30 = 28 (without [reorder Armn]) = 1.8.1
458 is 22.1 = 1.8.1 (type="standard")
459 ja 22.1 = 1.8.1 (type="standard")
460 kk 30 = 28 (without [reorder Cyrl])
461 kl 22.1 = 1.8.1 (type="standard")
462 kn 30 = 28 ("standard" wo [reorder Knda..]) = 1.9.0
463 ko 22.1 = 1.8.1 (type="standard")
464 kok 30 = 28 (without [reorder Deva..]) = 1.8.1
466 ln 30 = 2.0 (type="standard") = 1.8.1
468 lv 22.1 = 1.9.0 (type="standard") = 1.8.1
469 mk 30 = 28 (without [reorder Cyrl])
471 mr 30 = 28 (without [reorder Deva..]) = 1.8.1
473 nb 22.1 = 2.0 (type="standard")
474 nn 22.1 = 2.0 (type="standard")
477 or 30 = 28 (without [reorder Orya..]) = 1.9.0
480 ro 30 = 1.9.0 (type="standard")
481 sa [*] 1.9.1 = 1.8.1 (type="standard" alt="proposed")
482 se 22.1 = 1.8.1 (type="standard")
483 si 30 = 28 ("standard" wo [reorder Sinh..]) = 1.9.0
484 si__dictionary 30 = 28 ("dictionary" wo [reorder Sinh..]) = 1.9.0
485 sk 22.1 = 1.9.0 (type="standard")
486 sl 22.1 = 1.8.1 (type="standard" alt="proposed")
487 sq 22.1 = 1.8.1 (alt="proposed" type="standard")
488 sr 30 = 28 (without [reorder Cyrl])
489 sr_Latn 30 = 28 (type="standard": [import hr])
490 sv 22.1 = 1.9.0 (type="standard")
491 sv__reformed 22.1 = 1.8.1 (type="reformed")
493 te 30 = 28 (without [reorder Telu..]) = 1.9.0
497 tr 22.1 = 1.8.1 (type="standard")
498 uk 30 = 28 (without [reorder Cyrl])
499 ug_Cyrl https://en.wikipedia.org/wiki/Uyghur_Cyrillic_alphabet
506 zh 22.1 = 1.8.1 (type="standard")
507 zh__big5han 22.1 = 1.8.1 (type="big5han")
508 zh__gb2312han 22.1 = 1.8.1 (type="gb2312han")
509 zh__pinyin 22.1 = 2.0 (type='pinyin' alt='short')
510 zh__stroke 22.1 = 1.9.1 (type='stroke' alt='short')
511 zh__zhuyin 22.1 = 22 (type='zhuyin' alt='short')
512 --------------------------------------------------------------------
514 [*] http://www.unicode.org/repos/cldr/tags/latest/seed/collation/
518 The Unicode::Collate::Locale module for perl was written
519 by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
520 This module is Copyright(C) 2004-2017, SADAHIRO Tomoyuki. Japan.
523 This module is free software; you can redistribute it and/or
524 modify it under the same terms as Perl itself.
530 =item Unicode Collation Algorithm - UTS #10
532 L<http://www.unicode.org/reports/tr10/>
534 =item The Default Unicode Collation Element Table (DUCET)
536 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
538 =item Unicode Locale Data Markup Language (LDML) - UTS #35
540 L<http://www.unicode.org/reports/tr35/>
542 =item CLDR - Unicode Common Locale Data Repository
544 L<http://cldr.unicode.org/>
546 =item L<Unicode::Collate>
548 =item L<Unicode::Normalize>