1 package Unicode::Collate::Locale;
5 use base qw(Unicode::Collate);
11 my %LocaleFile = map { ($_, $_) } qw(
12 af ar as az be bg bn ca cs cy da eo es et fa fi fil fo fr
13 gu ha haw hi hr hu hy ig is ja kk kl kn ko kok ln lt lv
14 mk ml mr mt nb nn nso om or pa pl ro ru sa se si sk sl sq
15 sr sv ta te th tn to tr uk ur vi wae wo yo zh
17 $LocaleFile{'default'} = '';
19 $LocaleFile{'bs'} = 'hr';
20 $LocaleFile{'sr_Latn'} = 'hr';
22 $LocaleFile{'de__phonebook'} = 'de_phone';
23 $LocaleFile{'es__traditional'} = 'es_trad';
24 $LocaleFile{'fi__phonebook'} = 'fi_phone';
25 $LocaleFile{'si__dictionary'} = 'si_dict';
26 $LocaleFile{'sv__reformed'} = 'sv_refo';
27 $LocaleFile{'zh__big5han'} = 'zh_big5';
28 $LocaleFile{'zh__gb2312han'} = 'zh_gb';
29 $LocaleFile{'zh__pinyin'} = 'zh_pin';
30 $LocaleFile{'zh__stroke'} = 'zh_strk';
31 $LocaleFile{'zh__zhuyin'} = 'zh_zhu';
47 $locale =~ tr/\-\ \./_/;
48 $locale =~ s/_([0-9a-z]+)\z/$TypeAlias{$1} ?
49 "_$TypeAlias{$1}" : "_$1"/e;
50 $LocaleFile{$locale} and return $locale;
52 my @code = split /_/, $locale;
53 my $lan = shift @code;
54 my $scr = @code && length $code[0] == 4 ? ucfirst shift @code : '';
55 my $reg = @code && length $code[0] < 4 ? uc shift @code : '';
56 my $var = @code ? shift @code : '';
60 "${lan}_${scr}_${reg}_$var",
61 "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
62 "${lan}_${reg}_$var", # empty $reg may be ${lan}__$var.
66 "${lan}_${scr}_${reg}",
72 $LocaleFile{$loc} and return $loc;
79 return shift->{accepted_locale};
83 return shift->{locale_version};
88 my $f = $LocaleFile{$accepted};
92 # allow to search @INC
94 # my $path = File::Spec->catfile('Unicode', 'Collate', 'Locale', $f);
95 my $path = "Unicode/Collate/Locale/$f";
97 croak "Unicode/Collate/Locale/$f can't be found" if !$h;
104 $hash{accepted_locale} = _locale($hash{locale});
106 if (exists $hash{table}) {
107 croak "your table can't be used with Unicode::Collate::Locale";
110 my $href = _fetchpl($hash{accepted_locale});
111 while (my($k,$v) = each %$href) {
112 if (exists $hash{$k}) {
113 croak "$k is reserved by $hash{locale}, can't be overwritten";
117 return $class->SUPER::new(%hash);
123 MEMORANDA for developing
126 ----------------------------------------------------------------------------
130 az 2.0 = 1.8.1 (type="standard")
133 bn 2.0.1 (type="standard")
134 bs 2.0 (alias source="hr")
135 ca 2.0 = 1.8.1 (alt="proposed" type="standard")
136 cs 2.0 = 1.8.1 (type="standard")
138 da 2.0 = 1.8.1 (type="standard") [modify aA to pass CLDR tests]
139 de__phonebook 2.0 (type="phonebook")
141 es 2.0 (type="standard")
142 es__traditional 2.0 = 1.8.1 (type="traditional")
145 fi 2.0 = 1.8.1 (type="standard" alt="proposed")
146 fi__phonebook 2.0 = 1.8.1 (type="phonebook")
147 fil 2.0 (type="standard") = 1.8.1
148 fo 2.0 = 1.8.1 (alt="proposed" type="standard")
149 fr 2.0 (fr_CA, backwards="on")
150 gu 2.0 (type="standard")
153 hi 2.0 (type="standard")
154 hr 2.0 (type="standard")
155 hu 2.0 = 1.8.1 (alt="proposed" type="standard")
158 is 2.0 = 1.8.1 (type="standard")
159 ja 22.1 = 2.0 = 1.8.1 (type="standard")
161 kl 2.0 = 1.8.1 (type="standard")
162 kn 2.0 (type="standard")
163 ko 22.1 = 2.0 = 1.8.1 (type="standard")
165 ln 2.0 (type="standard") = 1.8.1
167 lv 2.0 (type="standard") = 1.8.1
172 nb 2.0 (type="standard")
173 nn 2.0 (type="standard")
179 ro 2.0 (type="standard")
181 sa 1.8.1 (type="standard" alt="proposed") [currently in /seed]
182 se 2.0 = 1.8.1 (type="standard")
183 si 2.0 (type="standard")
184 si__dictionary 2.0 (type="dictionary")
185 sk 2.0 (type="standard")
186 sl 2.0 = 1.8.1 (type="standard" alt="proposed")
187 sq 2.0 = 1.8.1 (alt="proposed" type="standard")
188 sr 2.0 (type="standard")
189 sr_Latn 2.0 = 1.8.1 (alias source="hr")
190 sv 2.0 (type="standard")
191 sv__reformed 2.0 = 1.8.1 (type="reformed")
194 th 2.0 (type="standard")
196 to 2.0 = 1.8.1 (type="standard" alt="proposed")
197 tr 2.0 = 1.8.1 (type="standard")
202 wo 1.8.1 [currently in /seed]
204 zh 22.1 = 2.0 = 1.8.1 (type="standard")
205 zh__big5han 22.1 = 2.0 = 1.8.1 (type="big5han")
206 zh__gb2312han 22.1 = 2.0 = 1.8.1 (type="gb2312han")
207 zh__pinyin 22.1 = 2.0 (type='pinyin' alt='short')
208 zh__stroke 22.1 = 2.0 = 1.9.1 (type='stroke' alt='short')
209 zh__zhuyin 22.1 = 22 (type='zhuyin' alt='short')
210 ----------------------------------------------------------------------------
214 Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
218 use Unicode::Collate::Locale;
221 $Collator = Unicode::Collate::Locale->
222 new(locale => $locale_name, %tailoring);
225 @sorted = $Collator->sort(@not_sorted);
228 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
230 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
231 according to Perl's Unicode support. See L<perlunicode>,
232 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
233 Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>),
234 or you should decode them beforehand.
238 This module provides linguistic tailoring for it
239 taking advantage of C<Unicode::Collate>.
243 The C<new> method returns a collator object.
245 A parameter list for the constructor is a hash, which can include
246 a special key C<locale> and its value (case-insensitive) standing
247 for a Unicode base language code (two or three-letter).
248 For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
249 returns a collator tailored for French.
251 C<$locale_name> may be suffixed with a Unicode script code (four-letter),
252 a Unicode region code, or a Unicode language variant code. These codes are
253 case-insensitive, and separated with C<'_'> or C<'-'>.
254 E.g. C<en_US> for English in USA,
255 C<az_Cyrl> for Azerbaijani in the Cyrillic script,
256 C<es_ES_traditional> for Spanish in Spain (Traditional).
258 If C<$locale_name> is not available,
259 fallback is selected in the following order:
261 1. language with a variant code
262 2. language with a script code
263 3. language with a region code
267 Tailoring tags provided by C<Unicode::Collate> are allowed as long as
268 they are not used for C<locale> support. Esp. the C<table> tag
269 is always untailorable, since it is reserved for DUCET.
271 E.g. a collator for French, which ignores diacritics and case difference
272 (i.e. level 1), with reversed case ordering and no normalization.
274 Unicode::Collate::Locale->new(
277 upper_before_lower => 1,
278 normalization => undef
281 Overriding a behavior already tailored by C<locale> is disallowed
282 if such a tailoring is passed to C<new()>.
284 Unicode::Collate::Locale->new(
286 upper_before_lower => 0, # causes error as reserved by 'da'
289 However, C<change()>, inherited from C<Unicode::Collate>, allows overriding
290 a tailoring that is reserved by C<locale>. Examples:
292 new(locale => 'ca')->change(backwards => undef)
293 new(locale => 'da')->change(upper_before_lower => 0)
294 new(locale => 'ja')->change(overrideCJK => undef)
298 C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
299 and methods other than C<new> are inherited from C<Unicode::Collate>.
301 Here is a list of additional methods:
305 =item C<$Collator-E<gt>getlocale>
307 Returns the language code that was accepted and is actually used for collation.
308 If linguistic tailoring is not provided for the language code you passed
309 (intentionally for some languages, or due to the incomplete implementation),
310 this method returns the string C<'default'>, meaning no special tailoring.
312 =item C<$Collator-E<gt>locale_version>
314 (Since Unicode::Collate::Locale 0.87)
315 Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
318 B<Note:> F<Locale/*.pl> that a collator uses should be identified by
319 a combination of return values from C<getlocale> and C<locale_version>.
323 =head2 A list of tailorable locales
325 locale name description
326 --------------------------------------------------------------
330 az Azerbaijani (Azeri)
339 de__phonebook German (umlaut as 'ae', 'oe', 'ue')
342 es__traditional Spanish ('ch' and 'll' as a grapheme)
345 fi Finnish (v and w are primary equal)
346 fi__phonebook Finnish (v and w as separate characters)
384 si__dictionary Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
389 sr_Latn Serbian in Latin (tailored as Croatian)
390 sv Swedish (v and w are primary equal)
391 sv__reformed Swedish (v and w as separate characters)
405 zh__big5han Chinese (ideographs: big5 order)
406 zh__gb2312han Chinese (ideographs: GB-2312 order)
407 zh__pinyin Chinese (ideographs: pinyin order) [3]
408 zh__stroke Chinese (ideographs: stroke order) [3]
409 zh__zhuyin Chinese (ideographs: zhuyin order) [3]
410 --------------------------------------------------------------
412 Locales according to the default UCA rules include
430 [1] ja: Ideographs are sorted in JIS X 0208 order.
431 Fullwidth and halfwidth forms are identical to their normal form.
432 The difference between hiragana and katakana is at the 4th level,
433 the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
434 and then C<katakana_before_hiragana> has no effect.
436 [2] ko: Plenty of ideographs are sorted by their reading. Such
437 an ideograph is primary (level 1) equal to, and secondary (level 2)
438 greater than, the corresponding hangul syllable.
440 [3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
441 where a smaller number of ideographs are tailored.
443 Note: 'pinyin' is in Latin, 'zhuyin' is in Bopomofo.
447 Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
448 F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
449 On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
450 F<gendata/*>, and F<mklocale>.
451 Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
457 =item tailoring is not maximum
459 Even if a certain letter is tailored, its equivalents would not always
460 be tailored as well. For example, even though W is tailored,
461 fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
462 tailored. The result may depend on whether source strings are
463 normalized or not, and whether decomposed or composed.
464 Thus C<(normalization =E<gt> undef)> is less preferred.
470 The Unicode::Collate::Locale module for perl was written
471 by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
472 This module is Copyright(C) 2004-2012, SADAHIRO Tomoyuki. Japan.
475 This module is free software; you can redistribute it and/or
476 modify it under the same terms as Perl itself.
482 =item Unicode Collation Algorithm - UTS #10
484 L<http://www.unicode.org/reports/tr10/>
486 =item The Default Unicode Collation Element Table (DUCET)
488 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
490 =item Unicode Locale Data Markup Language (LDML) - UTS #35
492 L<http://www.unicode.org/reports/tr35/>
494 =item CLDR - Unicode Common Locale Data Repository
496 L<http://cldr.unicode.org/>
498 =item L<Unicode::Collate>
500 =item L<Unicode::Normalize>