This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Unicode-Collate to CPAN version 0.71
[perl5.git] / cpan / Unicode-Collate / Collate / Locale.pm
CommitLineData
00e00351
CBW
1package Unicode::Collate::Locale;
2
3use strict;
4use Carp;
5use base qw(Unicode::Collate);
6
cac3df65 7our $VERSION = '0.71';
00e00351
CBW
8
9use File::Spec;
10
# Directory that holds the per-locale tailoring files: the path this module
# was loaded from (as recorded in %INC), minus its '.pm' suffix.
my $ModPath = $INC{'Unicode/Collate/Locale.pm'};
$ModPath =~ s/\.pm$//;

# Extension of the tailoring files found under $ModPath.
my $PL_EXT = '.pl';

# Maps an accepted locale name to the base name (without extension) of its
# tailoring file. An empty value means that no file is loaded for the name.
my %LocaleFile = (
    # locales whose tailoring file carries the locale's own name
    (map { ($_ => $_) } qw(
	af ar az ca cs cy da eo es et fi fil fo fr ha haw
	hr hu hy ig is ja kk kl ko lt lv mt nb nn nso om pl ro ru
	se sk sl sq sv sw tn to tr uk vi wo yo zh
    )),

    # no tailoring file at all
    'default'         => '',

    # variants stored under an abbreviated file name
    'de__phonebook'   => 'de_phone',
    'es__traditional' => 'es_trad',

    # locales that share the tailoring file of 'ru'
    'be'              => 'ru',
    'bg'              => 'ru',
    'mk'              => 'ru',
    'sr'              => 'ru',

    # Chinese variants
    'zh__big5han'     => 'zh_big5',
    'zh__gb2312han'   => 'zh_gb',
    'zh__pinyin'      => 'zh_pin',
    'zh__stroke'      => 'zh_strk',
);
00e00351
CBW
30
# _locale($name)
#
# Resolves a user-supplied locale name to a key of %LocaleFile.
#
# The name is lower-cased, the separators '-', ' ' and '.' are turned into
# '_', and the abbreviated variant suffixes '_phone'/'_phonebk', '_trad',
# '_big5' and '_gb2312' are expanded to their full forms. If the normalized
# name is not itself a key with a true value, these fallbacks are tried in
# order:
#
#   language_territory_variant, language_territory, language__variant,
#   language__territory, language
#
# Returns the first candidate that has a true entry in %LocaleFile, or
# 'default' when $name is false or nothing matches.
sub _locale {
    my $locale = shift;
    if ($locale) {
	$locale = lc $locale;
	$locale =~ tr/\-\ \./_/;
	$locale =~ s/_phone(?:bk)?\z/_phonebook/;
	$locale =~ s/_trad\z/_traditional/;
	$locale =~ s/_big5\z/_big5han/;
	$locale =~ s/_gb2312\z/_gb2312han/;
	$LocaleFile{$locale} and return $locale;

	# LIMIT -1 keeps trailing empty fields, so the '__' padding really
	# does yield at least three defined fields. Without it split() drops
	# them, leaving territory/variant undef for names such as 'en' and
	# triggering "uninitialized value" warnings under -w below.
	my ($lang, $terr, $var) = split(/_/, $locale.'__', -1);
	for my $loc ("${lang}_${terr}_$var", "${lang}_$terr",
		     "${lang}__$var", "${lang}__$terr", $lang) {
	    $LocaleFile{$loc} and return $loc;
	}
    }
    return 'default';
}
49
# $Collator->getlocale
#
# Returns the value stored under the object's 'accepted_locale' key.
sub getlocale {
    my $self = shift;
    return $self->{accepted_locale};
}
53
# _fetchpl($accepted)
#
# Loads the tailoring file registered for $accepted in %LocaleFile and
# returns whatever that file evaluates to (new() uses it as a hash
# reference).
#
# Returns nothing (bare return) when the name has no file, i.e. its entry
# in %LocaleFile is false or missing. Croaks when evaluating the file
# yields a false value.
sub _fetchpl {
    my ($accepted) = @_;

    my $basename = $LocaleFile{$accepted}
	or return;

    my $file = $basename . $PL_EXT;
    my $path = File::Spec->catfile($ModPath, $file);

    # "do FILE" returns the value of the last expression in the file
    my $tailoring = do $path;
    $tailoring
	or croak "Unicode/Collate/Locale/$file can't be found";

    return $tailoring;
}
64
00e00351
CBW
# Unicode::Collate::Locale->new(locale => $locale_name, %tailoring)
#
# Constructor. Resolves the 'locale' argument to an accepted locale (kept
# under the 'accepted_locale' key), merges that locale's tailoring into the
# arguments and hands the result to the parent class constructor.
#
# Croaks if the caller passes 'table', or any key that the locale's
# tailoring also sets.
sub new {
    my ($class, %args) = @_;

    croak "your table can't be used with Unicode::Collate::Locale"
	if exists $args{table};

    my $accepted = _locale($args{locale});
    $args{accepted_locale} = $accepted;

    # _fetchpl() returns nothing for a locale without a tailoring file;
    # treat that as an empty set of tailoring entries
    my $tailoring = _fetchpl($accepted) || {};

    for my $key (keys %$tailoring) {
	croak "$key is reserved by $args{locale}, can't be overwritten"
	    if exists $args{$key};
	$args{$key} = $tailoring->{$key};
    }

    return $class->SUPER::new(%args);
}
83
841;
85__END__
86
87=head1 NAME
88
89Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
90
91=head1 SYNOPSIS
92
93 use Unicode::Collate::Locale;
94
539ce3d8 95 #construct
00e00351
CBW
96 $Collator = Unicode::Collate::Locale->
97 new(locale => $locale_name, %tailoring);
98
539ce3d8 99 #sort
00e00351
CBW
100 @sorted = $Collator->sort(@not_sorted);
101
539ce3d8
CBW
102 #compare
103 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
104
105B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
106according to Perl's Unicode support. See L<perlunicode>,
107L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
108Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>)
109or should decode them before.
110
00e00351
CBW
111=head1 DESCRIPTION
112
This module provides linguistic tailoring for collation,
taking advantage of C<Unicode::Collate>.
115
116=head2 Constructor
117
118The C<new> method returns a collator object.
119
120A parameter list for the constructor is a hash, which can include
68adb2b0 121a special key C<locale> and its value (case-insensitive) standing
00e00351
CBW
122for a two-letter language code (ISO-639) like C<'en'> for English.
123For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
124returns a collator tailored for French.
125
126C<$locale_name> may be suffixed with a territory(country)
127code or a variant code, which are separated with C<'_'>.
E.g. C<en_US> for English in USA,
C<es_ES_traditional> for Spanish in Spain (Traditional).
130
If C<$locale_name> is not available,
a fallback is selected in the following order:
133
64dc7822
CBW
134 1. language_territory_variant
135 2. language_territory
136 3. language__variant
137 4. language
138 5. default
00e00351 139
68adb2b0
CBW
140Tailoring tags provided by C<Unicode::Collate> are allowed as long as
141they are not used for C<locale> support. Esp. the C<table> tag
142is always untailorable since it is reserved for DUCET.
00e00351
CBW
143
144E.g. a collator for French, which ignores diacritics and case difference
145(i.e. level 1), with reversed case ordering and no normalization.
146
147 Unicode::Collate::Locale->new(
64dc7822
CBW
148 level => 1,
149 locale => 'fr',
150 upper_before_lower => 1,
151 normalization => undef
00e00351
CBW
152 )
153
68adb2b0
CBW
154Overriding a behavior already tailored by C<locale> is disallowed
155if such a tailoring is passed to C<new()>.
156
157 Unicode::Collate::Locale->new(
158 locale => 'da',
159 upper_before_lower => 0, # causes error as reserved by 'da'
160 )
161
162However C<change()> inherited from C<Unicode::Collate> allows
163such a tailoring that is reserved by C<locale>. Examples:
164
165 new(locale => 'ca')->change(backwards => undef)
166 new(locale => 'da')->change(upper_before_lower => 0)
167 new(locale => 'ja')->change(overrideCJK => undef)
168
00e00351
CBW
169=head2 Methods
170
171C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
172and methods other than C<new> are inherited from C<Unicode::Collate>.
173
174Here is a list of additional methods:
175
176=over 4
177
178=item C<$Collator-E<gt>getlocale>
179
Returns the language code that was accepted and is actually used for collation.
If linguistic tailoring is not provided for a language code you passed
(intentionally for some languages, or due to the incomplete implementation),
this method returns the string C<'default'>, meaning no special tailoring.
184
185=back
186
187=head2 A list of tailorable locales
188
64dc7822
CBW
189 locale name description
190 ----------------------------------------------------------
6484f676 191 af Afrikaans
6709de88 192 ar Arabic
f1a7422f 193 az Azerbaijani (Azeri)
aa7758f7
CBW
194 be Belarusian
195 bg Bulgarian
64dc7822 196 ca Catalan
00e00351 197 cs Czech
6484f676
CBW
198 cy Welsh
199 da Danish
1393fe00 200 de__phonebook German (umlaut as 'ae', 'oe', 'ue')
456a1446 201 eo Esperanto
00e00351
CBW
202 es Spanish
203 es__traditional Spanish ('ch' and 'll' as a grapheme)
64dc7822
CBW
204 et Estonian
205 fi Finnish
f1a7422f 206 fil Filipino
6484f676 207 fo Faroese
00e00351 208 fr French
f1a7422f 209 ha Hausa
6484f676 210 haw Hawaiian
c02ee425 211 hr Croatian
6709de88
CBW
212 hu Hungarian
213 hy Armenian
c02ee425 214 ig Igbo
6484f676 215 is Icelandic
539ce3d8 216 ja Japanese [1]
aa7758f7 217 kk Kazakh
6484f676 218 kl Kalaallisut
584e761d 219 ko Korean [2]
f1a7422f 220 lt Lithuanian
64dc7822 221 lv Latvian
aa7758f7 222 mk Macedonian
f1a7422f 223 mt Maltese
456a1446 224 nb Norwegian Bokmal
00e00351 225 nn Norwegian Nynorsk
1393fe00
CBW
226 nso Northern Sotho
227 om Oromo
00e00351 228 pl Polish
456a1446 229 ro Romanian
aa7758f7 230 ru Russian
6709de88 231 se Northern Sami
64dc7822
CBW
232 sk Slovak
233 sl Slovenian
c02ee425 234 sq Albanian
aa7758f7 235 sr Serbian
456a1446 236 sv Swedish
6484f676 237 sw Swahili
1393fe00 238 tn Tswana
6709de88 239 to Tonga
f1a7422f 240 tr Turkish
6709de88 241 uk Ukrainian
1393fe00 242 vi Vietnamese
f1a7422f
CBW
243 wo Wolof
244 yo Yoruba
028d3bfa
CBW
245 zh Chinese
246 zh__big5han Chinese (ideographs: big5 order)
247 zh__gb2312han Chinese (ideographs: GB-2312 order)
248 zh__pinyin Chinese (ideographs: pinyin order)
249 zh__stroke Chinese (ideographs: stroke order)
aa7758f7
CBW
250 ----------------------------------------------------------
251
539ce3d8 252Locales according to the default UCA rules include
aa7758f7
CBW
253de (German),
254en (English),
255ga (Irish),
256id (Indonesian),
257it (Italian),
258ka (Georgian),
259ln (Lingala),
260ms (Malay),
261nl (Dutch),
262pt (Portuguese),
263st (Southern Sotho),
264xh (Xhosa),
265zu (Zulu).
f1a7422f 266
539ce3d8
CBW
267B<Note>
268
269[1] ja: Ideographs are sorted in JIS X 0208 order.
270Fullwidth and halfwidth forms are identical to their normal form.
271The difference between hiragana and katakana is at the 4th level,
272the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
273and then C<katakana_before_hiragana> has no effect.
274
584e761d
CBW
275[2] ko: Plenty of ideographs are sorted by their reading. Such
276an ideograph is primary (level 1) equal to, and secondary (level 2)
277greater than, the corresponding hangul syllable.
278
f1a7422f
CBW
279=head1 INSTALL
280
539ce3d8
CBW
281Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
282F<Collate/Locale/*.pm>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
283On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
284F<gendata/*>, and F<mklocale>.
285Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
00e00351 286
1393fe00
CBW
287=head1 CAVEAT
288
289=over 4
290
291=item tailoring is not maximum
292
Even if a certain letter is tailored, its equivalents are not always
tailored along with it. For example, even though W is tailored,
295fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
c02ee425
CBW
296tailored. The result may depend on whether source strings are
297normalized or not, and whether decomposed or composed.
211cc501 298Thus C<(normalization =E<gt> undef)> is less preferred.
1393fe00
CBW
299
300=back
301
00e00351
CBW
302=head1 AUTHOR
303
304The Unicode::Collate::Locale module for perl was written
305by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
211cc501 306This module is Copyright(C) 2004-2011, SADAHIRO Tomoyuki. Japan.
00e00351
CBW
307All rights reserved.
308
309This module is free software; you can redistribute it and/or
310modify it under the same terms as Perl itself.
311
312=head1 SEE ALSO
313
314=over 4
315
316=item Unicode Collation Algorithm - UTS #10
317
318L<http://www.unicode.org/reports/tr10/>
319
320=item The Default Unicode Collation Element Table (DUCET)
321
322L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
323
aa7758f7
CBW
324=item Unicode Locale Data Markup Language (LDML) - UTS #35
325
326L<http://www.unicode.org/reports/tr35/>
327
00e00351
CBW
328=item CLDR - Unicode Common Locale Data Repository
329
330L<http://cldr.unicode.org/>
331
332=item L<Unicode::Collate>
333
334=item L<Unicode::Normalize>
335
336=back
337
338=cut