This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Document the refcount of version functions’ retval
[perl5.git] / cpan / Unicode-Collate / Collate / Locale.pm
CommitLineData
00e00351
CBW
1package Unicode::Collate::Locale;
2
3use strict;
4use Carp;
5use base qw(Unicode::Collate);
6
b5d9a953 7our $VERSION = '0.67';
00e00351
CBW
8
9use File::Spec;
10
# Directory holding the per-locale tailoring files: the path this module was
# loaded from (as recorded in %INC), minus its ".pm" suffix.
my $ModPath = $INC{'Unicode/Collate/Locale.pm'};
$ModPath =~ s/\.pm$//;

# Value that new() passes on as the 'table' argument.
my $KeyPath = File::Spec->catfile('allkeys.txt');

# Extension appended to a %LocaleFile value to get the tailoring file name.
my $PL_EXT = '.pl';

# Maps every accepted locale name to the base name of its tailoring file.
# 'default' maps to an empty (false) name, so no file is loaded for it.
my %LocaleFile = (
    # Locales whose file is named after the locale itself.
    (map { ($_ => $_) } qw(
        af ar az ca cs cy da eo es et fi fil fo fr ha haw
        hr hu hy ig is ja kk kl ko lt lv mt nb nn nso om pl ro ru
        se sk sl sq sv sw tn to tr uk vi wo yo zh
    )),

    'default'         => '',

    # Variants stored under a shortened file name.
    'de__phonebook'   => 'de_phone',
    'es__traditional' => 'es_trad',
    'zh__big5han'     => 'zh_big5',
    'zh__gb2312han'   => 'zh_gb',
    'zh__pinyin'      => 'zh_pin',
    'zh__stroke'      => 'zh_strk',

    # Locales that reuse the Russian tailoring file.
    'be'              => 'ru',
    'bg'              => 'ru',
    'mk'              => 'ru',
    'sr'              => 'ru',
);
00e00351
CBW
31
# Normalises a user-supplied locale name and resolves it to a key of
# %LocaleFile.
#
# The name is lower-cased; '-', ' ' and '.' are turned into '_'; and the short
# variant suffixes (_phone, _phonebk, _trad, _big5, _gb2312) are expanded.
# If the result is not a known locale as is, it is split on '_' into
# language, territory and variant, and progressively less specific
# combinations are tried.
#
# Returns the matching locale key, or 'default' when the argument is false
# or nothing matches.
sub _locale {
    my ($name) = @_;
    return 'default' unless $name;

    $name = lc $name;
    $name =~ tr/\-\ \./_/;
    $name =~ s/_phone(?:bk)?\z/_phonebook/;
    $name =~ s/_trad\z/_traditional/;
    $name =~ s/_big5\z/_big5han/;
    $name =~ s/_gb2312\z/_gb2312han/;
    return $name if $LocaleFile{$name};

    # Missing territory/variant parts interpolate as empty strings below.
    my ($lang, $terr, $var) = split(/_/, $name.'__');
    my @candidates = (
        "${lang}_${terr}_$var",
        "${lang}_$terr",
        "${lang}__$var",
        "${lang}__$terr",   # a lone second part may be a variant, e.g. zh_pinyin
        $lang,
    );
    for my $cand (@candidates) {
        return $cand if $LocaleFile{$cand};
    }
    return 'default';
}
50
# Returns the locale name that was accepted for this collator, i.e. the
# 'accepted_locale' entry stored in the object hash by new().
sub getlocale {
    my ($self) = @_;
    return $self->{accepted_locale};
}
54
# Loads the tailoring data for an accepted locale.
#
# $accepted is a key of %LocaleFile.  Returns nothing when the locale has no
# tailoring file (e.g. 'default'); otherwise returns the value produced by
# evaluating the locale's file under $ModPath (new() uses it as a hash
# reference of constructor arguments).
#
# Croaks when the file cannot be read, fails to compile, or yields a false
# value; the message says which of these happened.
sub _fetchpl {
    my $accepted = shift;
    my $f = $LocaleFile{$accepted};
    return if !$f;
    $f .= $PL_EXT;
    my $path = File::Spec->catfile($ModPath, $f);

    my ($h, $reason);
    {
        # Keep the caller's $@ intact, and make sure a stale value is not
        # mistaken for a compile error in this file.
        local $@;
        $h = do $path;
        if (!$h) {
            # "do FILE" signals a compile error via $@ and an unreadable
            # file via an undefined result with $! set.
            $reason = $@         ? "can't be compiled: $@"
                    : defined $h ? "didn't return a true value"
                    :              "can't be found: $!";
        }
    }
    croak "Unicode/Collate/Locale/$f $reason" if !$h;
    return $h;
}
65
00e00351
CBW
# Constructor.  Takes a hash of Unicode::Collate arguments plus 'locale'.
#
# The requested locale is resolved with _locale() and stored under
# 'accepted_locale', 'table' is set to $KeyPath, and the entries returned by
# _fetchpl() are merged into the arguments before they are handed to the
# parent class constructor.
#
# Croaks when the caller supplies 'table', or supplies any key that the
# locale's tailoring data also defines.
sub new {
    my ($class, %args) = @_;

    croak "your table can't be used with Unicode::Collate::Locale"
        if exists $args{table};

    my $accepted = _locale($args{locale});
    $args{accepted_locale} = $accepted;
    $args{table} = $KeyPath;

    # Kept as while/each, as in the original, so that an undefined
    # $tailoring is handled the same way.
    my $tailoring = _fetchpl($accepted);
    while (my ($key, $value) = each %$tailoring) {
        croak "$key is reserved by $args{locale}, can't be overwritten"
            if exists $args{$key};
        $args{$key} = $value;
    }

    return $class->SUPER::new(%args);
}
85
861;
87__END__
88
89=head1 NAME
90
91Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
92
93=head1 SYNOPSIS
94
95 use Unicode::Collate::Locale;
96
539ce3d8 97 #construct
00e00351
CBW
98 $Collator = Unicode::Collate::Locale->
99 new(locale => $locale_name, %tailoring);
100
539ce3d8 101 #sort
00e00351
CBW
102 @sorted = $Collator->sort(@not_sorted);
103
539ce3d8
CBW
104 #compare
105 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
106
107B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
108according to Perl's Unicode support. See L<perlunicode>,
109L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
For strings that are not yet decoded, either use C<preprocess>
(cf. C<Unicode::Collate>) or decode them beforehand.
112
00e00351
CBW
113=head1 DESCRIPTION
114
This module provides linguistic (locale-specific) tailoring of collation,
building on C<Unicode::Collate>.
117
118=head2 Constructor
119
120The C<new> method returns a collator object.
121
A parameter list for the constructor is a hash, which can include
a special key C<'locale'> and its value (case-insensitive) standing
for a language code (ISO-639, two or three letters) like C<'en'> for English.
For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
returns a collator tailored for French.
127
C<$locale_name> may be suffixed with a territory (country)
code or a variant code, which are separated with C<'_'>.
E.g. C<en_US> for English in USA,
C<es_ES_traditional> for Spanish in Spain (Traditional).
132
If C<$locale_name> is not available,
fallback is selected in the following order:
135
64dc7822
CBW
136 1. language_territory_variant
137 2. language_territory
138 3. language__variant
139 4. language
140 5. default
00e00351
CBW
141
142Tailoring tags provided by C<Unicode::Collate> are allowed
143as long as they are not used for C<'locale'> support.
144Esp. the C<table> tag is always untailorable
145since it is reserved for DUCET.
146
147E.g. a collator for French, which ignores diacritics and case difference
148(i.e. level 1), with reversed case ordering and no normalization.
149
150 Unicode::Collate::Locale->new(
64dc7822
CBW
151 level => 1,
152 locale => 'fr',
153 upper_before_lower => 1,
154 normalization => undef
00e00351
CBW
155 )
156
157=head2 Methods
158
159C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
160and methods other than C<new> are inherited from C<Unicode::Collate>.
161
162Here is a list of additional methods:
163
164=over 4
165
166=item C<$Collator-E<gt>getlocale>
167
Returns the language code that was accepted and is actually used for collation.
If linguistic tailoring is not provided for the language code you passed
(intentionally for some languages, or due to the incomplete implementation),
this method returns the string C<'default'>, meaning no special tailoring.
172
173=back
174
175=head2 A list of tailorable locales
176
64dc7822
CBW
177 locale name description
178 ----------------------------------------------------------
6484f676 179 af Afrikaans
6709de88 180 ar Arabic
f1a7422f 181 az Azerbaijani (Azeri)
aa7758f7
CBW
182 be Belarusian
183 bg Bulgarian
64dc7822 184 ca Catalan
00e00351 185 cs Czech
6484f676
CBW
186 cy Welsh
187 da Danish
1393fe00 188 de__phonebook German (umlaut as 'ae', 'oe', 'ue')
456a1446 189 eo Esperanto
00e00351
CBW
190 es Spanish
191 es__traditional Spanish ('ch' and 'll' as a grapheme)
64dc7822
CBW
192 et Estonian
193 fi Finnish
f1a7422f 194 fil Filipino
6484f676 195 fo Faroese
00e00351 196 fr French
f1a7422f 197 ha Hausa
6484f676 198 haw Hawaiian
c02ee425 199 hr Croatian
6709de88
CBW
200 hu Hungarian
201 hy Armenian
c02ee425 202 ig Igbo
6484f676 203 is Icelandic
539ce3d8 204 ja Japanese [1]
aa7758f7 205 kk Kazakh
6484f676 206 kl Kalaallisut
584e761d 207 ko Korean [2]
f1a7422f 208 lt Lithuanian
64dc7822 209 lv Latvian
aa7758f7 210 mk Macedonian
f1a7422f 211 mt Maltese
456a1446 212 nb Norwegian Bokmal
00e00351 213 nn Norwegian Nynorsk
1393fe00
CBW
214 nso Northern Sotho
215 om Oromo
00e00351 216 pl Polish
456a1446 217 ro Romanian
aa7758f7 218 ru Russian
6709de88 219 se Northern Sami
64dc7822
CBW
220 sk Slovak
221 sl Slovenian
c02ee425 222 sq Albanian
aa7758f7 223 sr Serbian
456a1446 224 sv Swedish
6484f676 225 sw Swahili
1393fe00 226 tn Tswana
6709de88 227 to Tonga
f1a7422f 228 tr Turkish
6709de88 229 uk Ukrainian
1393fe00 230 vi Vietnamese
f1a7422f
CBW
231 wo Wolof
232 yo Yoruba
028d3bfa
CBW
233 zh Chinese
234 zh__big5han Chinese (ideographs: big5 order)
235 zh__gb2312han Chinese (ideographs: GB-2312 order)
236 zh__pinyin Chinese (ideographs: pinyin order)
237 zh__stroke Chinese (ideographs: stroke order)
aa7758f7
CBW
238 ----------------------------------------------------------
239
539ce3d8 240Locales according to the default UCA rules include
aa7758f7
CBW
241de (German),
242en (English),
243ga (Irish),
244id (Indonesian),
245it (Italian),
246ka (Georgian),
247ln (Lingala),
248ms (Malay),
249nl (Dutch),
250pt (Portuguese),
251st (Southern Sotho),
252xh (Xhosa),
253zu (Zulu).
f1a7422f 254
539ce3d8
CBW
255B<Note>
256
257[1] ja: Ideographs are sorted in JIS X 0208 order.
258Fullwidth and halfwidth forms are identical to their normal form.
259The difference between hiragana and katakana is at the 4th level,
260the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
261and then C<katakana_before_hiragana> has no effect.
262
584e761d
CBW
263[2] ko: Plenty of ideographs are sorted by their reading. Such
264an ideograph is primary (level 1) equal to, and secondary (level 2)
265greater than, the corresponding hangul syllable.
266
f1a7422f
CBW
267=head1 INSTALL
268
539ce3d8
CBW
Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
271On building, C<Unicode::Collate::Locale> doesn't require any of F<data/*.txt>,
272F<gendata/*>, and F<mklocale>.
273Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
00e00351 274
1393fe00
CBW
275=head1 CAVEAT
276
277=over 4
278
279=item tailoring is not maximum
280
Even if a certain letter is tailored, its equivalents are not always
tailored along with it. For example, even though W is tailored,
fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
tailored. The result may depend on whether source strings are
normalized or not, and whether decomposed or composed.
Thus C<(normalization =E<gt> undef)> is less preferred.
1393fe00
CBW
287
288=back
289
00e00351
CBW
290=head1 AUTHOR
291
292The Unicode::Collate::Locale module for perl was written
293by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
294This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan.
295All rights reserved.
296
297This module is free software; you can redistribute it and/or
298modify it under the same terms as Perl itself.
299
300=head1 SEE ALSO
301
302=over 4
303
304=item Unicode Collation Algorithm - UTS #10
305
306L<http://www.unicode.org/reports/tr10/>
307
308=item The Default Unicode Collation Element Table (DUCET)
309
310L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
311
aa7758f7
CBW
312=item Unicode Locale Data Markup Language (LDML) - UTS #35
313
314L<http://www.unicode.org/reports/tr35/>
315
00e00351
CBW
316=item CLDR - Unicode Common Locale Data Repository
317
318L<http://cldr.unicode.org/>
319
320=item L<Unicode::Collate>
321
322=item L<Unicode::Normalize>
323
324=back
325
326=cut