This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge the implementation of B::{dowarn,sub_generation} using ALIAS.
[perl5.git] / cpan / Unicode-Collate / Collate / Locale.pm
CommitLineData
00e00351
CBW
1package Unicode::Collate::Locale;
2
3use strict;
4use Carp;
5use base qw(Unicode::Collate);
6
aa7758f7 7our $VERSION = '0.63';
00e00351
CBW
8
9use File::Spec;
10
# Directory of the per-locale tailoring files: the path this module was
# loaded from, with the trailing ".pm" removed.
(my $ModPath = $INC{'Unicode/Collate/Locale.pm'}) =~ s/\.pm$//;

# Table file name handed to Unicode::Collate, and the extension of the
# per-locale tailoring files.
my $KeyPath = File::Spec->catfile('allkeys.txt');
my $PL_EXT  = '.pl';

# Maps an accepted locale key to the base name of its tailoring file.
# An empty value (as for 'default') means there is no file to load.
my %LocaleFile = (
    # Locales whose tailoring file carries the same name as the locale.
    (map { ($_ => $_) } qw(
	af ar az ca cs cy da eo es et fi fil fo fr ha haw
	hr hu hy ig is kk kl lt lv mt nb nn nso om pl ro ru
	se sk sl sq sv sw tn to tr uk vi wo yo
    )),
    'default'         => '',
    'de__phonebook'   => 'de_phone',
    'es__traditional' => 'es_trad',
    # These locales reuse the "ru" tailoring file.
    'be' => "ru",
    'bg' => "ru",
    'mk' => "ru",
    'sr' => "ru",
);
00e00351
CBW
27
# Map a user-supplied locale name onto a key of %LocaleFile.
#
# The name is lower-cased; '-', ' ' and '.' are converted to '_'; and the
# short suffixes "_phone"/"_phonebk" and "_trad" are expanded to
# "_phonebook" and "_traditional".  If the normalized name has no true
# entry in %LocaleFile, less specific candidates built from its first
# three '_'-separated parts are tried in turn.
#
# Returns the accepted key, or 'default' if the name is false or no
# candidate matches.
sub _locale {
    my $locale = shift;
    return 'default' if !$locale;

    $locale = lc $locale;
    $locale =~ tr/\-\ \./_/;
    $locale =~ s/_phone(?:bk)?\z/_phonebook/;
    $locale =~ s/_trad\z/_traditional/;
    return $locale if $LocaleFile{$locale};

    # A limit of -1 makes split keep trailing empty fields, so all three
    # parts are always defined (possibly '') and the interpolations
    # below never see undef.
    my ($lang, $terr, $var) = split(/_/, $locale.'__', -1);
    for my $loc ("${lang}_${terr}_$var", "${lang}_$terr",
		 "${lang}__$var", "${lang}__$terr", $lang) {
	return $loc if $LocaleFile{$loc};
    }
    return 'default';
}
44
# Accessor: returns the value stored under the object's
# 'accepted_locale' key.
sub getlocale {
    my ($self) = @_;
    return $self->{accepted_locale};
}
48
# Load the tailoring data for an accepted locale key.
#
# Looks up the file base name in %LocaleFile, appends $PL_EXT and
# evaluates the file under $ModPath with "do FILE".
#
# Returns nothing when the locale has no tailoring file (empty entry),
# otherwise the true value the file evaluates to.
# Croaks if the file yields a false value; the message keeps its
# original wording and appends the underlying cause when one is known.
sub _fetchpl {
    my $accepted = shift;
    my $f = $LocaleFile{$accepted};
    return if !$f;
    $f .= $PL_EXT;
    my $path = File::Spec->catfile($ModPath, $f);
    my $h = do $path;
    if (!$h) {
	# Per perlfunc "do FILE": $@ is set when the file fails to
	# compile; an undef result otherwise means it could not be
	# read, with the reason in $!.
	my $why = '';
	if ($@) {
	    ($why = ": $@") =~ s/\s+\z//;
	}
	elsif (!defined $h) {
	    $why = ": $!";
	}
	croak "Unicode/Collate/Locale/$f can't be found$why";
    }
    return $h;
}
59
00e00351
CBW
# Constructor.  Takes the same key/value list as the parent class
# constructor, plus an optional 'locale' key.
#
# The locale is resolved with _locale() and recorded under
# 'accepted_locale'.  The 'table' key is always set to $KeyPath, so a
# caller-supplied 'table' is rejected.  Every key/value pair returned by
# _fetchpl() for the accepted locale is merged into the arguments; a key
# the caller has already supplied is rejected rather than overwritten.
#
# Returns whatever the parent class constructor returns.
sub new {
    my ($class, %args) = @_;

    my $accepted = _locale($args{locale});
    $args{accepted_locale} = $accepted;

    croak "your table can't be used with Unicode::Collate::Locale"
	if exists $args{table};
    $args{table} = $KeyPath;

    my $tailoring = _fetchpl($accepted);
    while (my ($tag, $value) = each %$tailoring) {
	exists $args{$tag}
	    and croak "$tag is reserved by $args{locale}, can't be overwritten";
	$args{$tag} = $value;
    }

    return $class->SUPER::new(%args);
}
79
801;
81__END__
82
83=head1 NAME
84
85Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
86
87=head1 SYNOPSIS
88
89 use Unicode::Collate::Locale;
90
91 $Collator = Unicode::Collate::Locale->
92 new(locale => $locale_name, %tailoring);
93
94 @sorted = $Collator->sort(@not_sorted);
95
96=head1 DESCRIPTION
97
98This module provides linguistic tailoring for DUCET
99(the Default Unicode Collation Element Table),
99taking advantage of C<Unicode::Collate>.
100
101=head2 Constructor
102
103The C<new> method returns a collator object.
104
105A parameter list for the constructor is a hash, which can include
106a special key C<'locale'> and its value (case-insensitive) standing
107for a two-letter language code (ISO-639) like C<'en'> for English.
108For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'FR')>
109returns a collator tailored for French.
110
111C<$locale_name> may be suffixed with a territory(country)
112code or a variant code, which are separated with C<'_'>.
113E.g. C<en_US> for English in USA, and
114C<es_ES_traditional> for Spanish in Spain (Traditional).
115
116If C<$locale_name> is not available,
117fallback is selected in the following order:
118
64dc7822
CBW
119 1. language_territory_variant
120 2. language_territory
121 3. language__variant
122 4. language
123 5. default
00e00351
CBW
124
125Tailoring tags provided by C<Unicode::Collate> are allowed
126as long as they are not used for C<'locale'> support.
127Esp. the C<table> tag is always untailorable
128since it is reserved for DUCET.
129
130E.g. a collator for French, which ignores diacritics and case difference
131(i.e. level 1), with reversed case ordering and no normalization.
132
133 Unicode::Collate::Locale->new(
64dc7822
CBW
134 level => 1,
135 locale => 'fr',
136 upper_before_lower => 1,
137 normalization => undef
00e00351
CBW
138 )
139
140=head2 Methods
141
142C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
143and methods other than C<new> are inherited from C<Unicode::Collate>.
144
145Here is a list of additional methods:
146
147=over 4
148
149=item C<$Collator-E<gt>getlocale>
150
151Returns the language code that was accepted and is actually used for collation.
152If linguistic tailoring is not provided for a language code you passed
153(intentionally for some languages, or due to the incomplete implementation),
154this method returns a string C<'default'> meaning no special tailoring.
155
156=back
157
158=head2 A list of tailorable locales
159
64dc7822
CBW
160 locale name description
161 ----------------------------------------------------------
6484f676 162 af Afrikaans
6709de88 163 ar Arabic
f1a7422f 164 az Azerbaijani (Azeri)
aa7758f7
CBW
165 be Belarusian
166 bg Bulgarian
64dc7822 167 ca Catalan
00e00351 168 cs Czech
6484f676
CBW
169 cy Welsh
170 da Danish
1393fe00 171 de__phonebook German (umlaut as 'ae', 'oe', 'ue')
456a1446 172 eo Esperanto
00e00351
CBW
173 es Spanish
174 es__traditional Spanish ('ch' and 'll' as a grapheme)
64dc7822
CBW
175 et Estonian
176 fi Finnish
f1a7422f 177 fil Filipino
6484f676 178 fo Faroese
00e00351 179 fr French
f1a7422f 180 ha Hausa
6484f676 181 haw Hawaiian
c02ee425 182 hr Croatian
6709de88
CBW
183 hu Hungarian
184 hy Armenian
c02ee425 185 ig Igbo
6484f676 186 is Icelandic
aa7758f7 187 kk Kazakh
6484f676 188 kl Kalaallisut
f1a7422f 189 lt Lithuanian
64dc7822 190 lv Latvian
aa7758f7 191 mk Macedonian
f1a7422f 192 mt Maltese
456a1446 193 nb Norwegian Bokmal
00e00351 194 nn Norwegian Nynorsk
1393fe00
CBW
195 nso Northern Sotho
196 om Oromo
00e00351 197 pl Polish
456a1446 198 ro Romanian
aa7758f7 199 ru Russian
6709de88 200 se Northern Sami
64dc7822
CBW
201 sk Slovak
202 sl Slovenian
c02ee425 203 sq Albanian
aa7758f7 204 sr Serbian
456a1446 205 sv Swedish
6484f676 206 sw Swahili
1393fe00 207 tn Tswana
6709de88 208 to Tonga
f1a7422f 209 tr Turkish
6709de88 210 uk Ukrainian
1393fe00 211 vi Vietnamese
f1a7422f
CBW
212 wo Wolof
213 yo Yoruba
aa7758f7
CBW
214 ----------------------------------------------------------
215
216Locales according to default UCA rules include:
217de (German),
218en (English),
219ga (Irish),
220id (Indonesian),
221it (Italian),
222ka (Georgian),
223ln (Lingala),
224ms (Malay),
225nl (Dutch),
226pt (Portuguese),
227st (Southern Sotho),
228xh (Xhosa),
229zu (Zulu).
f1a7422f
CBW
230
231=head1 INSTALL
232
233Installation of Unicode::Collate::Locale requires F<Collate/Locale.pm>,
234F<Collate/Locale/*.pl> and F<Collate/allkeys.txt>. On building,
235Unicode::Collate::Locale doesn't require F<data/*.txt> and F<mklocale>.
236Tests for Unicode::Collate::Locale are named F<t/loc_*.t>.
00e00351 237
1393fe00
CBW
238=head1 CAVEAT
239
240=over 4
241
242=item tailoring is not maximum
243
244If a certain letter is tailored, its equivalents are not always
245tailored as well as it. For example, even though W is tailored,
246fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
c02ee425
CBW
247tailored. The result may depend on whether source strings are
248normalized or not, and whether decomposed or composed.
249Thus C<(normalization =E<gt> undef)> is less preferred.
1393fe00
CBW
250
251=back
252
00e00351
CBW
253=head1 AUTHOR
254
255The Unicode::Collate::Locale module for perl was written
256by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
257This module is Copyright(C) 2004-2010, SADAHIRO Tomoyuki. Japan.
258All rights reserved.
259
260This module is free software; you can redistribute it and/or
261modify it under the same terms as Perl itself.
262
263=head1 SEE ALSO
264
265=over 4
266
267=item Unicode Collation Algorithm - UTS #10
268
269L<http://www.unicode.org/reports/tr10/>
270
271=item The Default Unicode Collation Element Table (DUCET)
272
273L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
274
aa7758f7
CBW
275=item Unicode Locale Data Markup Language (LDML) - UTS #35
276
277L<http://www.unicode.org/reports/tr35/>
278
00e00351
CBW
279=item CLDR - Unicode Common Locale Data Repository
280
281L<http://cldr.unicode.org/>
282
283=item L<Unicode::Collate>
284
285=item L<Unicode::Normalize>
286
287=back
288
289=cut