Commit | Line | Data |
---|---|---|
4b053158 | 1 | |
77b20956 | 2 | # Time-stamp: "2004-10-06 23:26:33 ADT" |
4b053158 JH |
3 | # Sean M. Burke <sburke@cpan.org> |
4 | ||
5 | require 5.000; | |
6 | package I18N::LangTags; | |
7 | use strict; | |
21aeefd5 | 8 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); |
4b053158 | 9 | require Exporter; |
4b053158 JH |
10 | @ISA = qw(Exporter); |
11 | @EXPORT = qw(); | |
12 | @EXPORT_OK = qw(is_language_tag same_language_tag | |
13 | extract_language_tags super_languages | |
14 | similarity_language_tag is_dialect_of | |
15 | locale2language_tag alternate_language_tags | |
21aeefd5 | 16 | encode_language_tag panic_languages |
8000a3fa RGS |
17 | implicate_supers |
18 | implicate_supers_strictly | |
4b053158 | 19 | ); |
21aeefd5 | 20 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); |
4b053158 | 21 | |
77b20956 | 22 | $VERSION = "0.35"; |
8000a3fa RGS |
23 | |
24 | sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function | |
25 | ||
4b053158 JH |
26 | |
27 | =head1 NAME | |
28 | ||
29 | I18N::LangTags - functions for dealing with RFC3066-style language tags | |
30 | ||
31 | =head1 SYNOPSIS | |
32 | ||
8000a3fa RGS |
33 | use I18N::LangTags(); |
34 | ||
35 | ...or specify whichever of those functions you want to import, like so: | |
4b053158 | 36 | |
8000a3fa RGS |
37 | use I18N::LangTags qw(implicate_supers similarity_language_tag); |
38 | ||
39 | All the exportable functions are listed below -- you're free to import | |
40 | only some, or none at all. By default, none are imported. If you | |
41 | say: | |
21aeefd5 JH |
42 | |
43 | use I18N::LangTags qw(:ALL) | |
44 | ||
45 | ...then all are exported. (This saves you from having to use | |
46 | something less obvious like C<use I18N::LangTags qw(/./)>.) | |
4b053158 JH |
47 | |
48 | If you don't import any of these functions, assume a C<&I18N::LangTags::> | |
49 | in front of all the function names in the following examples. | |
50 | ||
51 | =head1 DESCRIPTION | |
52 | ||
53 | Language tags are a formalism, described in RFC 3066 (obsoleting | |
54 | 1766), for declaring what language form (language and possibly | |
55 | dialect) a given chunk of information is in. | |
56 | ||
57 | This library provides functions for common tasks involving language | |
58 | tags as they are needed in a variety of protocols and applications. | |
59 | ||
60 | Please see the "See Also" references for a thorough explanation | |
61 | of how to correctly use language tags. | |
62 | ||
63 | =over | |
64 | ||
65 | =cut | |
66 | ||
67 | ########################################################################### | |
68 | ||
69 | =item * the function is_language_tag($lang1) | |
70 | ||
71 | Returns true iff $lang1 is a formally valid language tag. | |
72 | ||
73 | is_language_tag("fr") is TRUE | |
74 | is_language_tag("x-jicarilla") is FALSE | |
75 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) | |
76 | ||
77 | is_language_tag("sgn-US") is TRUE | |
78 | (That's American Sign Language) | |
79 | ||
80 | is_language_tag("i-Klikitat") is TRUE | |
81 | (True without regard to the fact noone has actually | |
82 | registered Klikitat -- it's a formally valid tag) | |
83 | ||
84 | is_language_tag("fr-patois") is TRUE | |
85 | (Formally valid -- altho descriptively weak!) | |
86 | ||
87 | is_language_tag("Spanish") is FALSE | |
88 | is_language_tag("french-patois") is FALSE | |
89 | (No good -- first subtag has to match | |
90 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) | |
91 | ||
92 | is_language_tag("x-borg-prot2532") is TRUE | |
93 | (Yes, subtags can contain digits, as of RFC3066) | |
94 | ||
95 | =cut | |
96 | ||
97 | sub is_language_tag { | |
98 | ||
99 | ## Changes in the language tagging standards may have to be reflected here. | |
100 | ||
101 | my($tag) = lc($_[0]); | |
102 | ||
103 | return 0 if $tag eq "i" or $tag eq "x"; | |
21aeefd5 | 104 | # Bad degenerate cases that the following |
4b053158 JH |
105 | # regexp would erroneously let pass |
106 | ||
107 | return $tag =~ | |
108 | /^(?: # First subtag | |
109 | [xi] | [a-z]{2,3} | |
110 | ) | |
111 | (?: # Subtags thereafter | |
112 | - # separator | |
113 | [a-z0-9]{1,8} # subtag | |
114 | )* | |
115 | $/xs ? 1 : 0; | |
116 | } | |
117 | ||
118 | ########################################################################### | |
119 | ||
120 | =item * the function extract_language_tags($whatever) | |
121 | ||
122 | Returns a list of whatever looks like formally valid language tags | |
123 | in $whatever. Not very smart, so don't get too creative with | |
124 | what you want to feed it. | |
125 | ||
126 | extract_language_tags("fr, fr-ca, i-mingo") | |
127 | returns: ('fr', 'fr-ca', 'i-mingo') | |
128 | ||
129 | extract_language_tags("It's like this: I'm in fr -- French!") | |
130 | returns: ('It', 'in', 'fr') | |
131 | (So don't just feed it any old thing.) | |
132 | ||
133 | The output is untainted. If you don't know what tainting is, | |
134 | don't worry about it. | |
135 | ||
136 | =cut | |
137 | ||
138 | sub extract_language_tags { | |
139 | ||
140 | ## Changes in the language tagging standards may have to be reflected here. | |
141 | ||
142 | my($text) = | |
143 | $_[0] =~ m/(.+)/ # to make for an untainted result | |
144 | ? $1 : '' | |
145 | ; | |
146 | ||
147 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags | |
148 | $text =~ | |
149 | m/ | |
150 | \b | |
151 | (?: # First subtag | |
152 | [iIxX] | [a-zA-Z]{2,3} | |
153 | ) | |
154 | (?: # Subtags thereafter | |
155 | - # separator | |
156 | [a-zA-Z0-9]{1,8} # subtag | |
157 | )* | |
158 | \b | |
159 | /xsg | |
160 | ); | |
161 | } | |
162 | ||
163 | ########################################################################### | |
164 | ||
165 | =item * the function same_language_tag($lang1, $lang2) | |
166 | ||
167 | Returns true iff $lang1 and $lang2 are acceptable variant tags | |
168 | representing the same language-form. | |
169 | ||
170 | same_language_tag('x-kadara', 'i-kadara') is TRUE | |
171 | (The x/i- alternation doesn't matter) | |
172 | same_language_tag('X-KADARA', 'i-kadara') is TRUE | |
173 | (...and neither does case) | |
174 | same_language_tag('en', 'en-US') is FALSE | |
175 | (all-English is not the SAME as US English) | |
176 | same_language_tag('x-kadara', 'x-kadar') is FALSE | |
177 | (these are totally unrelated tags) | |
21aeefd5 JH |
178 | same_language_tag('no-bok', 'nb') is TRUE |
179 | (no-bok is a legacy tag for nb (Norwegian Bokmal)) | |
4b053158 JH |
180 | |
181 | C<same_language_tag> works by just seeing whether | |
182 | C<encode_language_tag($lang1)> is the same as | |
183 | C<encode_language_tag($lang2)>. | |
184 | ||
185 | (Yes, I know this function is named a bit oddly. Call it historic | |
186 | reasons.) | |
187 | ||
188 | =cut | |
189 | ||
190 | sub same_language_tag { | |
191 | my $el1 = &encode_language_tag($_[0]); | |
192 | return 0 unless defined $el1; | |
193 | # this avoids the problem of | |
194 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) | |
195 | # being true if $lang1 and $lang2 are both undef | |
196 | ||
197 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; | |
198 | } | |
199 | ||
200 | ########################################################################### | |
201 | ||
202 | =item * the function similarity_language_tag($lang1, $lang2) | |
203 | ||
204 | Returns an integer representing the degree of similarity between | |
205 | tags $lang1 and $lang2 (the order of which does not matter), where | |
206 | similarity is the number of common elements on the left, | |
207 | without regard to case and to x/i- alternation. | |
208 | ||
209 | similarity_language_tag('fr', 'fr-ca') is 1 | |
210 | (one element in common) | |
211 | similarity_language_tag('fr-ca', 'fr-FR') is 1 | |
212 | (one element in common) | |
213 | ||
214 | similarity_language_tag('fr-CA-joual', | |
215 | 'fr-CA-PEI') is 2 | |
216 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 | |
217 | (two elements in common) | |
218 | ||
219 | similarity_language_tag('x-kadara', 'i-kadara') is 1 | |
220 | (x/i- doesn't matter) | |
221 | ||
222 | similarity_language_tag('en', 'x-kadar') is 0 | |
223 | similarity_language_tag('x-kadara', 'x-kadar') is 0 | |
224 | (unrelated tags -- no similarity) | |
225 | ||
226 | similarity_language_tag('i-cree-syllabic', | |
227 | 'i-cherokee-syllabic') is 0 | |
228 | (no B<leftmost> elements in common!) | |
229 | ||
230 | =cut | |
231 | ||
232 | sub similarity_language_tag { | |
233 | my $lang1 = &encode_language_tag($_[0]); | |
234 | my $lang2 = &encode_language_tag($_[1]); | |
21aeefd5 JH |
235 | # And encode_language_tag takes care of the whole |
236 | # no-nyn==nn, i-hakka==zh-hakka, etc, things | |
237 | ||
4b053158 JH |
238 | # NB: (i-sil-...)? (i-sgn-...)? |
239 | ||
240 | return undef if !defined($lang1) and !defined($lang2); | |
241 | return 0 if !defined($lang1) or !defined($lang2); | |
242 | ||
243 | my @l1_subtags = split('-', $lang1); | |
244 | my @l2_subtags = split('-', $lang2); | |
245 | my $similarity = 0; | |
246 | ||
247 | while(@l1_subtags and @l2_subtags) { | |
248 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { | |
249 | ++$similarity; | |
250 | } else { | |
251 | last; | |
252 | } | |
253 | } | |
254 | return $similarity; | |
255 | } | |
256 | ||
257 | ########################################################################### | |
258 | ||
259 | =item * the function is_dialect_of($lang1, $lang2) | |
260 | ||
21aeefd5 | 261 | Returns true iff language tag $lang1 represents a subform of |
4b053158 JH |
262 | language tag $lang2. |
263 | ||
264 | B<Get the order right! It doesn't work the other way around!> | |
265 | ||
266 | is_dialect_of('en-US', 'en') is TRUE | |
267 | (American English IS a dialect of all-English) | |
268 | ||
269 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE | |
270 | is_dialect_of('fr-CA-joual', 'fr') is TRUE | |
271 | (Joual is a dialect of (a dialect of) French) | |
272 | ||
273 | is_dialect_of('en', 'en-US') is FALSE | |
274 | (all-English is a NOT dialect of American English) | |
275 | ||
276 | is_dialect_of('fr', 'en-CA') is FALSE | |
277 | ||
21aeefd5 JH |
278 | is_dialect_of('en', 'en' ) is TRUE |
279 | is_dialect_of('en-US', 'en-US') is TRUE | |
4b053158 JH |
280 | (B<Note:> these are degenerate cases) |
281 | ||
282 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE | |
283 | (the x/i thing doesn't matter, nor does case) | |
284 | ||
21aeefd5 JH |
285 | is_dialect_of('nn', 'no') is TRUE |
286 | (because 'nn' (New Norse) is aliased to 'no-nyn', | |
287 | as a special legacy case, and 'no-nyn' is a | |
288 | subform of 'no' (Norwegian)) | |
289 | ||
4b053158 JH |
290 | =cut |
291 | ||
292 | sub is_dialect_of { | |
293 | ||
294 | my $lang1 = &encode_language_tag($_[0]); | |
295 | my $lang2 = &encode_language_tag($_[1]); | |
296 | ||
297 | return undef if !defined($lang1) and !defined($lang2); | |
298 | return 0 if !defined($lang1) or !defined($lang2); | |
299 | ||
300 | return 1 if $lang1 eq $lang2; | |
301 | return 0 if length($lang1) < length($lang2); | |
302 | ||
303 | $lang1 .= '-'; | |
304 | $lang2 .= '-'; | |
305 | return | |
306 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; | |
307 | } | |
308 | ||
309 | ########################################################################### | |
310 | ||
311 | =item * the function super_languages($lang1) | |
312 | ||
313 | Returns a list of language tags that are superordinate tags to $lang1 | |
314 | -- it gets this by removing subtags from the end of $lang1 until | |
315 | nothing (or just "i" or "x") is left. | |
316 | ||
317 | super_languages("fr-CA-joual") is ("fr-CA", "fr") | |
318 | ||
319 | super_languages("en-AU") is ("en") | |
320 | ||
321 | super_languages("en") is empty-list, () | |
322 | ||
323 | super_languages("i-cherokee") is empty-list, () | |
324 | ...not ("i"), which would be illegal as well as pointless. | |
325 | ||
326 | If $lang1 is not a valid language tag, returns empty-list in | |
327 | a list context, undef in a scalar context. | |
328 | ||
329 | A notable and rather unavoidable problem with this method: | |
330 | "x-mingo-tom" has an "x" because the whole tag isn't an | |
331 | IANA-registered tag -- but super_languages('x-mingo-tom') is | |
332 | ('x-mingo') -- which isn't really right, since 'i-mingo' is | |
333 | registered. But this module has no way of knowing that. (But note | |
334 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) | |
335 | ||
336 | More importantly, you assume I<at your peril> that superordinates of | |
337 | $lang1 are mutually intelligible with $lang1. Consider this | |
338 | carefully. | |
339 | ||
8000a3fa | 340 | =cut |
4b053158 JH |
341 | |
342 | sub super_languages { | |
343 | my $lang1 = $_[0]; | |
344 | return() unless defined($lang1) && &is_language_tag($lang1); | |
21aeefd5 JH |
345 | |
346 | # a hack for those annoying new (2001) tags: | |
347 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards | |
348 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards | |
349 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way | |
350 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark | |
351 | ||
4b053158 JH |
352 | my @l1_subtags = split('-', $lang1); |
353 | ||
354 | ## Changes in the language tagging standards may have to be reflected here. | |
355 | ||
356 | # NB: (i-sil-...)? | |
357 | ||
358 | my @supers = (); | |
359 | foreach my $bit (@l1_subtags) { | |
360 | push @supers, | |
361 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; | |
362 | } | |
363 | pop @supers if @supers; | |
364 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; | |
365 | return reverse @supers; | |
366 | } | |
367 | ||
368 | ########################################################################### | |
369 | ||
370 | =item * the function locale2language_tag($locale_identifier) | |
371 | ||
372 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") | |
373 | and maps it to a language tag. If it's not mappable (as with, | |
374 | notably, "C" and "POSIX"), this returns empty-list in a list context, | |
375 | or undef in a scalar context. | |
376 | ||
377 | locale2language_tag("en") is "en" | |
378 | ||
379 | locale2language_tag("en_US") is "en-US" | |
380 | ||
381 | locale2language_tag("en_US.ISO8859-1") is "en-US" | |
382 | ||
383 | locale2language_tag("C") is undef or () | |
384 | ||
385 | locale2language_tag("POSIX") is undef or () | |
386 | ||
387 | locale2language_tag("POSIX") is undef or () | |
388 | ||
389 | I'm not totally sure that locale names map satisfactorily to language | |
390 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. | |
391 | ||
392 | The output is untainted. If you don't know what tainting is, | |
393 | don't worry about it. | |
394 | ||
8000a3fa | 395 | =cut |
4b053158 JH |
396 | |
397 | sub locale2language_tag { | |
398 | my $lang = | |
399 | $_[0] =~ m/(.+)/ # to make for an untainted result | |
400 | ? $1 : '' | |
401 | ; | |
402 | ||
403 | return $lang if &is_language_tag($lang); # like "en" | |
404 | ||
405 | $lang =~ tr<_><->; # "en_US" -> en-US | |
77b20956 NC |
406 | $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US |
407 | # it_IT.utf8@euro => it-IT | |
4b053158 JH |
408 | |
409 | return $lang if &is_language_tag($lang); | |
410 | ||
411 | return; | |
412 | } | |
413 | ||
414 | ########################################################################### | |
415 | ||
416 | =item * the function encode_language_tag($lang1) | |
417 | ||
418 | This function, if given a language tag, returns an encoding of it such | |
419 | that: | |
420 | ||
421 | * tags representing different languages never get the same encoding. | |
422 | ||
423 | * tags representing the same language always get the same encoding. | |
424 | ||
425 | * an encoding of a formally valid language tag always is a string | |
426 | value that is defined, has length, and is true if considered as a | |
427 | boolean. | |
428 | ||
429 | Note that the encoding itself is B<not> a formally valid language tag. | |
430 | Note also that you cannot, currently, go from an encoding back to a | |
431 | language tag that it's an encoding of. | |
432 | ||
433 | Note also that you B<must> consider the encoded value as atomic; i.e., | |
434 | you should not consider it as anything but an opaque, unanalysable | |
435 | string value. (The internals of the encoding method may change in | |
436 | future versions, as the language tagging standard changes over time.) | |
437 | ||
438 | C<encode_language_tag> returns undef if given anything other than a | |
439 | formally valid language tag. | |
440 | ||
441 | The reason C<encode_language_tag> exists is because different language | |
442 | tags may represent the same language; this is normally treatable with | |
443 | C<same_language_tag>, but consider this situation: | |
444 | ||
445 | You have a data file that expresses greetings in different languages. | |
446 | Its format is "[language tag]=[how to say 'Hello']", like: | |
447 | ||
448 | en-US=Hiho | |
449 | fr=Bonjour | |
450 | i-mingo=Hau' | |
451 | ||
452 | And suppose you write a program that reads that file and then runs as | |
453 | a daemon, answering client requests that specify a language tag and | |
454 | then expect the string that says how to greet in that language. So an | |
455 | interaction looks like: | |
456 | ||
457 | greeting-client asks: fr | |
458 | greeting-server answers: Bonjour | |
459 | ||
460 | So far so good. But suppose the way you're implementing this is: | |
461 | ||
462 | my %greetings; | |
463 | die unless open(IN, "<in.dat"); | |
464 | while(<IN>) { | |
465 | chomp; | |
466 | next unless /^([^=]+)=(.+)/s; | |
467 | my($lang, $expr) = ($1, $2); | |
468 | $greetings{$lang} = $expr; | |
469 | } | |
470 | close(IN); | |
471 | ||
472 | at which point %greetings has the contents: | |
473 | ||
474 | "en-US" => "Hiho" | |
475 | "fr" => "Bonjour" | |
476 | "i-mingo" => "Hau'" | |
477 | ||
478 | And suppose then that you answer client requests for language $wanted | |
479 | by just looking up $greetings{$wanted}. | |
480 | ||
481 | If the client asks for "fr", that will look up successfully in | |
482 | %greetings, to the value "Bonjour". And if the client asks for | |
483 | "i-mingo", that will look up successfully in %greetings, to the value | |
484 | "Hau'". | |
485 | ||
486 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the | |
487 | lookup in %greetings fails. That's the Wrong Thing. | |
488 | ||
489 | You could instead do lookups on $wanted with: | |
490 | ||
491 | use I18N::LangTags qw(same_language_tag); | |
3c4b39be | 492 | my $response = ''; |
4b053158 JH |
493 | foreach my $l2 (keys %greetings) { |
494 | if(same_language_tag($wanted, $l2)) { | |
495 | $response = $greetings{$l2}; | |
496 | last; | |
497 | } | |
498 | } | |
499 | ||
500 | But that's rather inefficient. A better way to do it is to start your | |
501 | program with: | |
502 | ||
503 | use I18N::LangTags qw(encode_language_tag); | |
504 | my %greetings; | |
505 | die unless open(IN, "<in.dat"); | |
506 | while(<IN>) { | |
507 | chomp; | |
508 | next unless /^([^=]+)=(.+)/s; | |
509 | my($lang, $expr) = ($1, $2); | |
510 | $greetings{ | |
511 | encode_language_tag($lang) | |
512 | } = $expr; | |
513 | } | |
514 | close(IN); | |
515 | ||
516 | and then just answer client requests for language $wanted by just | |
517 | looking up | |
518 | ||
519 | $greetings{encode_language_tag($wanted)} | |
520 | ||
521 | And that does the Right Thing. | |
522 | ||
523 | =cut | |
524 | ||
525 | sub encode_language_tag { | |
526 | # Only similarity_language_tag() is allowed to analyse encodings! | |
527 | ||
528 | ## Changes in the language tagging standards may have to be reflected here. | |
529 | ||
21aeefd5 | 530 | my($tag) = $_[0] || return undef; |
4b053158 | 531 | return undef unless &is_language_tag($tag); |
21aeefd5 JH |
532 | |
533 | # For the moment, these legacy variances are few enough that | |
534 | # we can just handle them here with regexps. | |
535 | $tag =~ s/^iw\b/he/i; # Hebrew | |
536 | $tag =~ s/^in\b/id/i; # Indonesian | |
aaf52a42 JH |
537 | $tag =~ s/^cre\b/cr/i; # Cree |
538 | $tag =~ s/^jw\b/jv/i; # Javanese | |
21aeefd5 JH |
539 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger |
540 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo | |
541 | $tag =~ s/^ji\b/yi/i; # Yiddish | |
aaf52a42 JH |
542 | # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, |
543 | # but maybe they're all so obscure I can ignore them. "Obscure" | |
544 | # meaning either that the language is obscure, and/or that the | |
545 | # XXX form was extant so briefly that it's unlikely it was ever | |
546 | # used. I hope. | |
21aeefd5 JH |
547 | # |
548 | # These go FROM the simplex to complex form, to get | |
549 | # similarity-comparison right. And that's okay, since | |
550 | # similarity_language_tag is the only thing that | |
551 | # analyzes our output. | |
552 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka | |
553 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal | |
554 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk | |
4b053158 JH |
555 | |
556 | $tag =~ s/^[xiXI]-//s; | |
557 | # Just lop off any leading "x/i-" | |
4b053158 | 558 | |
21aeefd5 | 559 | return "~" . uc($tag); |
4b053158 JH |
560 | } |
561 | ||
562 | #-------------------------------------------------------------------------- | |
563 | ||
564 | =item * the function alternate_language_tags($lang1) | |
565 | ||
566 | This function, if given a language tag, returns all language tags that | |
21aeefd5 JH |
567 | are alternate forms of this language tag. (I.e., tags which refer to |
568 | the same language.) This is meant to handle legacy tags caused by | |
569 | the minor changes in language tag standards over the years; and | |
570 | the x-/i- alternation is also dealt with. | |
571 | ||
572 | Note that this function does I<not> try to equate new (and never-used, | |
573 | and unusable) | |
574 | ISO639-2 three-letter tags to old (and still in use) ISO639-1 | |
575 | two-letter equivalents -- like "ara" -> "ar" -- because | |
576 | "ara" has I<never> been in use as an Internet language tag, | |
577 | and RFC 3066 stipulates that it never should be, since a shorter | |
578 | tag ("ar") exists. | |
579 | ||
580 | Examples: | |
581 | ||
582 | alternate_language_tags('no-bok') is ('nb') | |
583 | alternate_language_tags('nb') is ('no-bok') | |
584 | alternate_language_tags('he') is ('iw') | |
585 | alternate_language_tags('iw') is ('he') | |
586 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') | |
587 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') | |
588 | alternate_language_tags('en') is () | |
589 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') | |
590 | alternate_language_tags('x-klikitat') is ('i-klikitat') | |
591 | alternate_language_tags('i-klikitat') is ('x-klikitat') | |
592 | ||
593 | This function returns empty-list if given anything other than a formally | |
4b053158 JH |
594 | valid language tag. |
595 | ||
596 | =cut | |
597 | ||
598 | my %alt = qw( i x x i I X X I ); | |
599 | sub alternate_language_tags { | |
4b053158 JH |
600 | my $tag = $_[0]; |
601 | return() unless &is_language_tag($tag); | |
602 | ||
21aeefd5 JH |
603 | my @em; # push 'em real goood! |
604 | ||
605 | # For the moment, these legacy variances are few enough that | |
606 | # we can just handle them here with regexps. | |
607 | ||
608 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; | |
609 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; | |
610 | ||
611 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; | |
612 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; | |
613 | ||
614 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; | |
615 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; | |
616 | ||
617 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; | |
618 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; | |
619 | ||
620 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; | |
621 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; | |
622 | ||
623 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; | |
624 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; | |
625 | ||
626 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; | |
627 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; | |
628 | ||
629 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; | |
630 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; | |
631 | } | |
632 | ||
633 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; | |
634 | return @em; | |
635 | } | |
636 | ||
637 | ########################################################################### | |
638 | ||
639 | { | |
640 | # Init %Panic... | |
641 | ||
642 | my @panic = ( # MUST all be lowercase! | |
643 | # Only large ("national") languages make it in this list. | |
644 | # If you, as a user, are so bizarre that the /only/ language | |
645 | # you claim to accept is Galician, then no, we won't do you | |
646 | # the favor of providing Catalan as a panic-fallback for | |
647 | # you. Because if I start trying to add "little languages" in | |
648 | # here, I'll just go crazy. | |
649 | ||
4cf5bee0 JH |
650 | # Scandinavian lgs. All based on opinion and hearsay. |
651 | 'sv' => [qw(nb no da nn)], | |
652 | 'da' => [qw(nb no sv nn)], # I guess | |
653 | [qw(no nn nb)], [qw(no nn nb sv da)], | |
654 | 'is' => [qw(da sv no nb nn)], | |
655 | 'fo' => [qw(da is no nb nn sv)], # I guess | |
21aeefd5 JH |
656 | |
657 | # I think this is about the extent of tolerable intelligibility | |
658 | # among large modern Romance languages. | |
659 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French | |
660 | 'ca' => [qw(es pt it fr)], | |
661 | 'es' => [qw(ca it fr pt)], | |
662 | 'it' => [qw(es fr ca pt)], | |
663 | 'fr' => [qw(es it ca pt)], | |
664 | ||
665 | # Also assume that speakers of the main Indian languages prefer | |
666 | # to read/hear Hindi over English | |
667 | [qw( | |
668 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur | |
669 | )] => 'hi', | |
670 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, | |
671 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, | |
672 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. | |
673 | 'hi' => [qw(bn pa as or)], | |
674 | # I welcome finer data for the other Indian languages. | |
675 | # E.g., what should Oriya's list be, besides just Hindi? | |
676 | ||
677 | # And the panic languages for English is, of course, nil! | |
678 | ||
679 | # My guesses at Slavic intelligibility: | |
680 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian | |
681 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat | |
682 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak | |
683 | ||
684 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian | |
685 | ||
686 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish | |
687 | ||
688 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai | |
689 | ||
690 | ); | |
691 | my($k,$v); | |
692 | while(@panic) { | |
693 | ($k,$v) = splice(@panic,0,2); | |
694 | foreach my $k (ref($k) ? @$k : $k) { | |
695 | foreach my $v (ref($v) ? @$v : $v) { | |
696 | push @{$Panic{$k} ||= []}, $v unless $k eq $v; | |
697 | } | |
698 | } | |
699 | } | |
700 | } | |
701 | ||
702 | =item * the function @langs = panic_languages(@accept_languages) | |
703 | ||
704 | This function takes a list of 0 or more language | |
705 | tags that constitute a given user's Accept-Language list, and | |
706 | returns a list of tags for I<other> (non-super) | |
707 | languages that are probably acceptable to the user, to be | |
708 | used I<if all else fails>. | |
709 | ||
710 | For example, if a user accepts only 'ca' (Catalan) and | |
711 | 'es' (Spanish), and the documents/interfaces you have | |
712 | available are just in German, Italian, and Chinese, then | |
713 | the user will most likely want the Italian one (and not | |
714 | the Chinese or German one!), instead of getting | |
715 | nothing. So C<panic_languages('ca', 'es')> returns | |
716 | a list containing 'it' (Italian). | |
717 | ||
718 | English ('en') is I<always> in the return list, but | |
719 | whether it's at the very end or not depends | |
720 | on the input languages. This function works by consulting | |
721 | an internal table that stipulates what common | |
722 | languages are "close" to each other. | |
723 | ||
724 | A useful construct you might consider using is: | |
725 | ||
726 | @fallbacks = super_languages(@accept_languages); | |
727 | push @fallbacks, panic_languages( | |
728 | @accept_languages, @fallbacks, | |
729 | ); | |
730 | ||
731 | =cut | |
4b053158 | 732 | |
21aeefd5 JH |
733 | sub panic_languages { |
734 | # When in panic or in doubt, run in circles, scream, and shout! | |
735 | my(@out, %seen); | |
736 | foreach my $t (@_) { | |
737 | next unless $t; | |
738 | next if $seen{$t}++; # so we don't return it or hit it again | |
739 | # push @out, super_languages($t); # nah, keep that separate | |
740 | push @out, @{ $Panic{lc $t} || next }; | |
4b053158 | 741 | } |
21aeefd5 | 742 | return grep !$seen{$_}++, @out, 'en'; |
4b053158 JH |
743 | } |
744 | ||
8000a3fa RGS |
745 | #--------------------------------------------------------------------------- |
746 | #--------------------------------------------------------------------------- | |
747 | ||
748 | =item * the function implicate_supers( ...languages... ) | |
749 | ||
750 | This takes a list of strings (which are presumed to be language-tags; | |
751 | strings that aren't, are ignored); and after each one, this function | |
752 | inserts super-ordinate forms that don't already appear in the list. | |
753 | The original list, plus these insertions, is returned. | |
754 | ||
755 | In other words, it takes this: | |
756 | ||
757 | pt-br de-DE en-US fr pt-br-janeiro | |
758 | ||
759 | and returns this: | |
760 | ||
761 | pt-br pt de-DE de en-US en fr pt-br-janeiro | |
762 | ||
763 | This function is most useful in the idiom | |
764 | ||
765 | implicate_supers( I18N::LangTags::Detect::detect() ); | |
766 | ||
767 | (See L<I18N::LangTags::Detect>.) | |
768 | ||
769 | ||
770 | =item * the function implicate_supers_strictly( ...languages... ) | |
771 | ||
772 | This works like C<implicate_supers> except that the implicated | |
773 | forms are added to the end of the return list. | |
774 | ||
775 | In other words, implicate_supers_strictly takes a list of strings | |
776 | (which are presumed to be language-tags; strings that aren't, are | |
777 | ignored) and after the whole given list, it inserts the super-ordinate forms | |
778 | of all given tags, minus any tags that already appear in the input list. | |
779 | ||
780 | In other words, it takes this: | |
781 | ||
782 | pt-br de-DE en-US fr pt-br-janeiro | |
783 | ||
784 | and returns this: | |
785 | ||
786 | pt-br de-DE en-US fr pt-br-janeiro pt de en | |
787 | ||
788 | The reason this function has "_strictly" in its name is that when | |
789 | you're processing an Accept-Language list according to the RFCs, if | |
790 | you interpret the RFCs quite strictly, then you would use | |
791 | implicate_supers_strictly, but for normal use (i.e., common-sense use, | |
792 | as far as I'm concerned) you'd use implicate_supers. | |
793 | ||
794 | =cut | |
795 | ||
796 | sub implicate_supers { | |
797 | my @languages = grep is_language_tag($_), @_; | |
798 | my %seen_encoded; | |
799 | foreach my $lang (@languages) { | |
800 | $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 | |
801 | } | |
802 | ||
803 | my(@output_languages); | |
804 | foreach my $lang (@languages) { | |
805 | push @output_languages, $lang; | |
806 | foreach my $s ( I18N::LangTags::super_languages($lang) ) { | |
807 | # Note that super_languages returns the longest first. | |
808 | last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; | |
809 | push @output_languages, $s; | |
810 | } | |
811 | } | |
812 | return uniq( @output_languages ); | |
813 | ||
814 | } | |
815 | ||
816 | sub implicate_supers_strictly { | |
817 | my @tags = grep is_language_tag($_), @_; | |
818 | return uniq( @_, map super_languages($_), @_ ); | |
819 | } | |
820 | ||
821 | ||
822 | ||
4b053158 | 823 | ########################################################################### |
21aeefd5 JH |
824 | 1; |
825 | __END__ | |
4b053158 JH |
826 | |
827 | =back | |
828 | ||
829 | =head1 ABOUT LOWERCASING | |
830 | ||
831 | I've considered making all the above functions that output language | |
832 | tags return all those tags strictly in lowercase. Having all your | |
833 | language tags in lowercase does make some things easier. But you | |
834 | might as well just lowercase as you like, or call | |
835 | C<encode_language_tag($lang1)> where appropriate. | |
836 | ||
837 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS | |
838 | ||
839 | In some future version of I18N::LangTags, I plan to include support | |
840 | for RFC2482-style language tags -- which are basically just normal | |
841 | language tags with their ASCII characters shifted into Plane 14. | |
842 | ||
843 | =head1 SEE ALSO | |
844 | ||
e7525a17 JH |
845 | * L<I18N::LangTags::List|I18N::LangTags::List> |
846 | ||
4b053158 JH |
847 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the |
848 | Identification of Languages". (Obsoletes RFC 1766) | |
849 | ||
850 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on | |
851 | Character Sets and Languages". | |
852 | ||
853 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter | |
854 | Value and Encoded Word Extensions: Character Sets, Languages, and | |
855 | Continuations". | |
856 | ||
8000a3fa | 857 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, |
4b053158 JH |
858 | "Language Tagging in Unicode Plain Text". |
859 | ||
860 | * Locale::Codes, in | |
f70da2ef | 861 | C<http://www.perl.com/CPAN/modules/by-module/Locale/> |
4b053158 | 862 | |
4b053158 | 863 | * ISO 639-2, "Codes for the representation of names of languages", |
aaf52a42 JH |
864 | including two-letter and three-letter codes, |
865 | C<http://www.loc.gov/standards/iso639-2/langcodes.html> | |
4b053158 JH |
866 | |
867 | * The IANA list of registered languages (hopefully up-to-date), | |
aaf52a42 | 868 | C<http://www.iana.org/assignments/language-tags> |
4b053158 JH |
869 | |
870 | =head1 COPYRIGHT | |
871 | ||
77b20956 | 872 | Copyright (c) 1998+ Sean M. Burke. All rights reserved. |
4b053158 JH |
873 | |
874 | This library is free software; you can redistribute it and/or | |
875 | modify it under the same terms as Perl itself. | |
876 | ||
877 | The programs and documentation in this dist are distributed in | |
878 | the hope that they will be useful, but without any warranty; without | |
879 | even the implied warranty of merchantability or fitness for a | |
880 | particular purpose. | |
881 | ||
882 | =head1 AUTHOR | |
883 | ||
884 | Sean M. Burke C<sburke@cpan.org> | |
885 | ||
886 | =cut | |
887 |