Commit | Line | Data |
---|---|---|
4b053158 | 1 | |
acff0af7 | 2 | # Time-stamp: "2001-06-21 22:50:34 MDT" |
4b053158 JH |
3 | # Sean M. Burke <sburke@cpan.org> |
4 | ||
5 | require 5.000; | |
6 | package I18N::LangTags; | |
7 | use strict; | |
21aeefd5 | 8 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); |
4b053158 | 9 | require Exporter; |
4b053158 JH |
10 | @ISA = qw(Exporter); |
11 | @EXPORT = qw(); | |
12 | @EXPORT_OK = qw(is_language_tag same_language_tag | |
13 | extract_language_tags super_languages | |
14 | similarity_language_tag is_dialect_of | |
15 | locale2language_tag alternate_language_tags | |
21aeefd5 | 16 | encode_language_tag panic_languages |
4b053158 | 17 | ); |
21aeefd5 | 18 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); |
4b053158 | 19 | |
acff0af7 | 20 | $VERSION = "0.26"; |
4b053158 JH |
21 | |
22 | =head1 NAME | |
23 | ||
24 | I18N::LangTags - functions for dealing with RFC3066-style language tags | |
25 | ||
26 | =head1 SYNOPSIS | |
27 | ||
28 | use I18N::LangTags qw(is_language_tag same_language_tag | |
29 | extract_language_tags super_languages | |
30 | similarity_language_tag is_dialect_of | |
31 | locale2language_tag alternate_language_tags | |
21aeefd5 | 32 | encode_language_tag panic_languages |
4b053158 JH |
33 | ); |
34 | ||
35 | ...or whatever of those functions you want to import. Those are | |
36 | all the exportable functions -- you're free to import only some, | |
21aeefd5 JH |
37 | or none at all. By default, none are imported. If you say: |
38 | ||
39 | use I18N::LangTags qw(:ALL) | |
40 | ||
41 | ...then all are exported. (This saves you from having to use | |
42 | something less obvious like C<use I18N::LangTags qw(/./)>.) | |
4b053158 JH |
43 | |
44 | If you don't import any of these functions, assume a C<&I18N::LangTags::> | |
45 | in front of all the function names in the following examples. | |
46 | ||
47 | =head1 DESCRIPTION | |
48 | ||
49 | Language tags are a formalism, described in RFC 3066 (obsoleting | |
50 | 1766), for declaring what language form (language and possibly | |
51 | dialect) a given chunk of information is in. | |
52 | ||
53 | This library provides functions for common tasks involving language | |
54 | tags as they are needed in a variety of protocols and applications. | |
55 | ||
56 | Please see the "See Also" references for a thorough explanation | |
57 | of how to correctly use language tags. | |
58 | ||
59 | =over | |
60 | ||
61 | =cut | |
62 | ||
63 | ########################################################################### | |
64 | ||
65 | =item * the function is_language_tag($lang1) | |
66 | ||
67 | Returns true iff $lang1 is a formally valid language tag. | |
68 | ||
69 | is_language_tag("fr") is TRUE | |
70 | is_language_tag("x-jicarilla") is FALSE | |
71 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) | |
72 | ||
73 | is_language_tag("sgn-US") is TRUE | |
74 | (That's American Sign Language) | |
75 | ||
76 | is_language_tag("i-Klikitat") is TRUE | |
77 | (True without regard to the fact noone has actually | |
78 | registered Klikitat -- it's a formally valid tag) | |
79 | ||
80 | is_language_tag("fr-patois") is TRUE | |
81 | (Formally valid -- altho descriptively weak!) | |
82 | ||
83 | is_language_tag("Spanish") is FALSE | |
84 | is_language_tag("french-patois") is FALSE | |
85 | (No good -- first subtag has to match | |
86 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) | |
87 | ||
88 | is_language_tag("x-borg-prot2532") is TRUE | |
89 | (Yes, subtags can contain digits, as of RFC3066) | |
90 | ||
91 | =cut | |
92 | ||
93 | sub is_language_tag { | |
94 | ||
95 | ## Changes in the language tagging standards may have to be reflected here. | |
96 | ||
97 | my($tag) = lc($_[0]); | |
98 | ||
99 | return 0 if $tag eq "i" or $tag eq "x"; | |
21aeefd5 | 100 | # Bad degenerate cases that the following |
4b053158 JH |
101 | # regexp would erroneously let pass |
102 | ||
103 | return $tag =~ | |
104 | /^(?: # First subtag | |
105 | [xi] | [a-z]{2,3} | |
106 | ) | |
107 | (?: # Subtags thereafter | |
108 | - # separator | |
109 | [a-z0-9]{1,8} # subtag | |
110 | )* | |
111 | $/xs ? 1 : 0; | |
112 | } | |
113 | ||
114 | ########################################################################### | |
115 | ||
116 | =item * the function extract_language_tags($whatever) | |
117 | ||
118 | Returns a list of whatever looks like formally valid language tags | |
119 | in $whatever. Not very smart, so don't get too creative with | |
120 | what you want to feed it. | |
121 | ||
122 | extract_language_tags("fr, fr-ca, i-mingo") | |
123 | returns: ('fr', 'fr-ca', 'i-mingo') | |
124 | ||
125 | extract_language_tags("It's like this: I'm in fr -- French!") | |
126 | returns: ('It', 'in', 'fr') | |
127 | (So don't just feed it any old thing.) | |
128 | ||
129 | The output is untainted. If you don't know what tainting is, | |
130 | don't worry about it. | |
131 | ||
132 | =cut | |
133 | ||
134 | sub extract_language_tags { | |
135 | ||
136 | ## Changes in the language tagging standards may have to be reflected here. | |
137 | ||
138 | my($text) = | |
139 | $_[0] =~ m/(.+)/ # to make for an untainted result | |
140 | ? $1 : '' | |
141 | ; | |
142 | ||
143 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags | |
144 | $text =~ | |
145 | m/ | |
146 | \b | |
147 | (?: # First subtag | |
148 | [iIxX] | [a-zA-Z]{2,3} | |
149 | ) | |
150 | (?: # Subtags thereafter | |
151 | - # separator | |
152 | [a-zA-Z0-9]{1,8} # subtag | |
153 | )* | |
154 | \b | |
155 | /xsg | |
156 | ); | |
157 | } | |
158 | ||
159 | ########################################################################### | |
160 | ||
161 | =item * the function same_language_tag($lang1, $lang2) | |
162 | ||
163 | Returns true iff $lang1 and $lang2 are acceptable variant tags | |
164 | representing the same language-form. | |
165 | ||
166 | same_language_tag('x-kadara', 'i-kadara') is TRUE | |
167 | (The x/i- alternation doesn't matter) | |
168 | same_language_tag('X-KADARA', 'i-kadara') is TRUE | |
169 | (...and neither does case) | |
170 | same_language_tag('en', 'en-US') is FALSE | |
171 | (all-English is not the SAME as US English) | |
172 | same_language_tag('x-kadara', 'x-kadar') is FALSE | |
173 | (these are totally unrelated tags) | |
21aeefd5 JH |
174 | same_language_tag('no-bok', 'nb') is TRUE |
175 | (no-bok is a legacy tag for nb (Norwegian Bokmal)) | |
4b053158 JH |
176 | |
177 | C<same_language_tag> works by just seeing whether | |
178 | C<encode_language_tag($lang1)> is the same as | |
179 | C<encode_language_tag($lang2)>. | |
180 | ||
181 | (Yes, I know this function is named a bit oddly. Call it historic | |
182 | reasons.) | |
183 | ||
184 | =cut | |
185 | ||
186 | sub same_language_tag { | |
187 | my $el1 = &encode_language_tag($_[0]); | |
188 | return 0 unless defined $el1; | |
189 | # this avoids the problem of | |
190 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) | |
191 | # being true if $lang1 and $lang2 are both undef | |
192 | ||
193 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; | |
194 | } | |
195 | ||
196 | ########################################################################### | |
197 | ||
198 | =item * the function similarity_language_tag($lang1, $lang2) | |
199 | ||
200 | Returns an integer representing the degree of similarity between | |
201 | tags $lang1 and $lang2 (the order of which does not matter), where | |
202 | similarity is the number of common elements on the left, | |
203 | without regard to case and to x/i- alternation. | |
204 | ||
205 | similarity_language_tag('fr', 'fr-ca') is 1 | |
206 | (one element in common) | |
207 | similarity_language_tag('fr-ca', 'fr-FR') is 1 | |
208 | (one element in common) | |
209 | ||
210 | similarity_language_tag('fr-CA-joual', | |
211 | 'fr-CA-PEI') is 2 | |
212 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 | |
213 | (two elements in common) | |
214 | ||
215 | similarity_language_tag('x-kadara', 'i-kadara') is 1 | |
216 | (x/i- doesn't matter) | |
217 | ||
218 | similarity_language_tag('en', 'x-kadar') is 0 | |
219 | similarity_language_tag('x-kadara', 'x-kadar') is 0 | |
220 | (unrelated tags -- no similarity) | |
221 | ||
222 | similarity_language_tag('i-cree-syllabic', | |
223 | 'i-cherokee-syllabic') is 0 | |
224 | (no B<leftmost> elements in common!) | |
225 | ||
226 | =cut | |
227 | ||
228 | sub similarity_language_tag { | |
229 | my $lang1 = &encode_language_tag($_[0]); | |
230 | my $lang2 = &encode_language_tag($_[1]); | |
21aeefd5 JH |
231 | # And encode_language_tag takes care of the whole |
232 | # no-nyn==nn, i-hakka==zh-hakka, etc, things | |
233 | ||
4b053158 JH |
234 | # NB: (i-sil-...)? (i-sgn-...)? |
235 | ||
236 | return undef if !defined($lang1) and !defined($lang2); | |
237 | return 0 if !defined($lang1) or !defined($lang2); | |
238 | ||
239 | my @l1_subtags = split('-', $lang1); | |
240 | my @l2_subtags = split('-', $lang2); | |
241 | my $similarity = 0; | |
242 | ||
243 | while(@l1_subtags and @l2_subtags) { | |
244 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { | |
245 | ++$similarity; | |
246 | } else { | |
247 | last; | |
248 | } | |
249 | } | |
250 | return $similarity; | |
251 | } | |
252 | ||
253 | ########################################################################### | |
254 | ||
255 | =item * the function is_dialect_of($lang1, $lang2) | |
256 | ||
21aeefd5 | 257 | Returns true iff language tag $lang1 represents a subform of |
4b053158 JH |
258 | language tag $lang2. |
259 | ||
260 | B<Get the order right! It doesn't work the other way around!> | |
261 | ||
262 | is_dialect_of('en-US', 'en') is TRUE | |
263 | (American English IS a dialect of all-English) | |
264 | ||
265 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE | |
266 | is_dialect_of('fr-CA-joual', 'fr') is TRUE | |
267 | (Joual is a dialect of (a dialect of) French) | |
268 | ||
269 | is_dialect_of('en', 'en-US') is FALSE | |
270 | (all-English is a NOT dialect of American English) | |
271 | ||
272 | is_dialect_of('fr', 'en-CA') is FALSE | |
273 | ||
21aeefd5 JH |
274 | is_dialect_of('en', 'en' ) is TRUE |
275 | is_dialect_of('en-US', 'en-US') is TRUE | |
4b053158 JH |
276 | (B<Note:> these are degenerate cases) |
277 | ||
278 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE | |
279 | (the x/i thing doesn't matter, nor does case) | |
280 | ||
21aeefd5 JH |
281 | is_dialect_of('nn', 'no') is TRUE |
282 | (because 'nn' (New Norse) is aliased to 'no-nyn', | |
283 | as a special legacy case, and 'no-nyn' is a | |
284 | subform of 'no' (Norwegian)) | |
285 | ||
4b053158 JH |
286 | =cut |
287 | ||
288 | sub is_dialect_of { | |
289 | ||
290 | my $lang1 = &encode_language_tag($_[0]); | |
291 | my $lang2 = &encode_language_tag($_[1]); | |
292 | ||
293 | return undef if !defined($lang1) and !defined($lang2); | |
294 | return 0 if !defined($lang1) or !defined($lang2); | |
295 | ||
296 | return 1 if $lang1 eq $lang2; | |
297 | return 0 if length($lang1) < length($lang2); | |
298 | ||
299 | $lang1 .= '-'; | |
300 | $lang2 .= '-'; | |
301 | return | |
302 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; | |
303 | } | |
304 | ||
305 | ########################################################################### | |
306 | ||
307 | =item * the function super_languages($lang1) | |
308 | ||
309 | Returns a list of language tags that are superordinate tags to $lang1 | |
310 | -- it gets this by removing subtags from the end of $lang1 until | |
311 | nothing (or just "i" or "x") is left. | |
312 | ||
313 | super_languages("fr-CA-joual") is ("fr-CA", "fr") | |
314 | ||
315 | super_languages("en-AU") is ("en") | |
316 | ||
317 | super_languages("en") is empty-list, () | |
318 | ||
319 | super_languages("i-cherokee") is empty-list, () | |
320 | ...not ("i"), which would be illegal as well as pointless. | |
321 | ||
322 | If $lang1 is not a valid language tag, returns empty-list in | |
323 | a list context, undef in a scalar context. | |
324 | ||
325 | A notable and rather unavoidable problem with this method: | |
326 | "x-mingo-tom" has an "x" because the whole tag isn't an | |
327 | IANA-registered tag -- but super_languages('x-mingo-tom') is | |
328 | ('x-mingo') -- which isn't really right, since 'i-mingo' is | |
329 | registered. But this module has no way of knowing that. (But note | |
330 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) | |
331 | ||
332 | More importantly, you assume I<at your peril> that superordinates of | |
333 | $lang1 are mutually intelligible with $lang1. Consider this | |
334 | carefully. | |
335 | ||
336 | =cut | |
337 | ||
338 | sub super_languages { | |
339 | my $lang1 = $_[0]; | |
340 | return() unless defined($lang1) && &is_language_tag($lang1); | |
21aeefd5 JH |
341 | |
342 | # a hack for those annoying new (2001) tags: | |
343 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards | |
344 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards | |
345 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way | |
346 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark | |
347 | ||
4b053158 JH |
348 | my @l1_subtags = split('-', $lang1); |
349 | ||
350 | ## Changes in the language tagging standards may have to be reflected here. | |
351 | ||
352 | # NB: (i-sil-...)? | |
353 | ||
354 | my @supers = (); | |
355 | foreach my $bit (@l1_subtags) { | |
356 | push @supers, | |
357 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; | |
358 | } | |
359 | pop @supers if @supers; | |
360 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; | |
361 | return reverse @supers; | |
362 | } | |
363 | ||
364 | ########################################################################### | |
365 | ||
366 | =item * the function locale2language_tag($locale_identifier) | |
367 | ||
368 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") | |
369 | and maps it to a language tag. If it's not mappable (as with, | |
370 | notably, "C" and "POSIX"), this returns empty-list in a list context, | |
371 | or undef in a scalar context. | |
372 | ||
373 | locale2language_tag("en") is "en" | |
374 | ||
375 | locale2language_tag("en_US") is "en-US" | |
376 | ||
377 | locale2language_tag("en_US.ISO8859-1") is "en-US" | |
378 | ||
379 | locale2language_tag("C") is undef or () | |
380 | ||
381 | locale2language_tag("POSIX") is undef or () | |
382 | ||
383 | locale2language_tag("POSIX") is undef or () | |
384 | ||
385 | I'm not totally sure that locale names map satisfactorily to language | |
386 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. | |
387 | ||
388 | The output is untainted. If you don't know what tainting is, | |
389 | don't worry about it. | |
390 | ||
391 | =cut | |
392 | ||
393 | sub locale2language_tag { | |
394 | my $lang = | |
395 | $_[0] =~ m/(.+)/ # to make for an untainted result | |
396 | ? $1 : '' | |
397 | ; | |
398 | ||
399 | return $lang if &is_language_tag($lang); # like "en" | |
400 | ||
401 | $lang =~ tr<_><->; # "en_US" -> en-US | |
402 | $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US | |
403 | ||
404 | return $lang if &is_language_tag($lang); | |
405 | ||
406 | return; | |
407 | } | |
408 | ||
409 | ########################################################################### | |
410 | ||
411 | =item * the function encode_language_tag($lang1) | |
412 | ||
413 | This function, if given a language tag, returns an encoding of it such | |
414 | that: | |
415 | ||
416 | * tags representing different languages never get the same encoding. | |
417 | ||
418 | * tags representing the same language always get the same encoding. | |
419 | ||
420 | * an encoding of a formally valid language tag always is a string | |
421 | value that is defined, has length, and is true if considered as a | |
422 | boolean. | |
423 | ||
424 | Note that the encoding itself is B<not> a formally valid language tag. | |
425 | Note also that you cannot, currently, go from an encoding back to a | |
426 | language tag that it's an encoding of. | |
427 | ||
428 | Note also that you B<must> consider the encoded value as atomic; i.e., | |
429 | you should not consider it as anything but an opaque, unanalysable | |
430 | string value. (The internals of the encoding method may change in | |
431 | future versions, as the language tagging standard changes over time.) | |
432 | ||
433 | C<encode_language_tag> returns undef if given anything other than a | |
434 | formally valid language tag. | |
435 | ||
436 | The reason C<encode_language_tag> exists is because different language | |
437 | tags may represent the same language; this is normally treatable with | |
438 | C<same_language_tag>, but consider this situation: | |
439 | ||
440 | You have a data file that expresses greetings in different languages. | |
441 | Its format is "[language tag]=[how to say 'Hello']", like: | |
442 | ||
443 | en-US=Hiho | |
444 | fr=Bonjour | |
445 | i-mingo=Hau' | |
446 | ||
447 | And suppose you write a program that reads that file and then runs as | |
448 | a daemon, answering client requests that specify a language tag and | |
449 | then expect the string that says how to greet in that language. So an | |
450 | interaction looks like: | |
451 | ||
452 | greeting-client asks: fr | |
453 | greeting-server answers: Bonjour | |
454 | ||
455 | So far so good. But suppose the way you're implementing this is: | |
456 | ||
457 | my %greetings; | |
458 | die unless open(IN, "<in.dat"); | |
459 | while(<IN>) { | |
460 | chomp; | |
461 | next unless /^([^=]+)=(.+)/s; | |
462 | my($lang, $expr) = ($1, $2); | |
463 | $greetings{$lang} = $expr; | |
464 | } | |
465 | close(IN); | |
466 | ||
467 | at which point %greetings has the contents: | |
468 | ||
469 | "en-US" => "Hiho" | |
470 | "fr" => "Bonjour" | |
471 | "i-mingo" => "Hau'" | |
472 | ||
473 | And suppose then that you answer client requests for language $wanted | |
474 | by just looking up $greetings{$wanted}. | |
475 | ||
476 | If the client asks for "fr", that will look up successfully in | |
477 | %greetings, to the value "Bonjour". And if the client asks for | |
478 | "i-mingo", that will look up successfully in %greetings, to the value | |
479 | "Hau'". | |
480 | ||
481 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the | |
482 | lookup in %greetings fails. That's the Wrong Thing. | |
483 | ||
484 | You could instead do lookups on $wanted with: | |
485 | ||
486 | use I18N::LangTags qw(same_language_tag); | |
487 | my $repsonse = ''; | |
488 | foreach my $l2 (keys %greetings) { | |
489 | if(same_language_tag($wanted, $l2)) { | |
490 | $response = $greetings{$l2}; | |
491 | last; | |
492 | } | |
493 | } | |
494 | ||
495 | But that's rather inefficient. A better way to do it is to start your | |
496 | program with: | |
497 | ||
498 | use I18N::LangTags qw(encode_language_tag); | |
499 | my %greetings; | |
500 | die unless open(IN, "<in.dat"); | |
501 | while(<IN>) { | |
502 | chomp; | |
503 | next unless /^([^=]+)=(.+)/s; | |
504 | my($lang, $expr) = ($1, $2); | |
505 | $greetings{ | |
506 | encode_language_tag($lang) | |
507 | } = $expr; | |
508 | } | |
509 | close(IN); | |
510 | ||
511 | and then just answer client requests for language $wanted by just | |
512 | looking up | |
513 | ||
514 | $greetings{encode_language_tag($wanted)} | |
515 | ||
516 | And that does the Right Thing. | |
517 | ||
518 | =cut | |
519 | ||
520 | sub encode_language_tag { | |
521 | # Only similarity_language_tag() is allowed to analyse encodings! | |
522 | ||
523 | ## Changes in the language tagging standards may have to be reflected here. | |
524 | ||
21aeefd5 | 525 | my($tag) = $_[0] || return undef; |
4b053158 | 526 | return undef unless &is_language_tag($tag); |
21aeefd5 JH |
527 | |
528 | # For the moment, these legacy variances are few enough that | |
529 | # we can just handle them here with regexps. | |
530 | $tag =~ s/^iw\b/he/i; # Hebrew | |
531 | $tag =~ s/^in\b/id/i; # Indonesian | |
532 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger | |
533 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo | |
534 | $tag =~ s/^ji\b/yi/i; # Yiddish | |
535 | # | |
536 | # These go FROM the simplex to complex form, to get | |
537 | # similarity-comparison right. And that's okay, since | |
538 | # similarity_language_tag is the only thing that | |
539 | # analyzes our output. | |
540 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka | |
541 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal | |
542 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk | |
4b053158 JH |
543 | |
544 | $tag =~ s/^[xiXI]-//s; | |
545 | # Just lop off any leading "x/i-" | |
4b053158 | 546 | |
21aeefd5 | 547 | return "~" . uc($tag); |
4b053158 JH |
548 | } |
549 | ||
550 | #-------------------------------------------------------------------------- | |
551 | ||
552 | =item * the function alternate_language_tags($lang1) | |
553 | ||
554 | This function, if given a language tag, returns all language tags that | |
21aeefd5 JH |
555 | are alternate forms of this language tag. (I.e., tags which refer to |
556 | the same language.) This is meant to handle legacy tags caused by | |
557 | the minor changes in language tag standards over the years; and | |
558 | the x-/i- alternation is also dealt with. | |
559 | ||
560 | Note that this function does I<not> try to equate new (and never-used, | |
561 | and unusable) | |
562 | ISO639-2 three-letter tags to old (and still in use) ISO639-1 | |
563 | two-letter equivalents -- like "ara" -> "ar" -- because | |
564 | "ara" has I<never> been in use as an Internet language tag, | |
565 | and RFC 3066 stipulates that it never should be, since a shorter | |
566 | tag ("ar") exists. | |
567 | ||
568 | Examples: | |
569 | ||
570 | alternate_language_tags('no-bok') is ('nb') | |
571 | alternate_language_tags('nb') is ('no-bok') | |
572 | alternate_language_tags('he') is ('iw') | |
573 | alternate_language_tags('iw') is ('he') | |
574 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') | |
575 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') | |
576 | alternate_language_tags('en') is () | |
577 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') | |
578 | alternate_language_tags('x-klikitat') is ('i-klikitat') | |
579 | alternate_language_tags('i-klikitat') is ('x-klikitat') | |
580 | ||
581 | This function returns empty-list if given anything other than a formally | |
4b053158 JH |
582 | valid language tag. |
583 | ||
584 | =cut | |
585 | ||
586 | my %alt = qw( i x x i I X X I ); | |
587 | sub alternate_language_tags { | |
4b053158 JH |
588 | my $tag = $_[0]; |
589 | return() unless &is_language_tag($tag); | |
590 | ||
21aeefd5 JH |
591 | my @em; # push 'em real goood! |
592 | ||
593 | # For the moment, these legacy variances are few enough that | |
594 | # we can just handle them here with regexps. | |
595 | ||
596 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; | |
597 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; | |
598 | ||
599 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; | |
600 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; | |
601 | ||
602 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; | |
603 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; | |
604 | ||
605 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; | |
606 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; | |
607 | ||
608 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; | |
609 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; | |
610 | ||
611 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; | |
612 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; | |
613 | ||
614 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; | |
615 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; | |
616 | ||
617 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; | |
618 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; | |
619 | } | |
620 | ||
621 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; | |
622 | return @em; | |
623 | } | |
624 | ||
625 | ########################################################################### | |
626 | ||
627 | { | |
628 | # Init %Panic... | |
629 | ||
630 | my @panic = ( # MUST all be lowercase! | |
631 | # Only large ("national") languages make it in this list. | |
632 | # If you, as a user, are so bizarre that the /only/ language | |
633 | # you claim to accept is Galician, then no, we won't do you | |
634 | # the favor of providing Catalan as a panic-fallback for | |
635 | # you. Because if I start trying to add "little languages" in | |
636 | # here, I'll just go crazy. | |
637 | ||
4cf5bee0 JH |
638 | # Scandinavian lgs. All based on opinion and hearsay. |
639 | 'sv' => [qw(nb no da nn)], | |
640 | 'da' => [qw(nb no sv nn)], # I guess | |
641 | [qw(no nn nb)], [qw(no nn nb sv da)], | |
642 | 'is' => [qw(da sv no nb nn)], | |
643 | 'fo' => [qw(da is no nb nn sv)], # I guess | |
21aeefd5 JH |
644 | |
645 | # I think this is about the extent of tolerable intelligibility | |
646 | # among large modern Romance languages. | |
647 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French | |
648 | 'ca' => [qw(es pt it fr)], | |
649 | 'es' => [qw(ca it fr pt)], | |
650 | 'it' => [qw(es fr ca pt)], | |
651 | 'fr' => [qw(es it ca pt)], | |
652 | ||
653 | # Also assume that speakers of the main Indian languages prefer | |
654 | # to read/hear Hindi over English | |
655 | [qw( | |
656 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur | |
657 | )] => 'hi', | |
658 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, | |
659 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, | |
660 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. | |
661 | 'hi' => [qw(bn pa as or)], | |
662 | # I welcome finer data for the other Indian languages. | |
663 | # E.g., what should Oriya's list be, besides just Hindi? | |
664 | ||
665 | # And the panic languages for English is, of course, nil! | |
666 | ||
667 | # My guesses at Slavic intelligibility: | |
668 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian | |
669 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat | |
670 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak | |
671 | ||
672 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian | |
673 | ||
674 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish | |
675 | ||
676 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai | |
677 | ||
678 | ); | |
679 | my($k,$v); | |
680 | while(@panic) { | |
681 | ($k,$v) = splice(@panic,0,2); | |
682 | foreach my $k (ref($k) ? @$k : $k) { | |
683 | foreach my $v (ref($v) ? @$v : $v) { | |
684 | push @{$Panic{$k} ||= []}, $v unless $k eq $v; | |
685 | } | |
686 | } | |
687 | } | |
688 | } | |
689 | ||
690 | =item * the function @langs = panic_languages(@accept_languages) | |
691 | ||
692 | This function takes a list of 0 or more language | |
693 | tags that constitute a given user's Accept-Language list, and | |
694 | returns a list of tags for I<other> (non-super) | |
695 | languages that are probably acceptable to the user, to be | |
696 | used I<if all else fails>. | |
697 | ||
698 | For example, if a user accepts only 'ca' (Catalan) and | |
699 | 'es' (Spanish), and the documents/interfaces you have | |
700 | available are just in German, Italian, and Chinese, then | |
701 | the user will most likely want the Italian one (and not | |
702 | the Chinese or German one!), instead of getting | |
703 | nothing. So C<panic_languages('ca', 'es')> returns | |
704 | a list containing 'it' (Italian). | |
705 | ||
706 | English ('en') is I<always> in the return list, but | |
707 | whether it's at the very end or not depends | |
708 | on the input languages. This function works by consulting | |
709 | an internal table that stipulates what common | |
710 | languages are "close" to each other. | |
711 | ||
712 | A useful construct you might consider using is: | |
713 | ||
714 | @fallbacks = super_languages(@accept_languages); | |
715 | push @fallbacks, panic_languages( | |
716 | @accept_languages, @fallbacks, | |
717 | ); | |
718 | ||
719 | =cut | |
4b053158 | 720 | |
21aeefd5 JH |
721 | sub panic_languages { |
722 | # When in panic or in doubt, run in circles, scream, and shout! | |
723 | my(@out, %seen); | |
724 | foreach my $t (@_) { | |
725 | next unless $t; | |
726 | next if $seen{$t}++; # so we don't return it or hit it again | |
727 | # push @out, super_languages($t); # nah, keep that separate | |
728 | push @out, @{ $Panic{lc $t} || next }; | |
4b053158 | 729 | } |
21aeefd5 | 730 | return grep !$seen{$_}++, @out, 'en'; |
4b053158 JH |
731 | } |
732 | ||
733 | ########################################################################### | |
21aeefd5 JH |
734 | 1; |
735 | __END__ | |
4b053158 JH |
736 | |
737 | =back | |
738 | ||
739 | =head1 ABOUT LOWERCASING | |
740 | ||
741 | I've considered making all the above functions that output language | |
742 | tags return all those tags strictly in lowercase. Having all your | |
743 | language tags in lowercase does make some things easier. But you | |
744 | might as well just lowercase as you like, or call | |
745 | C<encode_language_tag($lang1)> where appropriate. | |
746 | ||
747 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS | |
748 | ||
749 | In some future version of I18N::LangTags, I plan to include support | |
750 | for RFC2482-style language tags -- which are basically just normal | |
751 | language tags with their ASCII characters shifted into Plane 14. | |
752 | ||
753 | =head1 SEE ALSO | |
754 | ||
e7525a17 JH |
755 | * L<I18N::LangTags::List|I18N::LangTags::List> |
756 | ||
4b053158 JH |
757 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the |
758 | Identification of Languages". (Obsoletes RFC 1766) | |
759 | ||
760 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on | |
761 | Character Sets and Languages". | |
762 | ||
763 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter | |
764 | Value and Encoded Word Extensions: Character Sets, Languages, and | |
765 | Continuations". | |
766 | ||
767 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, | |
768 | "Language Tagging in Unicode Plain Text". | |
769 | ||
770 | * Locale::Codes, in | |
771 | C<http://www.perl.com/CPAN/modules/by-module/Locale/> | |
772 | ||
773 | * ISO 639, "Code for the representation of names of languages", | |
774 | C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html> | |
775 | ||
776 | * ISO 639-2, "Codes for the representation of names of languages", | |
777 | including three-letter codes, | |
778 | C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html> | |
779 | ||
780 | * The IANA list of registered languages (hopefully up-to-date), | |
781 | C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/> | |
782 | ||
783 | =head1 COPYRIGHT | |
784 | ||
785 | Copyright (c) 1998-2001 Sean M. Burke. All rights reserved. | |
786 | ||
787 | This library is free software; you can redistribute it and/or | |
788 | modify it under the same terms as Perl itself. | |
789 | ||
790 | The programs and documentation in this dist are distributed in | |
791 | the hope that they will be useful, but without any warranty; without | |
792 | even the implied warranty of merchantability or fitness for a | |
793 | particular purpose. | |
794 | ||
795 | =head1 AUTHOR | |
796 | ||
797 | Sean M. Burke C<sburke@cpan.org> | |
798 | ||
799 | =cut | |
800 |