This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Try ExtUtils::Command.t everywhere, not just on Win32
[perl5.git] / lib / I18N / LangTags.pm
CommitLineData
4b053158 1
acff0af7 2# Time-stamp: "2001-06-21 22:50:34 MDT"
4b053158
JH
3# Sean M. Burke <sburke@cpan.org>
4
5require 5.000;
6package I18N::LangTags;
7use strict;
21aeefd5 8use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
4b053158 9require Exporter;
4b053158
JH
10@ISA = qw(Exporter);
11@EXPORT = qw();
12@EXPORT_OK = qw(is_language_tag same_language_tag
13 extract_language_tags super_languages
14 similarity_language_tag is_dialect_of
15 locale2language_tag alternate_language_tags
21aeefd5 16 encode_language_tag panic_languages
4b053158 17 );
21aeefd5 18%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
4b053158 19
acff0af7 20$VERSION = "0.26";
4b053158
JH
21
22=head1 NAME
23
24I18N::LangTags - functions for dealing with RFC3066-style language tags
25
26=head1 SYNOPSIS
27
28 use I18N::LangTags qw(is_language_tag same_language_tag
29 extract_language_tags super_languages
30 similarity_language_tag is_dialect_of
31 locale2language_tag alternate_language_tags
21aeefd5 32 encode_language_tag panic_languages
4b053158
JH
33 );
34
35...or whatever of those functions you want to import. Those are
36all the exportable functions -- you're free to import only some,
21aeefd5
JH
37or none at all. By default, none are imported. If you say:
38
39 use I18N::LangTags qw(:ALL)
40
41...then all are exported. (This saves you from having to use
42something less obvious like C<use I18N::LangTags qw(/./)>.)
4b053158
JH
43
44If you don't import any of these functions, assume a C<&I18N::LangTags::>
45in front of all the function names in the following examples.
46
47=head1 DESCRIPTION
48
49Language tags are a formalism, described in RFC 3066 (obsoleting
501766), for declaring what language form (language and possibly
51dialect) a given chunk of information is in.
52
53This library provides functions for common tasks involving language
54tags as they are needed in a variety of protocols and applications.
55
56Please see the "See Also" references for a thorough explanation
57of how to correctly use language tags.
58
59=over
60
61=cut
62
63###########################################################################
64
65=item * the function is_language_tag($lang1)
66
67Returns true iff $lang1 is a formally valid language tag.
68
69 is_language_tag("fr") is TRUE
70 is_language_tag("x-jicarilla") is FALSE
71 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
72
73 is_language_tag("sgn-US") is TRUE
74 (That's American Sign Language)
75
76 is_language_tag("i-Klikitat") is TRUE
77 (True without regard to the fact noone has actually
78 registered Klikitat -- it's a formally valid tag)
79
80 is_language_tag("fr-patois") is TRUE
81 (Formally valid -- altho descriptively weak!)
82
83 is_language_tag("Spanish") is FALSE
84 is_language_tag("french-patois") is FALSE
85 (No good -- first subtag has to match
86 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
87
88 is_language_tag("x-borg-prot2532") is TRUE
89 (Yes, subtags can contain digits, as of RFC3066)
90
91=cut
92
93sub is_language_tag {
94
95 ## Changes in the language tagging standards may have to be reflected here.
96
97 my($tag) = lc($_[0]);
98
99 return 0 if $tag eq "i" or $tag eq "x";
21aeefd5 100 # Bad degenerate cases that the following
4b053158
JH
101 # regexp would erroneously let pass
102
103 return $tag =~
104 /^(?: # First subtag
105 [xi] | [a-z]{2,3}
106 )
107 (?: # Subtags thereafter
108 - # separator
109 [a-z0-9]{1,8} # subtag
110 )*
111 $/xs ? 1 : 0;
112}
113
114###########################################################################
115
116=item * the function extract_language_tags($whatever)
117
118Returns a list of whatever looks like formally valid language tags
119in $whatever. Not very smart, so don't get too creative with
120what you want to feed it.
121
122 extract_language_tags("fr, fr-ca, i-mingo")
123 returns: ('fr', 'fr-ca', 'i-mingo')
124
125 extract_language_tags("It's like this: I'm in fr -- French!")
126 returns: ('It', 'in', 'fr')
127 (So don't just feed it any old thing.)
128
129The output is untainted. If you don't know what tainting is,
130don't worry about it.
131
132=cut
133
134sub extract_language_tags {
135
136 ## Changes in the language tagging standards may have to be reflected here.
137
138 my($text) =
139 $_[0] =~ m/(.+)/ # to make for an untainted result
140 ? $1 : ''
141 ;
142
143 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
144 $text =~
145 m/
146 \b
147 (?: # First subtag
148 [iIxX] | [a-zA-Z]{2,3}
149 )
150 (?: # Subtags thereafter
151 - # separator
152 [a-zA-Z0-9]{1,8} # subtag
153 )*
154 \b
155 /xsg
156 );
157}
158
159###########################################################################
160
161=item * the function same_language_tag($lang1, $lang2)
162
163Returns true iff $lang1 and $lang2 are acceptable variant tags
164representing the same language-form.
165
166 same_language_tag('x-kadara', 'i-kadara') is TRUE
167 (The x/i- alternation doesn't matter)
168 same_language_tag('X-KADARA', 'i-kadara') is TRUE
169 (...and neither does case)
170 same_language_tag('en', 'en-US') is FALSE
171 (all-English is not the SAME as US English)
172 same_language_tag('x-kadara', 'x-kadar') is FALSE
173 (these are totally unrelated tags)
21aeefd5
JH
174 same_language_tag('no-bok', 'nb') is TRUE
175 (no-bok is a legacy tag for nb (Norwegian Bokmal))
4b053158
JH
176
177C<same_language_tag> works by just seeing whether
178C<encode_language_tag($lang1)> is the same as
179C<encode_language_tag($lang2)>.
180
181(Yes, I know this function is named a bit oddly. Call it historic
182reasons.)
183
184=cut
185
186sub same_language_tag {
187 my $el1 = &encode_language_tag($_[0]);
188 return 0 unless defined $el1;
189 # this avoids the problem of
190 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
191 # being true if $lang1 and $lang2 are both undef
192
193 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
194}
195
196###########################################################################
197
198=item * the function similarity_language_tag($lang1, $lang2)
199
200Returns an integer representing the degree of similarity between
201tags $lang1 and $lang2 (the order of which does not matter), where
202similarity is the number of common elements on the left,
203without regard to case and to x/i- alternation.
204
205 similarity_language_tag('fr', 'fr-ca') is 1
206 (one element in common)
207 similarity_language_tag('fr-ca', 'fr-FR') is 1
208 (one element in common)
209
210 similarity_language_tag('fr-CA-joual',
211 'fr-CA-PEI') is 2
212 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
213 (two elements in common)
214
215 similarity_language_tag('x-kadara', 'i-kadara') is 1
216 (x/i- doesn't matter)
217
218 similarity_language_tag('en', 'x-kadar') is 0
219 similarity_language_tag('x-kadara', 'x-kadar') is 0
220 (unrelated tags -- no similarity)
221
222 similarity_language_tag('i-cree-syllabic',
223 'i-cherokee-syllabic') is 0
224 (no B<leftmost> elements in common!)
225
226=cut
227
228sub similarity_language_tag {
229 my $lang1 = &encode_language_tag($_[0]);
230 my $lang2 = &encode_language_tag($_[1]);
21aeefd5
JH
231 # And encode_language_tag takes care of the whole
232 # no-nyn==nn, i-hakka==zh-hakka, etc, things
233
4b053158
JH
234 # NB: (i-sil-...)? (i-sgn-...)?
235
236 return undef if !defined($lang1) and !defined($lang2);
237 return 0 if !defined($lang1) or !defined($lang2);
238
239 my @l1_subtags = split('-', $lang1);
240 my @l2_subtags = split('-', $lang2);
241 my $similarity = 0;
242
243 while(@l1_subtags and @l2_subtags) {
244 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
245 ++$similarity;
246 } else {
247 last;
248 }
249 }
250 return $similarity;
251}
252
253###########################################################################
254
255=item * the function is_dialect_of($lang1, $lang2)
256
21aeefd5 257Returns true iff language tag $lang1 represents a subform of
4b053158
JH
258language tag $lang2.
259
260B<Get the order right! It doesn't work the other way around!>
261
262 is_dialect_of('en-US', 'en') is TRUE
263 (American English IS a dialect of all-English)
264
265 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
266 is_dialect_of('fr-CA-joual', 'fr') is TRUE
267 (Joual is a dialect of (a dialect of) French)
268
269 is_dialect_of('en', 'en-US') is FALSE
270 (all-English is a NOT dialect of American English)
271
272 is_dialect_of('fr', 'en-CA') is FALSE
273
21aeefd5
JH
274 is_dialect_of('en', 'en' ) is TRUE
275 is_dialect_of('en-US', 'en-US') is TRUE
4b053158
JH
276 (B<Note:> these are degenerate cases)
277
278 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
279 (the x/i thing doesn't matter, nor does case)
280
21aeefd5
JH
281 is_dialect_of('nn', 'no') is TRUE
282 (because 'nn' (New Norse) is aliased to 'no-nyn',
283 as a special legacy case, and 'no-nyn' is a
284 subform of 'no' (Norwegian))
285
4b053158
JH
286=cut
287
288sub is_dialect_of {
289
290 my $lang1 = &encode_language_tag($_[0]);
291 my $lang2 = &encode_language_tag($_[1]);
292
293 return undef if !defined($lang1) and !defined($lang2);
294 return 0 if !defined($lang1) or !defined($lang2);
295
296 return 1 if $lang1 eq $lang2;
297 return 0 if length($lang1) < length($lang2);
298
299 $lang1 .= '-';
300 $lang2 .= '-';
301 return
302 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
303}
304
305###########################################################################
306
307=item * the function super_languages($lang1)
308
309Returns a list of language tags that are superordinate tags to $lang1
310-- it gets this by removing subtags from the end of $lang1 until
311nothing (or just "i" or "x") is left.
312
313 super_languages("fr-CA-joual") is ("fr-CA", "fr")
314
315 super_languages("en-AU") is ("en")
316
317 super_languages("en") is empty-list, ()
318
319 super_languages("i-cherokee") is empty-list, ()
320 ...not ("i"), which would be illegal as well as pointless.
321
322If $lang1 is not a valid language tag, returns empty-list in
323a list context, undef in a scalar context.
324
325A notable and rather unavoidable problem with this method:
326"x-mingo-tom" has an "x" because the whole tag isn't an
327IANA-registered tag -- but super_languages('x-mingo-tom') is
328('x-mingo') -- which isn't really right, since 'i-mingo' is
329registered. But this module has no way of knowing that. (But note
330that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
331
332More importantly, you assume I<at your peril> that superordinates of
333$lang1 are mutually intelligible with $lang1. Consider this
334carefully.
335
336=cut
337
338sub super_languages {
339 my $lang1 = $_[0];
340 return() unless defined($lang1) && &is_language_tag($lang1);
21aeefd5
JH
341
342 # a hack for those annoying new (2001) tags:
343 $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
344 $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
345 $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
346 # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
347
4b053158
JH
348 my @l1_subtags = split('-', $lang1);
349
350 ## Changes in the language tagging standards may have to be reflected here.
351
352 # NB: (i-sil-...)?
353
354 my @supers = ();
355 foreach my $bit (@l1_subtags) {
356 push @supers,
357 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
358 }
359 pop @supers if @supers;
360 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
361 return reverse @supers;
362}
363
364###########################################################################
365
366=item * the function locale2language_tag($locale_identifier)
367
368This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
369and maps it to a language tag. If it's not mappable (as with,
370notably, "C" and "POSIX"), this returns empty-list in a list context,
371or undef in a scalar context.
372
373 locale2language_tag("en") is "en"
374
375 locale2language_tag("en_US") is "en-US"
376
377 locale2language_tag("en_US.ISO8859-1") is "en-US"
378
379 locale2language_tag("C") is undef or ()
380
381 locale2language_tag("POSIX") is undef or ()
382
383 locale2language_tag("POSIX") is undef or ()
384
385I'm not totally sure that locale names map satisfactorily to language
386tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
387
388The output is untainted. If you don't know what tainting is,
389don't worry about it.
390
391=cut
392
393sub locale2language_tag {
394 my $lang =
395 $_[0] =~ m/(.+)/ # to make for an untainted result
396 ? $1 : ''
397 ;
398
399 return $lang if &is_language_tag($lang); # like "en"
400
401 $lang =~ tr<_><->; # "en_US" -> en-US
402 $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US
403
404 return $lang if &is_language_tag($lang);
405
406 return;
407}
408
409###########################################################################
410
411=item * the function encode_language_tag($lang1)
412
413This function, if given a language tag, returns an encoding of it such
414that:
415
416* tags representing different languages never get the same encoding.
417
418* tags representing the same language always get the same encoding.
419
420* an encoding of a formally valid language tag always is a string
421value that is defined, has length, and is true if considered as a
422boolean.
423
424Note that the encoding itself is B<not> a formally valid language tag.
425Note also that you cannot, currently, go from an encoding back to a
426language tag that it's an encoding of.
427
428Note also that you B<must> consider the encoded value as atomic; i.e.,
429you should not consider it as anything but an opaque, unanalysable
430string value. (The internals of the encoding method may change in
431future versions, as the language tagging standard changes over time.)
432
433C<encode_language_tag> returns undef if given anything other than a
434formally valid language tag.
435
436The reason C<encode_language_tag> exists is because different language
437tags may represent the same language; this is normally treatable with
438C<same_language_tag>, but consider this situation:
439
440You have a data file that expresses greetings in different languages.
441Its format is "[language tag]=[how to say 'Hello']", like:
442
443 en-US=Hiho
444 fr=Bonjour
445 i-mingo=Hau'
446
447And suppose you write a program that reads that file and then runs as
448a daemon, answering client requests that specify a language tag and
449then expect the string that says how to greet in that language. So an
450interaction looks like:
451
452 greeting-client asks: fr
453 greeting-server answers: Bonjour
454
455So far so good. But suppose the way you're implementing this is:
456
457 my %greetings;
458 die unless open(IN, "<in.dat");
459 while(<IN>) {
460 chomp;
461 next unless /^([^=]+)=(.+)/s;
462 my($lang, $expr) = ($1, $2);
463 $greetings{$lang} = $expr;
464 }
465 close(IN);
466
467at which point %greetings has the contents:
468
469 "en-US" => "Hiho"
470 "fr" => "Bonjour"
471 "i-mingo" => "Hau'"
472
473And suppose then that you answer client requests for language $wanted
474by just looking up $greetings{$wanted}.
475
476If the client asks for "fr", that will look up successfully in
477%greetings, to the value "Bonjour". And if the client asks for
478"i-mingo", that will look up successfully in %greetings, to the value
479"Hau'".
480
481But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
482lookup in %greetings fails. That's the Wrong Thing.
483
484You could instead do lookups on $wanted with:
485
486 use I18N::LangTags qw(same_language_tag);
487 my $repsonse = '';
488 foreach my $l2 (keys %greetings) {
489 if(same_language_tag($wanted, $l2)) {
490 $response = $greetings{$l2};
491 last;
492 }
493 }
494
495But that's rather inefficient. A better way to do it is to start your
496program with:
497
498 use I18N::LangTags qw(encode_language_tag);
499 my %greetings;
500 die unless open(IN, "<in.dat");
501 while(<IN>) {
502 chomp;
503 next unless /^([^=]+)=(.+)/s;
504 my($lang, $expr) = ($1, $2);
505 $greetings{
506 encode_language_tag($lang)
507 } = $expr;
508 }
509 close(IN);
510
511and then just answer client requests for language $wanted by just
512looking up
513
514 $greetings{encode_language_tag($wanted)}
515
516And that does the Right Thing.
517
518=cut
519
520sub encode_language_tag {
521 # Only similarity_language_tag() is allowed to analyse encodings!
522
523 ## Changes in the language tagging standards may have to be reflected here.
524
21aeefd5 525 my($tag) = $_[0] || return undef;
4b053158 526 return undef unless &is_language_tag($tag);
21aeefd5
JH
527
528 # For the moment, these legacy variances are few enough that
529 # we can just handle them here with regexps.
530 $tag =~ s/^iw\b/he/i; # Hebrew
531 $tag =~ s/^in\b/id/i; # Indonesian
532 $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
533 $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
534 $tag =~ s/^ji\b/yi/i; # Yiddish
535 #
536 # These go FROM the simplex to complex form, to get
537 # similarity-comparison right. And that's okay, since
538 # similarity_language_tag is the only thing that
539 # analyzes our output.
540 $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
541 $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
542 $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
4b053158
JH
543
544 $tag =~ s/^[xiXI]-//s;
545 # Just lop off any leading "x/i-"
4b053158 546
21aeefd5 547 return "~" . uc($tag);
4b053158
JH
548}
549
550#--------------------------------------------------------------------------
551
552=item * the function alternate_language_tags($lang1)
553
554This function, if given a language tag, returns all language tags that
21aeefd5
JH
555are alternate forms of this language tag. (I.e., tags which refer to
556the same language.) This is meant to handle legacy tags caused by
557the minor changes in language tag standards over the years; and
558the x-/i- alternation is also dealt with.
559
560Note that this function does I<not> try to equate new (and never-used,
561and unusable)
562ISO639-2 three-letter tags to old (and still in use) ISO639-1
563two-letter equivalents -- like "ara" -> "ar" -- because
564"ara" has I<never> been in use as an Internet language tag,
565and RFC 3066 stipulates that it never should be, since a shorter
566tag ("ar") exists.
567
568Examples:
569
570 alternate_language_tags('no-bok') is ('nb')
571 alternate_language_tags('nb') is ('no-bok')
572 alternate_language_tags('he') is ('iw')
573 alternate_language_tags('iw') is ('he')
574 alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
575 alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
576 alternate_language_tags('en') is ()
577 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
578 alternate_language_tags('x-klikitat') is ('i-klikitat')
579 alternate_language_tags('i-klikitat') is ('x-klikitat')
580
581This function returns empty-list if given anything other than a formally
4b053158
JH
582valid language tag.
583
584=cut
585
586my %alt = qw( i x x i I X X I );
587sub alternate_language_tags {
4b053158
JH
588 my $tag = $_[0];
589 return() unless &is_language_tag($tag);
590
21aeefd5
JH
591 my @em; # push 'em real goood!
592
593 # For the moment, these legacy variances are few enough that
594 # we can just handle them here with regexps.
595
596 if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
597 } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
598
599 } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
600 } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
601
602 } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
603 } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
604
605 } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
606 } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
607
608 } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
609 } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
610
611 } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
612 } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
613
614 } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
615 } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
616
617 } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
618 } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
619 }
620
621 push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
622 return @em;
623}
624
625###########################################################################
626
627{
628 # Init %Panic...
629
630 my @panic = ( # MUST all be lowercase!
631 # Only large ("national") languages make it in this list.
632 # If you, as a user, are so bizarre that the /only/ language
633 # you claim to accept is Galician, then no, we won't do you
634 # the favor of providing Catalan as a panic-fallback for
635 # you. Because if I start trying to add "little languages" in
636 # here, I'll just go crazy.
637
4cf5bee0
JH
638 # Scandinavian lgs. All based on opinion and hearsay.
639 'sv' => [qw(nb no da nn)],
640 'da' => [qw(nb no sv nn)], # I guess
641 [qw(no nn nb)], [qw(no nn nb sv da)],
642 'is' => [qw(da sv no nb nn)],
643 'fo' => [qw(da is no nb nn sv)], # I guess
21aeefd5
JH
644
645 # I think this is about the extent of tolerable intelligibility
646 # among large modern Romance languages.
647 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
648 'ca' => [qw(es pt it fr)],
649 'es' => [qw(ca it fr pt)],
650 'it' => [qw(es fr ca pt)],
651 'fr' => [qw(es it ca pt)],
652
653 # Also assume that speakers of the main Indian languages prefer
654 # to read/hear Hindi over English
655 [qw(
656 as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
657 )] => 'hi',
658 # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
659 # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
660 # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
661 'hi' => [qw(bn pa as or)],
662 # I welcome finer data for the other Indian languages.
663 # E.g., what should Oriya's list be, besides just Hindi?
664
665 # And the panic languages for English is, of course, nil!
666
667 # My guesses at Slavic intelligibility:
668 ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
669 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
670 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
671
672 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
673
674 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
675
676 #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
677
678 );
679 my($k,$v);
680 while(@panic) {
681 ($k,$v) = splice(@panic,0,2);
682 foreach my $k (ref($k) ? @$k : $k) {
683 foreach my $v (ref($v) ? @$v : $v) {
684 push @{$Panic{$k} ||= []}, $v unless $k eq $v;
685 }
686 }
687 }
688}
689
690=item * the function @langs = panic_languages(@accept_languages)
691
692This function takes a list of 0 or more language
693tags that constitute a given user's Accept-Language list, and
694returns a list of tags for I<other> (non-super)
695languages that are probably acceptable to the user, to be
696used I<if all else fails>.
697
698For example, if a user accepts only 'ca' (Catalan) and
699'es' (Spanish), and the documents/interfaces you have
700available are just in German, Italian, and Chinese, then
701the user will most likely want the Italian one (and not
702the Chinese or German one!), instead of getting
703nothing. So C<panic_languages('ca', 'es')> returns
704a list containing 'it' (Italian).
705
706English ('en') is I<always> in the return list, but
707whether it's at the very end or not depends
708on the input languages. This function works by consulting
709an internal table that stipulates what common
710languages are "close" to each other.
711
712A useful construct you might consider using is:
713
714 @fallbacks = super_languages(@accept_languages);
715 push @fallbacks, panic_languages(
716 @accept_languages, @fallbacks,
717 );
718
719=cut
4b053158 720
21aeefd5
JH
721sub panic_languages {
722 # When in panic or in doubt, run in circles, scream, and shout!
723 my(@out, %seen);
724 foreach my $t (@_) {
725 next unless $t;
726 next if $seen{$t}++; # so we don't return it or hit it again
727 # push @out, super_languages($t); # nah, keep that separate
728 push @out, @{ $Panic{lc $t} || next };
4b053158 729 }
21aeefd5 730 return grep !$seen{$_}++, @out, 'en';
4b053158
JH
731}
732
733###########################################################################
21aeefd5
JH
7341;
735__END__
4b053158
JH
736
737=back
738
739=head1 ABOUT LOWERCASING
740
741I've considered making all the above functions that output language
742tags return all those tags strictly in lowercase. Having all your
743language tags in lowercase does make some things easier. But you
744might as well just lowercase as you like, or call
745C<encode_language_tag($lang1)> where appropriate.
746
747=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
748
749In some future version of I18N::LangTags, I plan to include support
750for RFC2482-style language tags -- which are basically just normal
751language tags with their ASCII characters shifted into Plane 14.
752
753=head1 SEE ALSO
754
e7525a17
JH
755* L<I18N::LangTags::List|I18N::LangTags::List>
756
4b053158
JH
757* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
758Identification of Languages". (Obsoletes RFC 1766)
759
760* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
761Character Sets and Languages".
762
763* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
764Value and Encoded Word Extensions: Character Sets, Languages, and
765Continuations".
766
767* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
768"Language Tagging in Unicode Plain Text".
769
770* Locale::Codes, in
771C<http://www.perl.com/CPAN/modules/by-module/Locale/>
772
773* ISO 639, "Code for the representation of names of languages",
774C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
775
776* ISO 639-2, "Codes for the representation of names of languages",
777including three-letter codes,
778C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
779
780* The IANA list of registered languages (hopefully up-to-date),
781C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
782
783=head1 COPYRIGHT
784
785Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
786
787This library is free software; you can redistribute it and/or
788modify it under the same terms as Perl itself.
789
790The programs and documentation in this dist are distributed in
791the hope that they will be useful, but without any warranty; without
792even the implied warranty of merchantability or fitness for a
793particular purpose.
794
795=head1 AUTHOR
796
797Sean M. Burke C<sburke@cpan.org>
798
799=cut
800