lib/I18N/LangTags.pm

   1
   2 # Time-stamp: "2002-02-02 20:43:03 MST"
   3 # Sean M. Burke <sburke@cpan.org>
   4
   5 require 5.000;
   6 package I18N::LangTags;
   7 use strict;
   8 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
   9 require Exporter;
  10 @ISA = qw(Exporter);
  11 @EXPORT = qw();
  12 @EXPORT_OK = qw(is_language_tag same_language_tag
  13                 extract_language_tags super_languages
  14                 similarity_language_tag is_dialect_of
  15                 locale2language_tag alternate_language_tags
  16                 encode_language_tag panic_languages
  17                );
  18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  19
  20 $VERSION = "0.27";
  21
  22 =head1 NAME
  23
  24 I18N::LangTags - functions for dealing with RFC3066-style language tags
  25
  26 =head1 SYNOPSIS
  27
  28     use I18N::LangTags qw(is_language_tag same_language_tag
  29                           extract_language_tags super_languages
  30                           similarity_language_tag is_dialect_of
  31                           locale2language_tag alternate_language_tags
  32                           encode_language_tag panic_languages
  33                          );
  34
  35 ...or whatever of those functions you want to import.  Those are
  36 all the exportable functions -- you're free to import only some,
  37 or none at all.  By default, none are imported.  If you say:
  38
  39     use I18N::LangTags qw(:ALL)
  40
  41 ...then all are exported.  (This saves you from having to use
  42 something less obvious like C<use I18N::LangTags qw(/./)>.)
  43
  44 If you don't import any of these functions, assume a C<&I18N::LangTags::>
  45 in front of all the function names in the following examples.
  46
  47 =head1 DESCRIPTION
  48
  49 Language tags are a formalism, described in RFC 3066 (obsoleting
  50 1766), for declaring what language form (language and possibly
  51 dialect) a given chunk of information is in.
  52
  53 This library provides functions for common tasks involving language
  54 tags as they are needed in a variety of protocols and applications.
  55
  56 Please see the "See Also" references for a thorough explanation
  57 of how to correctly use language tags.
  58
  59 =over
  60
  61 =cut
  62
  63 ###########################################################################
  64
  65 =item * the function is_language_tag($lang1)
  66
  67 Returns true iff $lang1 is a formally valid language tag.
  68
  69    is_language_tag("fr")            is TRUE
  70    is_language_tag("x-jicarilla")   is FALSE
  71        (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  72
  73    is_language_tag("sgn-US")    is TRUE
  74        (That's American Sign Language)
  75
  76    is_language_tag("i-Klikitat")    is TRUE
  77        (True without regard to the fact noone has actually
  78         registered Klikitat -- it's a formally valid tag)
  79
  80    is_language_tag("fr-patois")     is TRUE
  81        (Formally valid -- altho descriptively weak!)
  82
  83    is_language_tag("Spanish")       is FALSE
  84    is_language_tag("french-patois") is FALSE
  85        (No good -- first subtag has to match
  86         /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  87
  88    is_language_tag("x-borg-prot2532") is TRUE
  89        (Yes, subtags can contain digits, as of RFC3066)
  90
  91 =cut
  92
  93 sub is_language_tag {
  94
  95   ## Changes in the language tagging standards may have to be reflected here.
  96
  97   my($tag) = lc($_[0]);
  98
  99   return 0 if $tag eq "i" or $tag eq "x";
 100   # Bad degenerate cases that the following
 101   #  regexp would erroneously let pass
 102
 103   return $tag =~
 104     /^(?:  # First subtag
 105          [xi] | [a-z]{2,3}
 106       )
 107       (?:  # Subtags thereafter
 108          -           # separator
 109          [a-z0-9]{1,8}  # subtag
 110       )*
 111     $/xs ? 1 : 0;
 112 }
 113
 114 ###########################################################################
 115
 116 =item * the function extract_language_tags($whatever)
 117
 118 Returns a list of whatever looks like formally valid language tags
 119 in $whatever.  Not very smart, so don't get too creative with
 120 what you want to feed it.
 121
 122   extract_language_tags("fr, fr-ca, i-mingo")
 123     returns:   ('fr', 'fr-ca', 'i-mingo')
 124
 125   extract_language_tags("It's like this: I'm in fr -- French!")
 126     returns:   ('It', 'in', 'fr')
 127   (So don't just feed it any old thing.)
 128
 129 The output is untainted.  If you don't know what tainting is,
 130 don't worry about it.
 131
 132 =cut
 133
 134 sub extract_language_tags {
 135
 136   ## Changes in the language tagging standards may have to be reflected here.
 137
 138   my($text) =
 139     $_[0] =~ m/(.+)/  # to make for an untainted result
 140     ? $1 : ''
 141   ;
 142
 143   return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 144     $text =~
 145     m/
 146       \b
 147       (?:  # First subtag
 148          [iIxX] | [a-zA-Z]{2,3}
 149       )
 150       (?:  # Subtags thereafter
 151          -           # separator
 152          [a-zA-Z0-9]{1,8}  # subtag
 153       )*
 154       \b
 155     /xsg
 156   );
 157 }
 158
 159 ###########################################################################
 160
 161 =item * the function same_language_tag($lang1, $lang2)
 162
 163 Returns true iff $lang1 and $lang2 are acceptable variant tags
 164 representing the same language-form.
 165
 166    same_language_tag('x-kadara', 'i-kadara')  is TRUE
 167       (The x/i- alternation doesn't matter)
 168    same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 169       (...and neither does case)
 170    same_language_tag('en',       'en-US')     is FALSE
 171       (all-English is not the SAME as US English)
 172    same_language_tag('x-kadara', 'x-kadar')   is FALSE
 173       (these are totally unrelated tags)
 174    same_language_tag('no-bok',    'nb')       is TRUE
 175       (no-bok is a legacy tag for nb (Norwegian Bokmal))
 176
 177 C<same_language_tag> works by just seeing whether
 178 C<encode_language_tag($lang1)> is the same as
 179 C<encode_language_tag($lang2)>.
 180
 181 (Yes, I know this function is named a bit oddly.  Call it historic
 182 reasons.)
 183
 184 =cut
 185
 186 sub same_language_tag {
 187   my $el1 = &encode_language_tag($_[0]);
 188   return 0 unless defined $el1;
 189    # this avoids the problem of
 190    # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 191    # being true if $lang1 and $lang2 are both undef
 192
 193   return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 194 }
 195
 196 ###########################################################################
 197
 198 =item * the function similarity_language_tag($lang1, $lang2)
 199
 200 Returns an integer representing the degree of similarity between
 201 tags $lang1 and $lang2 (the order of which does not matter), where
 202 similarity is the number of common elements on the left,
 203 without regard to case and to x/i- alternation.
 204
 205    similarity_language_tag('fr', 'fr-ca')           is 1
 206       (one element in common)
 207    similarity_language_tag('fr-ca', 'fr-FR')        is 1
 208       (one element in common)
 209
 210    similarity_language_tag('fr-CA-joual',
 211                            'fr-CA-PEI')             is 2
 212    similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 213       (two elements in common)
 214
 215    similarity_language_tag('x-kadara', 'i-kadara')  is 1
 216       (x/i- doesn't matter)
 217
 218    similarity_language_tag('en',       'x-kadar')   is 0
 219    similarity_language_tag('x-kadara', 'x-kadar')   is 0
 220       (unrelated tags -- no similarity)
 221
 222    similarity_language_tag('i-cree-syllabic',
 223                            'i-cherokee-syllabic')   is 0
 224       (no B<leftmost> elements in common!)
 225
 226 =cut
 227
 228 sub similarity_language_tag {
 229   my $lang1 = &encode_language_tag($_[0]);
 230   my $lang2 = &encode_language_tag($_[1]);
 231    # And encode_language_tag takes care of the whole
 232    #  no-nyn==nn, i-hakka==zh-hakka, etc, things
 233
 234   # NB: (i-sil-...)?  (i-sgn-...)?
 235
 236   return undef if !defined($lang1) and !defined($lang2);
 237   return 0 if !defined($lang1) or !defined($lang2);
 238
 239   my @l1_subtags = split('-', $lang1);
 240   my @l2_subtags = split('-', $lang2);
 241   my $similarity = 0;
 242
 243   while(@l1_subtags and @l2_subtags) {
 244     if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 245       ++$similarity;
 246     } else {
 247       last;
 248     }
 249   }
 250   return $similarity;
 251 }
 252
 253 ###########################################################################
 254
 255 =item * the function is_dialect_of($lang1, $lang2)
 256
 257 Returns true iff language tag $lang1 represents a subform of
 258 language tag $lang2.
 259
 260 B<Get the order right!  It doesn't work the other way around!>
 261
 262    is_dialect_of('en-US', 'en')            is TRUE
 263      (American English IS a dialect of all-English)
 264
 265    is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 266    is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 267      (Joual is a dialect of (a dialect of) French)
 268
 269    is_dialect_of('en', 'en-US')            is FALSE
 270      (all-English is a NOT dialect of American English)
 271
 272    is_dialect_of('fr', 'en-CA')            is FALSE
 273
 274    is_dialect_of('en',    'en'   )         is TRUE
 275    is_dialect_of('en-US', 'en-US')         is TRUE
 276      (B<Note:> these are degenerate cases)
 277
 278    is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 279      (the x/i thing doesn't matter, nor does case)
 280
 281    is_dialect_of('nn', 'no')               is TRUE
 282      (because 'nn' (New Norse) is aliased to 'no-nyn',
 283       as a special legacy case, and 'no-nyn' is a
 284       subform of 'no' (Norwegian))
 285
 286 =cut
 287
 288 sub is_dialect_of {
 289
 290   my $lang1 = &encode_language_tag($_[0]);
 291   my $lang2 = &encode_language_tag($_[1]);
 292
 293   return undef if !defined($lang1) and !defined($lang2);
 294   return 0 if !defined($lang1) or !defined($lang2);
 295
 296   return 1 if $lang1 eq $lang2;
 297   return 0 if length($lang1) < length($lang2);
 298
 299   $lang1 .= '-';
 300   $lang2 .= '-';
 301   return
 302     (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 303 }
 304
 305 ###########################################################################
 306
 307 =item * the function super_languages($lang1)
 308
 309 Returns a list of language tags that are superordinate tags to $lang1
 310 -- it gets this by removing subtags from the end of $lang1 until
 311 nothing (or just "i" or "x") is left.
 312
 313    super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 314
 315    super_languages("en-AU")  is  ("en")
 316
 317    super_languages("en")  is  empty-list, ()
 318
 319    super_languages("i-cherokee")  is  empty-list, ()
 320     ...not ("i"), which would be illegal as well as pointless.
 321
 322 If $lang1 is not a valid language tag, returns empty-list in
 323 a list context, undef in a scalar context.
 324
 325 A notable and rather unavoidable problem with this method:
 326 "x-mingo-tom" has an "x" because the whole tag isn't an
 327 IANA-registered tag -- but super_languages('x-mingo-tom') is
 328 ('x-mingo') -- which isn't really right, since 'i-mingo' is
 329 registered.  But this module has no way of knowing that.  (But note
 330 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 331
 332 More importantly, you assume I<at your peril> that superordinates of
 333 $lang1 are mutually intelligible with $lang1.  Consider this
 334 carefully.
 335
 336 =cut
 337
 338 sub super_languages {
 339   my $lang1 = $_[0];
 340   return() unless defined($lang1) && &is_language_tag($lang1);
 341
 342   # a hack for those annoying new (2001) tags:
 343   $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
 344   $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
 345   $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
 346    # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
 347
 348   my @l1_subtags = split('-', $lang1);
 349
 350   ## Changes in the language tagging standards may have to be reflected here.
 351
 352   # NB: (i-sil-...)?
 353
 354   my @supers = ();
 355   foreach my $bit (@l1_subtags) {
 356     push @supers,
 357       scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 358   }
 359   pop @supers if @supers;
 360   shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 361   return reverse @supers;
 362 }
 363
 364 ###########################################################################
 365
 366 =item * the function locale2language_tag($locale_identifier)
 367
 368 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 369 and maps it to a language tag.  If it's not mappable (as with,
 370 notably, "C" and "POSIX"), this returns empty-list in a list context,
 371 or undef in a scalar context.
 372
 373    locale2language_tag("en") is "en"
 374
 375    locale2language_tag("en_US") is "en-US"
 376
 377    locale2language_tag("en_US.ISO8859-1") is "en-US"
 378
 379    locale2language_tag("C") is undef or ()
 380
 381    locale2language_tag("POSIX") is undef or ()
 382
 383    locale2language_tag("POSIX") is undef or ()
 384
 385 I'm not totally sure that locale names map satisfactorily to language
 386 tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 387
 388 The output is untainted.  If you don't know what tainting is,
 389 don't worry about it.
 390
 391 =cut
 392
 393 sub locale2language_tag {
 394   my $lang =
 395     $_[0] =~ m/(.+)/  # to make for an untainted result
 396     ? $1 : ''
 397   ;
 398
 399   return $lang if &is_language_tag($lang); # like "en"
 400
 401   $lang =~ tr<_><->;  # "en_US" -> en-US
 402   $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
 403
 404   return $lang if &is_language_tag($lang);
 405
 406   return;
 407 }
 408
 409 ###########################################################################
 410
 411 =item * the function encode_language_tag($lang1)
 412
 413 This function, if given a language tag, returns an encoding of it such
 414 that:
 415
 416 * tags representing different languages never get the same encoding.
 417
 418 * tags representing the same language always get the same encoding.
 419
 420 * an encoding of a formally valid language tag always is a string
 421 value that is defined, has length, and is true if considered as a
 422 boolean.
 423
 424 Note that the encoding itself is B<not> a formally valid language tag.
 425 Note also that you cannot, currently, go from an encoding back to a
 426 language tag that it's an encoding of.
 427
 428 Note also that you B<must> consider the encoded value as atomic; i.e.,
 429 you should not consider it as anything but an opaque, unanalysable
 430 string value.  (The internals of the encoding method may change in
 431 future versions, as the language tagging standard changes over time.)
 432
 433 C<encode_language_tag> returns undef if given anything other than a
 434 formally valid language tag.
 435
 436 The reason C<encode_language_tag> exists is because different language
 437 tags may represent the same language; this is normally treatable with
 438 C<same_language_tag>, but consider this situation:
 439
 440 You have a data file that expresses greetings in different languages.
 441 Its format is "[language tag]=[how to say 'Hello']", like:
 442
 443           en-US=Hiho
 444           fr=Bonjour
 445           i-mingo=Hau'
 446
 447 And suppose you write a program that reads that file and then runs as
 448 a daemon, answering client requests that specify a language tag and
 449 then expect the string that says how to greet in that language.  So an
 450 interaction looks like:
 451
 452           greeting-client asks:    fr
 453           greeting-server answers: Bonjour
 454
 455 So far so good.  But suppose the way you're implementing this is:
 456
 457           my %greetings;
 458           die unless open(IN, "<in.dat");
 459           while(<IN>) {
 460             chomp;
 461             next unless /^([^=]+)=(.+)/s;
 462             my($lang, $expr) = ($1, $2);
 463             $greetings{$lang} = $expr;
 464           }
 465           close(IN);
 466
 467 at which point %greetings has the contents:
 468
 469           "en-US"   => "Hiho"
 470           "fr"      => "Bonjour"
 471           "i-mingo" => "Hau'"
 472
 473 And suppose then that you answer client requests for language $wanted
 474 by just looking up $greetings{$wanted}.
 475
 476 If the client asks for "fr", that will look up successfully in
 477 %greetings, to the value "Bonjour".  And if the client asks for
 478 "i-mingo", that will look up successfully in %greetings, to the value
 479 "Hau'".
 480
 481 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 482 lookup in %greetings fails.  That's the Wrong Thing.
 483
 484 You could instead do lookups on $wanted with:
 485
 486           use I18N::LangTags qw(same_language_tag);
 487           my $repsonse = '';
 488           foreach my $l2 (keys %greetings) {
 489             if(same_language_tag($wanted, $l2)) {
 490               $response = $greetings{$l2};
 491               last;
 492             }
 493           }
 494
 495 But that's rather inefficient.  A better way to do it is to start your
 496 program with:
 497
 498           use I18N::LangTags qw(encode_language_tag);
 499           my %greetings;
 500           die unless open(IN, "<in.dat");
 501           while(<IN>) {
 502             chomp;
 503             next unless /^([^=]+)=(.+)/s;
 504             my($lang, $expr) = ($1, $2);
 505             $greetings{
 506                         encode_language_tag($lang)
 507                       } = $expr;
 508           }
 509           close(IN);
 510
 511 and then just answer client requests for language $wanted by just
 512 looking up
 513
 514           $greetings{encode_language_tag($wanted)}
 515
 516 And that does the Right Thing.
 517
 518 =cut
 519
 520 sub encode_language_tag {
 521   # Only similarity_language_tag() is allowed to analyse encodings!
 522
 523   ## Changes in the language tagging standards may have to be reflected here.
 524
 525   my($tag) = $_[0] || return undef;
 526   return undef unless &is_language_tag($tag);
 527
 528   # For the moment, these legacy variances are few enough that
 529   #  we can just handle them here with regexps.
 530   $tag =~ s/^iw\b/he/i; # Hebrew
 531   $tag =~ s/^in\b/id/i; # Indonesian
 532   $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
 533   $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
 534   $tag =~ s/^ji\b/yi/i;  # Yiddish
 535   #
 536   # These go FROM the simplex to complex form, to get
 537   #  similarity-comparison right.  And that's okay, since
 538   #  similarity_language_tag is the only thing that
 539   #  analyzes our output.
 540   $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
 541   $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
 542   $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
 543
 544   $tag =~ s/^[xiXI]-//s;
 545    # Just lop off any leading "x/i-"
 546
 547   return "~" . uc($tag);
 548 }
 549
 550 #--------------------------------------------------------------------------
 551
 552 =item * the function alternate_language_tags($lang1)
 553
 554 This function, if given a language tag, returns all language tags that
 555 are alternate forms of this language tag.  (I.e., tags which refer to
 556 the same language.)  This is meant to handle legacy tags caused by
 557 the minor changes in language tag standards over the years; and
 558 the x-/i- alternation is also dealt with.
 559
 560 Note that this function does I<not> try to equate new (and never-used,
 561 and unusable)
 562 ISO639-2 three-letter tags to old (and still in use) ISO639-1
 563 two-letter equivalents -- like "ara" -> "ar" -- because
 564 "ara" has I<never> been in use as an Internet language tag,
 565 and RFC 3066 stipulates that it never should be, since a shorter
 566 tag ("ar") exists.
 567
 568 Examples:
 569
 570           alternate_language_tags('no-bok')       is ('nb')
 571           alternate_language_tags('nb')           is ('no-bok')
 572           alternate_language_tags('he')           is ('iw')
 573           alternate_language_tags('iw')           is ('he')
 574           alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
 575           alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
 576           alternate_language_tags('en')           is ()
 577           alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
 578           alternate_language_tags('x-klikitat')   is ('i-klikitat')
 579           alternate_language_tags('i-klikitat')   is ('x-klikitat')
 580
 581 This function returns empty-list if given anything other than a formally
 582 valid language tag.
 583
 584 =cut
 585
 586 my %alt = qw( i x   x i   I X   X I );
 587 sub alternate_language_tags {
 588   my $tag = $_[0];
 589   return() unless &is_language_tag($tag);
 590
 591   my @em; # push 'em real goood!
 592
 593   # For the moment, these legacy variances are few enough that
 594   #  we can just handle them here with regexps.
 595
 596   if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
 597   } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
 598
 599   } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
 600   } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
 601
 602   } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
 603   } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
 604
 605   } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
 606   } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
 607
 608   } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
 609   } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
 610
 611   } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
 612   } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
 613
 614   } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
 615   } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
 616
 617   } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
 618   } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
 619   }
 620
 621   push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
 622   return @em;
 623 }
 624
 625 ###########################################################################
 626
 627 {
 628   # Init %Panic...
 629
 630   my @panic = (  # MUST all be lowercase!
 631    # Only large ("national") languages make it in this list.
 632    #  If you, as a user, are so bizarre that the /only/ language
 633    #  you claim to accept is Galician, then no, we won't do you
 634    #  the favor of providing Catalan as a panic-fallback for
 635    #  you.  Because if I start trying to add "little languages" in
 636    #  here, I'll just go crazy.
 637
 638    # Scandinavian lgs.  All based on opinion and hearsay.
 639    'sv' => [qw(nb no da nn)],
 640    'da' => [qw(nb no sv nn)], # I guess
 641    [qw(no nn nb)], [qw(no nn nb sv da)],
 642    'is' => [qw(da sv no nb nn)],
 643    'fo' => [qw(da is no nb nn sv)], # I guess
 644
 645    # I think this is about the extent of tolerable intelligibility
 646    #  among large modern Romance languages.
 647    'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
 648    'ca' => [qw(es pt it fr)],
 649    'es' => [qw(ca it fr pt)],
 650    'it' => [qw(es fr ca pt)],
 651    'fr' => [qw(es it ca pt)],
 652
 653    # Also assume that speakers of the main Indian languages prefer
 654    #  to read/hear Hindi over English
 655    [qw(
 656      as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
 657    )] => 'hi',
 658     # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
 659     # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
 660     # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
 661    'hi' => [qw(bn pa as or)],
 662    # I welcome finer data for the other Indian languages.
 663    #  E.g., what should Oriya's list be, besides just Hindi?
 664
 665    # And the panic languages for English is, of course, nil!
 666
 667    # My guesses at Slavic intelligibility:
 668    ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
 669    'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
 670    'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
 671
 672    'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
 673
 674    'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
 675
 676    #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
 677
 678   );
 679   my($k,$v);
 680   while(@panic) {
 681     ($k,$v) = splice(@panic,0,2);
 682     foreach my $k (ref($k) ? @$k : $k) {
 683       foreach my $v (ref($v) ? @$v : $v) {
 684         push @{$Panic{$k} ||= []}, $v unless $k eq $v;
 685       }
 686     }
 687   }
 688 }
 689
 690 =item * the function @langs = panic_languages(@accept_languages)
 691
 692 This function takes a list of 0 or more language
 693 tags that constitute a given user's Accept-Language list, and
 694 returns a list of tags for I<other> (non-super)
 695 languages that are probably acceptable to the user, to be
 696 used I<if all else fails>.
 697
 698 For example, if a user accepts only 'ca' (Catalan) and
 699 'es' (Spanish), and the documents/interfaces you have
 700 available are just in German, Italian, and Chinese, then
 701 the user will most likely want the Italian one (and not
 702 the Chinese or German one!), instead of getting
 703 nothing.  So C<panic_languages('ca', 'es')> returns
 704 a list containing 'it' (Italian).
 705
 706 English ('en') is I<always> in the return list, but
 707 whether it's at the very end or not depends
 708 on the input languages.  This function works by consulting
 709 an internal table that stipulates what common
 710 languages are "close" to each other.
 711
 712 A useful construct you might consider using is:
 713
 714   @fallbacks = super_languages(@accept_languages);
 715   push @fallbacks, panic_languages(
 716     @accept_languages, @fallbacks,
 717   );
 718
 719 =cut
 720
 721 sub panic_languages {
 722   # When in panic or in doubt, run in circles, scream, and shout!
 723   my(@out, %seen);
 724   foreach my $t (@_) {
 725     next unless $t;
 726     next if $seen{$t}++; # so we don't return it or hit it again
 727     # push @out, super_languages($t); # nah, keep that separate
 728     push @out, @{ $Panic{lc $t} || next };
 729   }
 730   return grep !$seen{$_}++,  @out, 'en';
 731 }
 732
 733 ###########################################################################
 734 1;
 735 __END__
 736
 737 =back
 738
 739 =head1 ABOUT LOWERCASING
 740
 741 I've considered making all the above functions that output language
 742 tags return all those tags strictly in lowercase.  Having all your
 743 language tags in lowercase does make some things easier.  But you
 744 might as well just lowercase as you like, or call
 745 C<encode_language_tag($lang1)> where appropriate.
 746
 747 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 748
 749 In some future version of I18N::LangTags, I plan to include support
 750 for RFC2482-style language tags -- which are basically just normal
 751 language tags with their ASCII characters shifted into Plane 14.
 752
 753 =head1 SEE ALSO
 754
 755 * L<I18N::LangTags::List|I18N::LangTags::List>
 756
 757 * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
 758 Identification of Languages".  (Obsoletes RFC 1766)
 759
 760 * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
 761 Character Sets and Languages".
 762
 763 * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
 764 Value and Encoded Word Extensions: Character Sets, Languages, and
 765 Continuations".
 766
 767 * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
 768 "Language Tagging in Unicode Plain Text".
 769
 770 * Locale::Codes, in
 771 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 772
 773 * ISO 639, "Code for the representation of names of languages",
 774 C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
 775
 776 * ISO 639-2, "Codes for the representation of names of languages",
 777 including three-letter codes,
 778 C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
 779
 780 * The IANA list of registered languages (hopefully up-to-date),
 781 C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
 782
 783 =head1 COPYRIGHT
 784
 785 Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
 786
 787 This library is free software; you can redistribute it and/or
 788 modify it under the same terms as Perl itself.
 789
 790 The programs and documentation in this dist are distributed in
 791 the hope that they will be useful, but without any warranty; without
 792 even the implied warranty of merchantability or fitness for a
 793 particular purpose.
 794
 795 =head1 AUTHOR
 796
 797 Sean M. Burke C<sburke@cpan.org>
 798
 799 =cut
 800