lib/I18N/LangTags.pm

   1
   2 # Time-stamp: "2004-07-01 14:34:40 ADT"
   3 # Sean M. Burke <sburke@cpan.org>
   4
   5 require 5.000;
   6 package I18N::LangTags;
   7 use strict;
   8 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
   9 require Exporter;
  10 @ISA = qw(Exporter);
  11 @EXPORT = qw();
  12 @EXPORT_OK = qw(is_language_tag same_language_tag
  13                 extract_language_tags super_languages
  14                 similarity_language_tag is_dialect_of
  15                 locale2language_tag alternate_language_tags
  16                 encode_language_tag panic_languages
  17                 implicate_supers
  18                 implicate_supers_strictly
  19                );
  20 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  21
  22 $VERSION = "0.33";
  23
  24 sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
  25
  26
  27 =head1 NAME
  28
  29 I18N::LangTags - functions for dealing with RFC3066-style language tags
  30
  31 =head1 SYNOPSIS
  32
  33   use I18N::LangTags();
  34
  35 ...or specify whichever of those functions you want to import, like so:
  36
  37   use I18N::LangTags qw(implicate_supers similarity_language_tag);
  38
  39 All the exportable functions are listed below -- you're free to import
  40 only some, or none at all.  By default, none are imported.  If you
  41 say:
  42
  43     use I18N::LangTags qw(:ALL)
  44
  45 ...then all are exported.  (This saves you from having to use
  46 something less obvious like C<use I18N::LangTags qw(/./)>.)
  47
  48 If you don't import any of these functions, assume a C<&I18N::LangTags::>
  49 in front of all the function names in the following examples.
  50
  51 =head1 DESCRIPTION
  52
  53 Language tags are a formalism, described in RFC 3066 (obsoleting
  54 1766), for declaring what language form (language and possibly
  55 dialect) a given chunk of information is in.
  56
  57 This library provides functions for common tasks involving language
  58 tags as they are needed in a variety of protocols and applications.
  59
  60 Please see the "See Also" references for a thorough explanation
  61 of how to correctly use language tags.
  62
  63 =over
  64
  65 =cut
  66
  67 ###########################################################################
  68
  69 =item * the function is_language_tag($lang1)
  70
  71 Returns true iff $lang1 is a formally valid language tag.
  72
  73    is_language_tag("fr")            is TRUE
  74    is_language_tag("x-jicarilla")   is FALSE
  75        (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  76
  77    is_language_tag("sgn-US")    is TRUE
  78        (That's American Sign Language)
  79
  80    is_language_tag("i-Klikitat")    is TRUE
  81        (True without regard to the fact noone has actually
  82         registered Klikitat -- it's a formally valid tag)
  83
  84    is_language_tag("fr-patois")     is TRUE
  85        (Formally valid -- altho descriptively weak!)
  86
  87    is_language_tag("Spanish")       is FALSE
  88    is_language_tag("french-patois") is FALSE
  89        (No good -- first subtag has to match
  90         /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  91
  92    is_language_tag("x-borg-prot2532") is TRUE
  93        (Yes, subtags can contain digits, as of RFC3066)
  94
  95 =cut
  96
  97 sub is_language_tag {
  98
  99   ## Changes in the language tagging standards may have to be reflected here.
 100
 101   my($tag) = lc($_[0]);
 102
 103   return 0 if $tag eq "i" or $tag eq "x";
 104   # Bad degenerate cases that the following
 105   #  regexp would erroneously let pass
 106
 107   return $tag =~
 108     /^(?:  # First subtag
 109          [xi] | [a-z]{2,3}
 110       )
 111       (?:  # Subtags thereafter
 112          -           # separator
 113          [a-z0-9]{1,8}  # subtag
 114       )*
 115     $/xs ? 1 : 0;
 116 }
 117
 118 ###########################################################################
 119
 120 =item * the function extract_language_tags($whatever)
 121
 122 Returns a list of whatever looks like formally valid language tags
 123 in $whatever.  Not very smart, so don't get too creative with
 124 what you want to feed it.
 125
 126   extract_language_tags("fr, fr-ca, i-mingo")
 127     returns:   ('fr', 'fr-ca', 'i-mingo')
 128
 129   extract_language_tags("It's like this: I'm in fr -- French!")
 130     returns:   ('It', 'in', 'fr')
 131   (So don't just feed it any old thing.)
 132
 133 The output is untainted.  If you don't know what tainting is,
 134 don't worry about it.
 135
 136 =cut
 137
 138 sub extract_language_tags {
 139
 140   ## Changes in the language tagging standards may have to be reflected here.
 141
 142   my($text) =
 143     $_[0] =~ m/(.+)/  # to make for an untainted result
 144     ? $1 : ''
 145   ;
 146
 147   return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 148     $text =~
 149     m/
 150       \b
 151       (?:  # First subtag
 152          [iIxX] | [a-zA-Z]{2,3}
 153       )
 154       (?:  # Subtags thereafter
 155          -           # separator
 156          [a-zA-Z0-9]{1,8}  # subtag
 157       )*
 158       \b
 159     /xsg
 160   );
 161 }
 162
 163 ###########################################################################
 164
 165 =item * the function same_language_tag($lang1, $lang2)
 166
 167 Returns true iff $lang1 and $lang2 are acceptable variant tags
 168 representing the same language-form.
 169
 170    same_language_tag('x-kadara', 'i-kadara')  is TRUE
 171       (The x/i- alternation doesn't matter)
 172    same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 173       (...and neither does case)
 174    same_language_tag('en',       'en-US')     is FALSE
 175       (all-English is not the SAME as US English)
 176    same_language_tag('x-kadara', 'x-kadar')   is FALSE
 177       (these are totally unrelated tags)
 178    same_language_tag('no-bok',    'nb')       is TRUE
 179       (no-bok is a legacy tag for nb (Norwegian Bokmal))
 180
 181 C<same_language_tag> works by just seeing whether
 182 C<encode_language_tag($lang1)> is the same as
 183 C<encode_language_tag($lang2)>.
 184
 185 (Yes, I know this function is named a bit oddly.  Call it historic
 186 reasons.)
 187
 188 =cut
 189
 190 sub same_language_tag {
 191   my $el1 = &encode_language_tag($_[0]);
 192   return 0 unless defined $el1;
 193    # this avoids the problem of
 194    # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 195    # being true if $lang1 and $lang2 are both undef
 196
 197   return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 198 }
 199
 200 ###########################################################################
 201
 202 =item * the function similarity_language_tag($lang1, $lang2)
 203
 204 Returns an integer representing the degree of similarity between
 205 tags $lang1 and $lang2 (the order of which does not matter), where
 206 similarity is the number of common elements on the left,
 207 without regard to case and to x/i- alternation.
 208
 209    similarity_language_tag('fr', 'fr-ca')           is 1
 210       (one element in common)
 211    similarity_language_tag('fr-ca', 'fr-FR')        is 1
 212       (one element in common)
 213
 214    similarity_language_tag('fr-CA-joual',
 215                            'fr-CA-PEI')             is 2
 216    similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 217       (two elements in common)
 218
 219    similarity_language_tag('x-kadara', 'i-kadara')  is 1
 220       (x/i- doesn't matter)
 221
 222    similarity_language_tag('en',       'x-kadar')   is 0
 223    similarity_language_tag('x-kadara', 'x-kadar')   is 0
 224       (unrelated tags -- no similarity)
 225
 226    similarity_language_tag('i-cree-syllabic',
 227                            'i-cherokee-syllabic')   is 0
 228       (no B<leftmost> elements in common!)
 229
 230 =cut
 231
 232 sub similarity_language_tag {
 233   my $lang1 = &encode_language_tag($_[0]);
 234   my $lang2 = &encode_language_tag($_[1]);
 235    # And encode_language_tag takes care of the whole
 236    #  no-nyn==nn, i-hakka==zh-hakka, etc, things
 237
 238   # NB: (i-sil-...)?  (i-sgn-...)?
 239
 240   return undef if !defined($lang1) and !defined($lang2);
 241   return 0 if !defined($lang1) or !defined($lang2);
 242
 243   my @l1_subtags = split('-', $lang1);
 244   my @l2_subtags = split('-', $lang2);
 245   my $similarity = 0;
 246
 247   while(@l1_subtags and @l2_subtags) {
 248     if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 249       ++$similarity;
 250     } else {
 251       last;
 252     }
 253   }
 254   return $similarity;
 255 }
 256
 257 ###########################################################################
 258
 259 =item * the function is_dialect_of($lang1, $lang2)
 260
 261 Returns true iff language tag $lang1 represents a subform of
 262 language tag $lang2.
 263
 264 B<Get the order right!  It doesn't work the other way around!>
 265
 266    is_dialect_of('en-US', 'en')            is TRUE
 267      (American English IS a dialect of all-English)
 268
 269    is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 270    is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 271      (Joual is a dialect of (a dialect of) French)
 272
 273    is_dialect_of('en', 'en-US')            is FALSE
 274      (all-English is a NOT dialect of American English)
 275
 276    is_dialect_of('fr', 'en-CA')            is FALSE
 277
 278    is_dialect_of('en',    'en'   )         is TRUE
 279    is_dialect_of('en-US', 'en-US')         is TRUE
 280      (B<Note:> these are degenerate cases)
 281
 282    is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 283      (the x/i thing doesn't matter, nor does case)
 284
 285    is_dialect_of('nn', 'no')               is TRUE
 286      (because 'nn' (New Norse) is aliased to 'no-nyn',
 287       as a special legacy case, and 'no-nyn' is a
 288       subform of 'no' (Norwegian))
 289
 290 =cut
 291
 292 sub is_dialect_of {
 293
 294   my $lang1 = &encode_language_tag($_[0]);
 295   my $lang2 = &encode_language_tag($_[1]);
 296
 297   return undef if !defined($lang1) and !defined($lang2);
 298   return 0 if !defined($lang1) or !defined($lang2);
 299
 300   return 1 if $lang1 eq $lang2;
 301   return 0 if length($lang1) < length($lang2);
 302
 303   $lang1 .= '-';
 304   $lang2 .= '-';
 305   return
 306     (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 307 }
 308
 309 ###########################################################################
 310
 311 =item * the function super_languages($lang1)
 312
 313 Returns a list of language tags that are superordinate tags to $lang1
 314 -- it gets this by removing subtags from the end of $lang1 until
 315 nothing (or just "i" or "x") is left.
 316
 317    super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 318
 319    super_languages("en-AU")  is  ("en")
 320
 321    super_languages("en")  is  empty-list, ()
 322
 323    super_languages("i-cherokee")  is  empty-list, ()
 324     ...not ("i"), which would be illegal as well as pointless.
 325
 326 If $lang1 is not a valid language tag, returns empty-list in
 327 a list context, undef in a scalar context.
 328
 329 A notable and rather unavoidable problem with this method:
 330 "x-mingo-tom" has an "x" because the whole tag isn't an
 331 IANA-registered tag -- but super_languages('x-mingo-tom') is
 332 ('x-mingo') -- which isn't really right, since 'i-mingo' is
 333 registered.  But this module has no way of knowing that.  (But note
 334 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 335
 336 More importantly, you assume I<at your peril> that superordinates of
 337 $lang1 are mutually intelligible with $lang1.  Consider this
 338 carefully.
 339
 340 =cut
 341
 342 sub super_languages {
 343   my $lang1 = $_[0];
 344   return() unless defined($lang1) && &is_language_tag($lang1);
 345
 346   # a hack for those annoying new (2001) tags:
 347   $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
 348   $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
 349   $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
 350    # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
 351
 352   my @l1_subtags = split('-', $lang1);
 353
 354   ## Changes in the language tagging standards may have to be reflected here.
 355
 356   # NB: (i-sil-...)?
 357
 358   my @supers = ();
 359   foreach my $bit (@l1_subtags) {
 360     push @supers,
 361       scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 362   }
 363   pop @supers if @supers;
 364   shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 365   return reverse @supers;
 366 }
 367
 368 ###########################################################################
 369
 370 =item * the function locale2language_tag($locale_identifier)
 371
 372 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 373 and maps it to a language tag.  If it's not mappable (as with,
 374 notably, "C" and "POSIX"), this returns empty-list in a list context,
 375 or undef in a scalar context.
 376
 377    locale2language_tag("en") is "en"
 378
 379    locale2language_tag("en_US") is "en-US"
 380
 381    locale2language_tag("en_US.ISO8859-1") is "en-US"
 382
 383    locale2language_tag("C") is undef or ()
 384
 385    locale2language_tag("POSIX") is undef or ()
 386
 387    locale2language_tag("POSIX") is undef or ()
 388
 389 I'm not totally sure that locale names map satisfactorily to language
 390 tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 391
 392 The output is untainted.  If you don't know what tainting is,
 393 don't worry about it.
 394
 395 =cut
 396
 397 sub locale2language_tag {
 398   my $lang =
 399     $_[0] =~ m/(.+)/  # to make for an untainted result
 400     ? $1 : ''
 401   ;
 402
 403   return $lang if &is_language_tag($lang); # like "en"
 404
 405   $lang =~ tr<_><->;  # "en_US" -> en-US
 406   $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
 407
 408   return $lang if &is_language_tag($lang);
 409
 410   return;
 411 }
 412
 413 ###########################################################################
 414
 415 =item * the function encode_language_tag($lang1)
 416
 417 This function, if given a language tag, returns an encoding of it such
 418 that:
 419
 420 * tags representing different languages never get the same encoding.
 421
 422 * tags representing the same language always get the same encoding.
 423
 424 * an encoding of a formally valid language tag always is a string
 425 value that is defined, has length, and is true if considered as a
 426 boolean.
 427
 428 Note that the encoding itself is B<not> a formally valid language tag.
 429 Note also that you cannot, currently, go from an encoding back to a
 430 language tag that it's an encoding of.
 431
 432 Note also that you B<must> consider the encoded value as atomic; i.e.,
 433 you should not consider it as anything but an opaque, unanalysable
 434 string value.  (The internals of the encoding method may change in
 435 future versions, as the language tagging standard changes over time.)
 436
 437 C<encode_language_tag> returns undef if given anything other than a
 438 formally valid language tag.
 439
 440 The reason C<encode_language_tag> exists is because different language
 441 tags may represent the same language; this is normally treatable with
 442 C<same_language_tag>, but consider this situation:
 443
 444 You have a data file that expresses greetings in different languages.
 445 Its format is "[language tag]=[how to say 'Hello']", like:
 446
 447           en-US=Hiho
 448           fr=Bonjour
 449           i-mingo=Hau'
 450
 451 And suppose you write a program that reads that file and then runs as
 452 a daemon, answering client requests that specify a language tag and
 453 then expect the string that says how to greet in that language.  So an
 454 interaction looks like:
 455
 456           greeting-client asks:    fr
 457           greeting-server answers: Bonjour
 458
 459 So far so good.  But suppose the way you're implementing this is:
 460
 461           my %greetings;
 462           die unless open(IN, "<in.dat");
 463           while(<IN>) {
 464             chomp;
 465             next unless /^([^=]+)=(.+)/s;
 466             my($lang, $expr) = ($1, $2);
 467             $greetings{$lang} = $expr;
 468           }
 469           close(IN);
 470
 471 at which point %greetings has the contents:
 472
 473           "en-US"   => "Hiho"
 474           "fr"      => "Bonjour"
 475           "i-mingo" => "Hau'"
 476
 477 And suppose then that you answer client requests for language $wanted
 478 by just looking up $greetings{$wanted}.
 479
 480 If the client asks for "fr", that will look up successfully in
 481 %greetings, to the value "Bonjour".  And if the client asks for
 482 "i-mingo", that will look up successfully in %greetings, to the value
 483 "Hau'".
 484
 485 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 486 lookup in %greetings fails.  That's the Wrong Thing.
 487
 488 You could instead do lookups on $wanted with:
 489
 490           use I18N::LangTags qw(same_language_tag);
 491           my $repsonse = '';
 492           foreach my $l2 (keys %greetings) {
 493             if(same_language_tag($wanted, $l2)) {
 494               $response = $greetings{$l2};
 495               last;
 496             }
 497           }
 498
 499 But that's rather inefficient.  A better way to do it is to start your
 500 program with:
 501
 502           use I18N::LangTags qw(encode_language_tag);
 503           my %greetings;
 504           die unless open(IN, "<in.dat");
 505           while(<IN>) {
 506             chomp;
 507             next unless /^([^=]+)=(.+)/s;
 508             my($lang, $expr) = ($1, $2);
 509             $greetings{
 510                         encode_language_tag($lang)
 511                       } = $expr;
 512           }
 513           close(IN);
 514
 515 and then just answer client requests for language $wanted by just
 516 looking up
 517
 518           $greetings{encode_language_tag($wanted)}
 519
 520 And that does the Right Thing.
 521
 522 =cut
 523
 524 sub encode_language_tag {
 525   # Only similarity_language_tag() is allowed to analyse encodings!
 526
 527   ## Changes in the language tagging standards may have to be reflected here.
 528
 529   my($tag) = $_[0] || return undef;
 530   return undef unless &is_language_tag($tag);
 531
 532   # For the moment, these legacy variances are few enough that
 533   #  we can just handle them here with regexps.
 534   $tag =~ s/^iw\b/he/i; # Hebrew
 535   $tag =~ s/^in\b/id/i; # Indonesian
 536   $tag =~ s/^cre\b/cr/i; # Cree
 537   $tag =~ s/^jw\b/jv/i; # Javanese
 538   $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
 539   $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
 540   $tag =~ s/^ji\b/yi/i;  # Yiddish
 541   # SMB 2003 -- Hm.  There's a bunch of new XXX->YY variances now,
 542   #  but maybe they're all so obscure I can ignore them.   "Obscure"
 543   #  meaning either that the language is obscure, and/or that the
 544   #  XXX form was extant so briefly that it's unlikely it was ever
 545   #  used.  I hope.
 546   #
 547   # These go FROM the simplex to complex form, to get
 548   #  similarity-comparison right.  And that's okay, since
 549   #  similarity_language_tag is the only thing that
 550   #  analyzes our output.
 551   $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
 552   $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
 553   $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
 554
 555   $tag =~ s/^[xiXI]-//s;
 556    # Just lop off any leading "x/i-"
 557
 558   return "~" . uc($tag);
 559 }
 560
 561 #--------------------------------------------------------------------------
 562
 563 =item * the function alternate_language_tags($lang1)
 564
 565 This function, if given a language tag, returns all language tags that
 566 are alternate forms of this language tag.  (I.e., tags which refer to
 567 the same language.)  This is meant to handle legacy tags caused by
 568 the minor changes in language tag standards over the years; and
 569 the x-/i- alternation is also dealt with.
 570
 571 Note that this function does I<not> try to equate new (and never-used,
 572 and unusable)
 573 ISO639-2 three-letter tags to old (and still in use) ISO639-1
 574 two-letter equivalents -- like "ara" -> "ar" -- because
 575 "ara" has I<never> been in use as an Internet language tag,
 576 and RFC 3066 stipulates that it never should be, since a shorter
 577 tag ("ar") exists.
 578
 579 Examples:
 580
 581           alternate_language_tags('no-bok')       is ('nb')
 582           alternate_language_tags('nb')           is ('no-bok')
 583           alternate_language_tags('he')           is ('iw')
 584           alternate_language_tags('iw')           is ('he')
 585           alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
 586           alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
 587           alternate_language_tags('en')           is ()
 588           alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
 589           alternate_language_tags('x-klikitat')   is ('i-klikitat')
 590           alternate_language_tags('i-klikitat')   is ('x-klikitat')
 591
 592 This function returns empty-list if given anything other than a formally
 593 valid language tag.
 594
 595 =cut
 596
 597 my %alt = qw( i x   x i   I X   X I );
 598 sub alternate_language_tags {
 599   my $tag = $_[0];
 600   return() unless &is_language_tag($tag);
 601
 602   my @em; # push 'em real goood!
 603
 604   # For the moment, these legacy variances are few enough that
 605   #  we can just handle them here with regexps.
 606
 607   if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
 608   } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
 609
 610   } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
 611   } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
 612
 613   } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
 614   } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
 615
 616   } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
 617   } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
 618
 619   } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
 620   } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
 621
 622   } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
 623   } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
 624
 625   } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
 626   } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
 627
 628   } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
 629   } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
 630   }
 631
 632   push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
 633   return @em;
 634 }
 635
 636 ###########################################################################
 637
 638 {
 639   # Init %Panic...
 640
 641   my @panic = (  # MUST all be lowercase!
 642    # Only large ("national") languages make it in this list.
 643    #  If you, as a user, are so bizarre that the /only/ language
 644    #  you claim to accept is Galician, then no, we won't do you
 645    #  the favor of providing Catalan as a panic-fallback for
 646    #  you.  Because if I start trying to add "little languages" in
 647    #  here, I'll just go crazy.
 648
 649    # Scandinavian lgs.  All based on opinion and hearsay.
 650    'sv' => [qw(nb no da nn)],
 651    'da' => [qw(nb no sv nn)], # I guess
 652    [qw(no nn nb)], [qw(no nn nb sv da)],
 653    'is' => [qw(da sv no nb nn)],
 654    'fo' => [qw(da is no nb nn sv)], # I guess
 655
 656    # I think this is about the extent of tolerable intelligibility
 657    #  among large modern Romance languages.
 658    'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
 659    'ca' => [qw(es pt it fr)],
 660    'es' => [qw(ca it fr pt)],
 661    'it' => [qw(es fr ca pt)],
 662    'fr' => [qw(es it ca pt)],
 663
 664    # Also assume that speakers of the main Indian languages prefer
 665    #  to read/hear Hindi over English
 666    [qw(
 667      as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
 668    )] => 'hi',
 669     # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
 670     # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
 671     # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
 672    'hi' => [qw(bn pa as or)],
 673    # I welcome finer data for the other Indian languages.
 674    #  E.g., what should Oriya's list be, besides just Hindi?
 675
 676    # And the panic languages for English is, of course, nil!
 677
 678    # My guesses at Slavic intelligibility:
 679    ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
 680    'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
 681    'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
 682
 683    'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
 684
 685    'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
 686
 687    #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
 688
 689   );
 690   my($k,$v);
 691   while(@panic) {
 692     ($k,$v) = splice(@panic,0,2);
 693     foreach my $k (ref($k) ? @$k : $k) {
 694       foreach my $v (ref($v) ? @$v : $v) {
 695         push @{$Panic{$k} ||= []}, $v unless $k eq $v;
 696       }
 697     }
 698   }
 699 }
 700
 701 =item * the function @langs = panic_languages(@accept_languages)
 702
 703 This function takes a list of 0 or more language
 704 tags that constitute a given user's Accept-Language list, and
 705 returns a list of tags for I<other> (non-super)
 706 languages that are probably acceptable to the user, to be
 707 used I<if all else fails>.
 708
 709 For example, if a user accepts only 'ca' (Catalan) and
 710 'es' (Spanish), and the documents/interfaces you have
 711 available are just in German, Italian, and Chinese, then
 712 the user will most likely want the Italian one (and not
 713 the Chinese or German one!), instead of getting
 714 nothing.  So C<panic_languages('ca', 'es')> returns
 715 a list containing 'it' (Italian).
 716
 717 English ('en') is I<always> in the return list, but
 718 whether it's at the very end or not depends
 719 on the input languages.  This function works by consulting
 720 an internal table that stipulates what common
 721 languages are "close" to each other.
 722
 723 A useful construct you might consider using is:
 724
 725   @fallbacks = super_languages(@accept_languages);
 726   push @fallbacks, panic_languages(
 727     @accept_languages, @fallbacks,
 728   );
 729
 730 =cut
 731
 732 sub panic_languages {
 733   # When in panic or in doubt, run in circles, scream, and shout!
 734   my(@out, %seen);
 735   foreach my $t (@_) {
 736     next unless $t;
 737     next if $seen{$t}++; # so we don't return it or hit it again
 738     # push @out, super_languages($t); # nah, keep that separate
 739     push @out, @{ $Panic{lc $t} || next };
 740   }
 741   return grep !$seen{$_}++,  @out, 'en';
 742 }
 743
 744 #---------------------------------------------------------------------------
 745 #---------------------------------------------------------------------------
 746
 747 =item * the function implicate_supers( ...languages... )
 748
 749 This takes a list of strings (which are presumed to be language-tags;
 750 strings that aren't, are ignored); and after each one, this function
 751 inserts super-ordinate forms that don't already appear in the list.
 752 The original list, plus these insertions, is returned.
 753
 754 In other words, it takes this:
 755
 756   pt-br de-DE en-US fr pt-br-janeiro
 757
 758 and returns this:
 759
 760   pt-br pt de-DE de en-US en fr pt-br-janeiro
 761
 762 This function is most useful in the idiom
 763
 764   implicate_supers( I18N::LangTags::Detect::detect() );
 765
 766 (See L<I18N::LangTags::Detect>.)
 767
 768
 769 =item * the function implicate_supers_strictly( ...languages... )
 770
 771 This works like C<implicate_supers> except that the implicated
 772 forms are added to the end of the return list.
 773
 774 In other words, implicate_supers_strictly takes a list of strings
 775 (which are presumed to be language-tags; strings that aren't, are
 776 ignored) and after the whole given list, it inserts the super-ordinate forms
 777 of all given tags, minus any tags that already appear in the input list.
 778
 779 In other words, it takes this:
 780
 781   pt-br de-DE en-US fr pt-br-janeiro
 782
 783 and returns this:
 784
 785   pt-br de-DE en-US fr pt-br-janeiro pt de en
 786
 787 The reason this function has "_strictly" in its name is that when
 788 you're processing an Accept-Language list according to the RFCs, if
 789 you interpret the RFCs quite strictly, then you would use
 790 implicate_supers_strictly, but for normal use (i.e., common-sense use,
 791 as far as I'm concerned) you'd use implicate_supers.
 792
 793 =cut
 794
 795 sub implicate_supers {
 796   my @languages = grep is_language_tag($_), @_;
 797   my %seen_encoded;
 798   foreach my $lang (@languages) {
 799     $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
 800   }
 801
 802   my(@output_languages);
 803   foreach my $lang (@languages) {
 804     push @output_languages, $lang;
 805     foreach my $s ( I18N::LangTags::super_languages($lang) ) {
 806       # Note that super_languages returns the longest first.
 807       last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
 808       push @output_languages, $s;
 809     }
 810   }
 811   return uniq( @output_languages );
 812
 813 }
 814
 815 sub implicate_supers_strictly {
 816   my @tags = grep is_language_tag($_), @_;
 817   return uniq( @_,   map super_languages($_), @_ );
 818 }
 819
 820
 821
 822 ###########################################################################
 823 1;
 824 __END__
 825
 826 =back
 827
 828 =head1 ABOUT LOWERCASING
 829
 830 I've considered making all the above functions that output language
 831 tags return all those tags strictly in lowercase.  Having all your
 832 language tags in lowercase does make some things easier.  But you
 833 might as well just lowercase as you like, or call
 834 C<encode_language_tag($lang1)> where appropriate.
 835
 836 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 837
 838 In some future version of I18N::LangTags, I plan to include support
 839 for RFC2482-style language tags -- which are basically just normal
 840 language tags with their ASCII characters shifted into Plane 14.
 841
 842 =head1 SEE ALSO
 843
 844 * L<I18N::LangTags::List|I18N::LangTags::List>
 845
 846 * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
 847 Identification of Languages".  (Obsoletes RFC 1766)
 848
 849 * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
 850 Character Sets and Languages".
 851
 852 * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
 853 Value and Encoded Word Extensions: Character Sets, Languages, and
 854 Continuations".
 855
 856 * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
 857 "Language Tagging in Unicode Plain Text".
 858
 859 * Locale::Codes, in
 860 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 861
 862 * ISO 639-2, "Codes for the representation of names of languages",
 863 including two-letter and three-letter codes,
 864 C<http://www.loc.gov/standards/iso639-2/langcodes.html>
 865
 866 * The IANA list of registered languages (hopefully up-to-date),
 867 C<http://www.iana.org/assignments/language-tags>
 868
 869 =head1 COPYRIGHT
 870
 871 Copyright (c) 1998-2004 Sean M. Burke. All rights reserved.
 872
 873 This library is free software; you can redistribute it and/or
 874 modify it under the same terms as Perl itself.
 875
 876 The programs and documentation in this dist are distributed in
 877 the hope that they will be useful, but without any warranty; without
 878 even the implied warranty of merchantability or fitness for a
 879 particular purpose.
 880
 881 =head1 AUTHOR
 882
 883 Sean M. Burke C<sburke@cpan.org>
 884
 885 =cut
 886