dist/I18N-LangTags/lib/I18N/LangTags.pm

   1
   2 # Time-stamp: "2004-10-06 23:26:33 ADT"
   3 # Sean M. Burke <sburke@cpan.org>
   4
   5 require 5.000;
   6 package I18N::LangTags;
   7 use strict;
   8 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
   9 require Exporter;
  10 @ISA = qw(Exporter);
  11 @EXPORT = qw();
  12 @EXPORT_OK = qw(is_language_tag same_language_tag
  13                 extract_language_tags super_languages
  14                 similarity_language_tag is_dialect_of
  15                 locale2language_tag alternate_language_tags
  16                 encode_language_tag panic_languages
  17                 implicate_supers
  18                 implicate_supers_strictly
  19                );
  20 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
  21
  22 $VERSION = "0.42";
  23
  24 sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
  25
  26
  27 =head1 NAME
  28
  29 I18N::LangTags - functions for dealing with RFC3066-style language tags
  30
  31 =head1 SYNOPSIS
  32
  33   use I18N::LangTags();
  34
  35 ...or specify whichever of those functions you want to import, like so:
  36
  37   use I18N::LangTags qw(implicate_supers similarity_language_tag);
  38
  39 All the exportable functions are listed below -- you're free to import
  40 only some, or none at all.  By default, none are imported.  If you
  41 say:
  42
  43     use I18N::LangTags qw(:ALL)
  44
  45 ...then all are exported.  (This saves you from having to use
  46 something less obvious like C<use I18N::LangTags qw(/./)>.)
  47
  48 If you don't import any of these functions, assume a C<&I18N::LangTags::>
  49 in front of all the function names in the following examples.
  50
  51 =head1 DESCRIPTION
  52
  53 Language tags are a formalism, described in RFC 3066 (obsoleting
  54 1766), for declaring what language form (language and possibly
  55 dialect) a given chunk of information is in.
  56
  57 This library provides functions for common tasks involving language
  58 tags as they are needed in a variety of protocols and applications.
  59
  60 Please see the "See Also" references for a thorough explanation
  61 of how to correctly use language tags.
  62
  63 =over
  64
  65 =cut
  66
  67 ###########################################################################
  68
  69 =item * the function is_language_tag($lang1)
  70
  71 Returns true iff $lang1 is a formally valid language tag.
  72
  73    is_language_tag("fr")            is TRUE
  74    is_language_tag("x-jicarilla")   is FALSE
  75        (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
  76
  77    is_language_tag("sgn-US")    is TRUE
  78        (That's American Sign Language)
  79
  80    is_language_tag("i-Klikitat")    is TRUE
  81        (True without regard to the fact noone has actually
  82         registered Klikitat -- it's a formally valid tag)
  83
  84    is_language_tag("fr-patois")     is TRUE
  85        (Formally valid -- altho descriptively weak!)
  86
  87    is_language_tag("Spanish")       is FALSE
  88    is_language_tag("french-patois") is FALSE
  89        (No good -- first subtag has to match
  90         /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
  91
  92    is_language_tag("x-borg-prot2532") is TRUE
  93        (Yes, subtags can contain digits, as of RFC3066)
  94
  95 =cut
  96
  97 sub is_language_tag {
  98
  99   ## Changes in the language tagging standards may have to be reflected here.
 100
 101   my($tag) = lc($_[0]);
 102
 103   return 0 if $tag eq "i" or $tag eq "x";
 104   # Bad degenerate cases that the following
 105   #  regexp would erroneously let pass
 106
 107   return $tag =~
 108     /^(?:  # First subtag
 109          [xi] | [a-z]{2,3}
 110       )
 111       (?:  # Subtags thereafter
 112          -           # separator
 113          [a-z0-9]{1,8}  # subtag
 114       )*
 115     $/xs ? 1 : 0;
 116 }
 117
 118 ###########################################################################
 119
 120 =item * the function extract_language_tags($whatever)
 121
 122 Returns a list of whatever looks like formally valid language tags
 123 in $whatever.  Not very smart, so don't get too creative with
 124 what you want to feed it.
 125
 126   extract_language_tags("fr, fr-ca, i-mingo")
 127     returns:   ('fr', 'fr-ca', 'i-mingo')
 128
 129   extract_language_tags("It's like this: I'm in fr -- French!")
 130     returns:   ('It', 'in', 'fr')
 131   (So don't just feed it any old thing.)
 132
 133 The output is untainted.  If you don't know what tainting is,
 134 don't worry about it.
 135
 136 =cut
 137
 138 sub extract_language_tags {
 139
 140   ## Changes in the language tagging standards may have to be reflected here.
 141
 142   my($text) =
 143     $_[0] =~ m/(.+)/  # to make for an untainted result
 144     ? $1 : ''
 145   ;
 146
 147   return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
 148     $text =~
 149     m/
 150       \b
 151       (?:  # First subtag
 152          [iIxX] | [a-zA-Z]{2,3}
 153       )
 154       (?:  # Subtags thereafter
 155          -           # separator
 156          [a-zA-Z0-9]{1,8}  # subtag
 157       )*
 158       \b
 159     /xsg
 160   );
 161 }
 162
 163 ###########################################################################
 164
 165 =item * the function same_language_tag($lang1, $lang2)
 166
 167 Returns true iff $lang1 and $lang2 are acceptable variant tags
 168 representing the same language-form.
 169
 170    same_language_tag('x-kadara', 'i-kadara')  is TRUE
 171       (The x/i- alternation doesn't matter)
 172    same_language_tag('X-KADARA', 'i-kadara')  is TRUE
 173       (...and neither does case)
 174    same_language_tag('en',       'en-US')     is FALSE
 175       (all-English is not the SAME as US English)
 176    same_language_tag('x-kadara', 'x-kadar')   is FALSE
 177       (these are totally unrelated tags)
 178    same_language_tag('no-bok',    'nb')       is TRUE
 179       (no-bok is a legacy tag for nb (Norwegian Bokmal))
 180
 181 C<same_language_tag> works by just seeing whether
 182 C<encode_language_tag($lang1)> is the same as
 183 C<encode_language_tag($lang2)>.
 184
 185 (Yes, I know this function is named a bit oddly.  Call it historic
 186 reasons.)
 187
 188 =cut
 189
 190 sub same_language_tag {
 191   my $el1 = &encode_language_tag($_[0]);
 192   return 0 unless defined $el1;
 193    # this avoids the problem of
 194    # encode_language_tag($lang1) eq and encode_language_tag($lang2)
 195    # being true if $lang1 and $lang2 are both undef
 196
 197   return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
 198 }
 199
 200 ###########################################################################
 201
 202 =item * the function similarity_language_tag($lang1, $lang2)
 203
 204 Returns an integer representing the degree of similarity between
 205 tags $lang1 and $lang2 (the order of which does not matter), where
 206 similarity is the number of common elements on the left,
 207 without regard to case and to x/i- alternation.
 208
 209    similarity_language_tag('fr', 'fr-ca')           is 1
 210       (one element in common)
 211    similarity_language_tag('fr-ca', 'fr-FR')        is 1
 212       (one element in common)
 213
 214    similarity_language_tag('fr-CA-joual',
 215                            'fr-CA-PEI')             is 2
 216    similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
 217       (two elements in common)
 218
 219    similarity_language_tag('x-kadara', 'i-kadara')  is 1
 220       (x/i- doesn't matter)
 221
 222    similarity_language_tag('en',       'x-kadar')   is 0
 223    similarity_language_tag('x-kadara', 'x-kadar')   is 0
 224       (unrelated tags -- no similarity)
 225
 226    similarity_language_tag('i-cree-syllabic',
 227                            'i-cherokee-syllabic')   is 0
 228       (no B<leftmost> elements in common!)
 229
 230 =cut
 231
 232 sub similarity_language_tag {
 233   my $lang1 = &encode_language_tag($_[0]);
 234   my $lang2 = &encode_language_tag($_[1]);
 235    # And encode_language_tag takes care of the whole
 236    #  no-nyn==nn, i-hakka==zh-hakka, etc, things
 237
 238   # NB: (i-sil-...)?  (i-sgn-...)?
 239
 240   return undef if !defined($lang1) and !defined($lang2);
 241   return 0 if !defined($lang1) or !defined($lang2);
 242
 243   my @l1_subtags = split('-', $lang1);
 244   my @l2_subtags = split('-', $lang2);
 245   my $similarity = 0;
 246
 247   while(@l1_subtags and @l2_subtags) {
 248     if(shift(@l1_subtags) eq shift(@l2_subtags)) {
 249       ++$similarity;
 250     } else {
 251       last;
 252     }
 253   }
 254   return $similarity;
 255 }
 256
 257 ###########################################################################
 258
 259 =item * the function is_dialect_of($lang1, $lang2)
 260
 261 Returns true iff language tag $lang1 represents a subform of
 262 language tag $lang2.
 263
 264 B<Get the order right!  It doesn't work the other way around!>
 265
 266    is_dialect_of('en-US', 'en')            is TRUE
 267      (American English IS a dialect of all-English)
 268
 269    is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
 270    is_dialect_of('fr-CA-joual', 'fr')      is TRUE
 271      (Joual is a dialect of (a dialect of) French)
 272
 273    is_dialect_of('en', 'en-US')            is FALSE
 274      (all-English is a NOT dialect of American English)
 275
 276    is_dialect_of('fr', 'en-CA')            is FALSE
 277
 278    is_dialect_of('en',    'en'   )         is TRUE
 279    is_dialect_of('en-US', 'en-US')         is TRUE
 280      (B<Note:> these are degenerate cases)
 281
 282    is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
 283      (the x/i thing doesn't matter, nor does case)
 284
 285    is_dialect_of('nn', 'no')               is TRUE
 286      (because 'nn' (New Norse) is aliased to 'no-nyn',
 287       as a special legacy case, and 'no-nyn' is a
 288       subform of 'no' (Norwegian))
 289
 290 =cut
 291
 292 sub is_dialect_of {
 293
 294   my $lang1 = &encode_language_tag($_[0]);
 295   my $lang2 = &encode_language_tag($_[1]);
 296
 297   return undef if !defined($lang1) and !defined($lang2);
 298   return 0 if !defined($lang1) or !defined($lang2);
 299
 300   return 1 if $lang1 eq $lang2;
 301   return 0 if length($lang1) < length($lang2);
 302
 303   $lang1 .= '-';
 304   $lang2 .= '-';
 305   return
 306     (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
 307 }
 308
 309 ###########################################################################
 310
 311 =item * the function super_languages($lang1)
 312
 313 Returns a list of language tags that are superordinate tags to $lang1
 314 -- it gets this by removing subtags from the end of $lang1 until
 315 nothing (or just "i" or "x") is left.
 316
 317    super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
 318
 319    super_languages("en-AU")  is  ("en")
 320
 321    super_languages("en")  is  empty-list, ()
 322
 323    super_languages("i-cherokee")  is  empty-list, ()
 324     ...not ("i"), which would be illegal as well as pointless.
 325
 326 If $lang1 is not a valid language tag, returns empty-list in
 327 a list context, undef in a scalar context.
 328
 329 A notable and rather unavoidable problem with this method:
 330 "x-mingo-tom" has an "x" because the whole tag isn't an
 331 IANA-registered tag -- but super_languages('x-mingo-tom') is
 332 ('x-mingo') -- which isn't really right, since 'i-mingo' is
 333 registered.  But this module has no way of knowing that.  (But note
 334 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
 335
 336 More importantly, you assume I<at your peril> that superordinates of
 337 $lang1 are mutually intelligible with $lang1.  Consider this
 338 carefully.
 339
 340 =cut
 341
 342 sub super_languages {
 343   my $lang1 = $_[0];
 344   return() unless defined($lang1) && &is_language_tag($lang1);
 345
 346   # a hack for those annoying new (2001) tags:
 347   $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
 348   $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
 349   $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
 350    # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
 351
 352   my @l1_subtags = split('-', $lang1);
 353
 354   ## Changes in the language tagging standards may have to be reflected here.
 355
 356   # NB: (i-sil-...)?
 357
 358   my @supers = ();
 359   foreach my $bit (@l1_subtags) {
 360     push @supers,
 361       scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
 362   }
 363   pop @supers if @supers;
 364   shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
 365   return reverse @supers;
 366 }
 367
 368 ###########################################################################
 369
 370 =item * the function locale2language_tag($locale_identifier)
 371
 372 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
 373 and maps it to a language tag.  If it's not mappable (as with,
 374 notably, "C" and "POSIX"), this returns empty-list in a list context,
 375 or undef in a scalar context.
 376
 377    locale2language_tag("en") is "en"
 378
 379    locale2language_tag("en_US") is "en-US"
 380
 381    locale2language_tag("en_US.ISO8859-1") is "en-US"
 382
 383    locale2language_tag("C") is undef or ()
 384
 385    locale2language_tag("POSIX") is undef or ()
 386
 387    locale2language_tag("POSIX") is undef or ()
 388
 389 I'm not totally sure that locale names map satisfactorily to language
 390 tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
 391
 392 The output is untainted.  If you don't know what tainting is,
 393 don't worry about it.
 394
 395 =cut
 396
 397 sub locale2language_tag {
 398   my $lang =
 399     $_[0] =~ m/(.+)/  # to make for an untainted result
 400     ? $1 : ''
 401   ;
 402
 403   return $lang if &is_language_tag($lang); # like "en"
 404
 405   $lang =~ tr<_><->;  # "en_US" -> en-US
 406   $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s;  # "en_US.ISO8859-1" -> en-US
 407    # it_IT.utf8@euro => it-IT
 408
 409   return $lang if &is_language_tag($lang);
 410
 411   return;
 412 }
 413
 414 ###########################################################################
 415
 416 =item * the function encode_language_tag($lang1)
 417
 418 This function, if given a language tag, returns an encoding of it such
 419 that:
 420
 421 * tags representing different languages never get the same encoding.
 422
 423 * tags representing the same language always get the same encoding.
 424
 425 * an encoding of a formally valid language tag always is a string
 426 value that is defined, has length, and is true if considered as a
 427 boolean.
 428
 429 Note that the encoding itself is B<not> a formally valid language tag.
 430 Note also that you cannot, currently, go from an encoding back to a
 431 language tag that it's an encoding of.
 432
 433 Note also that you B<must> consider the encoded value as atomic; i.e.,
 434 you should not consider it as anything but an opaque, unanalysable
 435 string value.  (The internals of the encoding method may change in
 436 future versions, as the language tagging standard changes over time.)
 437
 438 C<encode_language_tag> returns undef if given anything other than a
 439 formally valid language tag.
 440
 441 The reason C<encode_language_tag> exists is because different language
 442 tags may represent the same language; this is normally treatable with
 443 C<same_language_tag>, but consider this situation:
 444
 445 You have a data file that expresses greetings in different languages.
 446 Its format is "[language tag]=[how to say 'Hello']", like:
 447
 448           en-US=Hiho
 449           fr=Bonjour
 450           i-mingo=Hau'
 451
 452 And suppose you write a program that reads that file and then runs as
 453 a daemon, answering client requests that specify a language tag and
 454 then expect the string that says how to greet in that language.  So an
 455 interaction looks like:
 456
 457           greeting-client asks:    fr
 458           greeting-server answers: Bonjour
 459
 460 So far so good.  But suppose the way you're implementing this is:
 461
 462           my %greetings;
 463           die unless open(IN, "<", "in.dat");
 464           while(<IN>) {
 465             chomp;
 466             next unless /^([^=]+)=(.+)/s;
 467             my($lang, $expr) = ($1, $2);
 468             $greetings{$lang} = $expr;
 469           }
 470           close(IN);
 471
 472 at which point %greetings has the contents:
 473
 474           "en-US"   => "Hiho"
 475           "fr"      => "Bonjour"
 476           "i-mingo" => "Hau'"
 477
 478 And suppose then that you answer client requests for language $wanted
 479 by just looking up $greetings{$wanted}.
 480
 481 If the client asks for "fr", that will look up successfully in
 482 %greetings, to the value "Bonjour".  And if the client asks for
 483 "i-mingo", that will look up successfully in %greetings, to the value
 484 "Hau'".
 485
 486 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
 487 lookup in %greetings fails.  That's the Wrong Thing.
 488
 489 You could instead do lookups on $wanted with:
 490
 491           use I18N::LangTags qw(same_language_tag);
 492           my $response = '';
 493           foreach my $l2 (keys %greetings) {
 494             if(same_language_tag($wanted, $l2)) {
 495               $response = $greetings{$l2};
 496               last;
 497             }
 498           }
 499
 500 But that's rather inefficient.  A better way to do it is to start your
 501 program with:
 502
 503           use I18N::LangTags qw(encode_language_tag);
 504           my %greetings;
 505           die unless open(IN, "<", "in.dat");
 506           while(<IN>) {
 507             chomp;
 508             next unless /^([^=]+)=(.+)/s;
 509             my($lang, $expr) = ($1, $2);
 510             $greetings{
 511                         encode_language_tag($lang)
 512                       } = $expr;
 513           }
 514           close(IN);
 515
 516 and then just answer client requests for language $wanted by just
 517 looking up
 518
 519           $greetings{encode_language_tag($wanted)}
 520
 521 And that does the Right Thing.
 522
 523 =cut
 524
 525 sub encode_language_tag {
 526   # Only similarity_language_tag() is allowed to analyse encodings!
 527
 528   ## Changes in the language tagging standards may have to be reflected here.
 529
 530   my($tag) = $_[0] || return undef;
 531   return undef unless &is_language_tag($tag);
 532
 533   # For the moment, these legacy variances are few enough that
 534   #  we can just handle them here with regexps.
 535   $tag =~ s/^iw\b/he/i; # Hebrew
 536   $tag =~ s/^in\b/id/i; # Indonesian
 537   $tag =~ s/^cre\b/cr/i; # Cree
 538   $tag =~ s/^jw\b/jv/i; # Javanese
 539   $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
 540   $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
 541   $tag =~ s/^ji\b/yi/i;  # Yiddish
 542   # SMB 2003 -- Hm.  There's a bunch of new XXX->YY variances now,
 543   #  but maybe they're all so obscure I can ignore them.   "Obscure"
 544   #  meaning either that the language is obscure, and/or that the
 545   #  XXX form was extant so briefly that it's unlikely it was ever
 546   #  used.  I hope.
 547   #
 548   # These go FROM the simplex to complex form, to get
 549   #  similarity-comparison right.  And that's okay, since
 550   #  similarity_language_tag is the only thing that
 551   #  analyzes our output.
 552   $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
 553   $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
 554   $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
 555
 556   $tag =~ s/^[xiXI]-//s;
 557    # Just lop off any leading "x/i-"
 558
 559   return "~" . uc($tag);
 560 }
 561
 562 #--------------------------------------------------------------------------
 563
 564 =item * the function alternate_language_tags($lang1)
 565
 566 This function, if given a language tag, returns all language tags that
 567 are alternate forms of this language tag.  (I.e., tags which refer to
 568 the same language.)  This is meant to handle legacy tags caused by
 569 the minor changes in language tag standards over the years; and
 570 the x-/i- alternation is also dealt with.
 571
 572 Note that this function does I<not> try to equate new (and never-used,
 573 and unusable)
 574 ISO639-2 three-letter tags to old (and still in use) ISO639-1
 575 two-letter equivalents -- like "ara" -> "ar" -- because
 576 "ara" has I<never> been in use as an Internet language tag,
 577 and RFC 3066 stipulates that it never should be, since a shorter
 578 tag ("ar") exists.
 579
 580 Examples:
 581
 582   alternate_language_tags('no-bok')       is ('nb')
 583   alternate_language_tags('nb')           is ('no-bok')
 584   alternate_language_tags('he')           is ('iw')
 585   alternate_language_tags('iw')           is ('he')
 586   alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
 587   alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
 588   alternate_language_tags('en')           is ()
 589   alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
 590   alternate_language_tags('x-klikitat')   is ('i-klikitat')
 591   alternate_language_tags('i-klikitat')   is ('x-klikitat')
 592
 593 This function returns empty-list if given anything other than a formally
 594 valid language tag.
 595
 596 =cut
 597
 598 my %alt = qw( i x   x i   I X   X I );
 599 sub alternate_language_tags {
 600   my $tag = $_[0];
 601   return() unless &is_language_tag($tag);
 602
 603   my @em; # push 'em real goood!
 604
 605   # For the moment, these legacy variances are few enough that
 606   #  we can just handle them here with regexps.
 607
 608   if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
 609   } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
 610
 611   } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
 612   } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
 613
 614   } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
 615   } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
 616
 617   } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
 618   } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
 619
 620   } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
 621   } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
 622
 623   } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
 624   } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
 625
 626   } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
 627   } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
 628
 629   } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
 630   } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
 631   }
 632
 633   push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
 634   return @em;
 635 }
 636
 637 ###########################################################################
 638
 639 {
 640   # Init %Panic...
 641
 642   my @panic = (  # MUST all be lowercase!
 643    # Only large ("national") languages make it in this list.
 644    #  If you, as a user, are so bizarre that the /only/ language
 645    #  you claim to accept is Galician, then no, we won't do you
 646    #  the favor of providing Catalan as a panic-fallback for
 647    #  you.  Because if I start trying to add "little languages" in
 648    #  here, I'll just go crazy.
 649
 650    # Scandinavian lgs.  All based on opinion and hearsay.
 651    'sv' => [qw(nb no da nn)],
 652    'da' => [qw(nb no sv nn)], # I guess
 653    [qw(no nn nb)], [qw(no nn nb sv da)],
 654    'is' => [qw(da sv no nb nn)],
 655    'fo' => [qw(da is no nb nn sv)], # I guess
 656
 657    # I think this is about the extent of tolerable intelligibility
 658    #  among large modern Romance languages.
 659    'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
 660    'ca' => [qw(es pt it fr)],
 661    'es' => [qw(ca it fr pt)],
 662    'it' => [qw(es fr ca pt)],
 663    'fr' => [qw(es it ca pt)],
 664
 665    # Also assume that speakers of the main Indian languages prefer
 666    #  to read/hear Hindi over English
 667    [qw(
 668      as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
 669    )] => 'hi',
 670     # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
 671     # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
 672     # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
 673    'hi' => [qw(bn pa as or)],
 674    # I welcome finer data for the other Indian languages.
 675    #  E.g., what should Oriya's list be, besides just Hindi?
 676
 677    # And the panic languages for English is, of course, nil!
 678
 679    # My guesses at Slavic intelligibility:
 680    ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
 681    ([qw(sr hr bs)]) x 2,  # Serbian, Croatian, Bosnian
 682    'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
 683
 684    'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
 685
 686    'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
 687
 688    #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
 689
 690   );
 691   my($k,$v);
 692   while(@panic) {
 693     ($k,$v) = splice(@panic,0,2);
 694     foreach my $k (ref($k) ? @$k : $k) {
 695       foreach my $v (ref($v) ? @$v : $v) {
 696         push @{$Panic{$k} ||= []}, $v unless $k eq $v;
 697       }
 698     }
 699   }
 700 }
 701
 702 =item * the function @langs = panic_languages(@accept_languages)
 703
 704 This function takes a list of 0 or more language
 705 tags that constitute a given user's Accept-Language list, and
 706 returns a list of tags for I<other> (non-super)
 707 languages that are probably acceptable to the user, to be
 708 used I<if all else fails>.
 709
 710 For example, if a user accepts only 'ca' (Catalan) and
 711 'es' (Spanish), and the documents/interfaces you have
 712 available are just in German, Italian, and Chinese, then
 713 the user will most likely want the Italian one (and not
 714 the Chinese or German one!), instead of getting
 715 nothing.  So C<panic_languages('ca', 'es')> returns
 716 a list containing 'it' (Italian).
 717
 718 English ('en') is I<always> in the return list, but
 719 whether it's at the very end or not depends
 720 on the input languages.  This function works by consulting
 721 an internal table that stipulates what common
 722 languages are "close" to each other.
 723
 724 A useful construct you might consider using is:
 725
 726   @fallbacks = super_languages(@accept_languages);
 727   push @fallbacks, panic_languages(
 728     @accept_languages, @fallbacks,
 729   );
 730
 731 =cut
 732
 733 sub panic_languages {
 734   # When in panic or in doubt, run in circles, scream, and shout!
 735   my(@out, %seen);
 736   foreach my $t (@_) {
 737     next unless $t;
 738     next if $seen{$t}++; # so we don't return it or hit it again
 739     # push @out, super_languages($t); # nah, keep that separate
 740     push @out, @{ $Panic{lc $t} || next };
 741   }
 742   return grep !$seen{$_}++,  @out, 'en';
 743 }
 744
 745 #---------------------------------------------------------------------------
 746 #---------------------------------------------------------------------------
 747
 748 =item * the function implicate_supers( ...languages... )
 749
 750 This takes a list of strings (which are presumed to be language-tags;
 751 strings that aren't, are ignored); and after each one, this function
 752 inserts super-ordinate forms that don't already appear in the list.
 753 The original list, plus these insertions, is returned.
 754
 755 In other words, it takes this:
 756
 757   pt-br de-DE en-US fr pt-br-janeiro
 758
 759 and returns this:
 760
 761   pt-br pt de-DE de en-US en fr pt-br-janeiro
 762
 763 This function is most useful in the idiom
 764
 765   implicate_supers( I18N::LangTags::Detect::detect() );
 766
 767 (See L<I18N::LangTags::Detect>.)
 768
 769
 770 =item * the function implicate_supers_strictly( ...languages... )
 771
 772 This works like C<implicate_supers> except that the implicated
 773 forms are added to the end of the return list.
 774
 775 In other words, implicate_supers_strictly takes a list of strings
 776 (which are presumed to be language-tags; strings that aren't, are
 777 ignored) and after the whole given list, it inserts the super-ordinate forms
 778 of all given tags, minus any tags that already appear in the input list.
 779
 780 In other words, it takes this:
 781
 782   pt-br de-DE en-US fr pt-br-janeiro
 783
 784 and returns this:
 785
 786   pt-br de-DE en-US fr pt-br-janeiro pt de en
 787
 788 The reason this function has "_strictly" in its name is that when
 789 you're processing an Accept-Language list according to the RFCs, if
 790 you interpret the RFCs quite strictly, then you would use
 791 implicate_supers_strictly, but for normal use (i.e., common-sense use,
 792 as far as I'm concerned) you'd use implicate_supers.
 793
 794 =cut
 795
 796 sub implicate_supers {
 797   my @languages = grep is_language_tag($_), @_;
 798   my %seen_encoded;
 799   foreach my $lang (@languages) {
 800     $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
 801   }
 802
 803   my(@output_languages);
 804   foreach my $lang (@languages) {
 805     push @output_languages, $lang;
 806     foreach my $s ( I18N::LangTags::super_languages($lang) ) {
 807       # Note that super_languages returns the longest first.
 808       last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
 809       push @output_languages, $s;
 810     }
 811   }
 812   return uniq( @output_languages );
 813
 814 }
 815
 816 sub implicate_supers_strictly {
 817   my @tags = grep is_language_tag($_), @_;
 818   return uniq( @_,   map super_languages($_), @_ );
 819 }
 820
 821
 822
 823 ###########################################################################
 824 1;
 825 __END__
 826
 827 =back
 828
 829 =head1 ABOUT LOWERCASING
 830
 831 I've considered making all the above functions that output language
 832 tags return all those tags strictly in lowercase.  Having all your
 833 language tags in lowercase does make some things easier.  But you
 834 might as well just lowercase as you like, or call
 835 C<encode_language_tag($lang1)> where appropriate.
 836
 837 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
 838
 839 In some future version of I18N::LangTags, I plan to include support
 840 for RFC2482-style language tags -- which are basically just normal
 841 language tags with their ASCII characters shifted into Plane 14.
 842
 843 =head1 SEE ALSO
 844
 845 * L<I18N::LangTags::List|I18N::LangTags::List>
 846
 847 * RFC 3066, C<http://www.ietf.org/rfc/rfc3066.txt>, "Tags for the
 848 Identification of Languages".  (Obsoletes RFC 1766)
 849
 850 * RFC 2277, C<http://www.ietf.org/rfc/rfc2277.txt>, "IETF Policy on
 851 Character Sets and Languages".
 852
 853 * RFC 2231, C<http://www.ietf.org/rfc/rfc2231.txt>, "MIME Parameter
 854 Value and Encoded Word Extensions: Character Sets, Languages, and
 855 Continuations".
 856
 857 * RFC 2482, C<http://www.ietf.org/rfc/rfc2482.txt>,
 858 "Language Tagging in Unicode Plain Text".
 859
 860 * Locale::Codes, in
 861 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
 862
 863 * ISO 639-2, "Codes for the representation of names of languages",
 864 including two-letter and three-letter codes,
 865 C<http://www.loc.gov/standards/iso639-2/php/code_list.php>
 866
 867 * The IANA list of registered languages (hopefully up-to-date),
 868 C<http://www.iana.org/assignments/language-tags>
 869
 870 =head1 COPYRIGHT
 871
 872 Copyright (c) 1998+ Sean M. Burke. All rights reserved.
 873
 874 This library is free software; you can redistribute it and/or
 875 modify it under the same terms as Perl itself.
 876
 877 The programs and documentation in this dist are distributed in
 878 the hope that they will be useful, but without any warranty; without
 879 even the implied warranty of merchantability or fitness for a
 880 particular purpose.
 881
 882 =head1 AUTHOR
 883
 884 Sean M. Burke C<sburke@cpan.org>
 885
 886 =cut
 887