Comply with the 0x80th commandment

[perl5.git] / lib / I18N / LangTags.pm
diff --git a/lib/I18N/LangTags.pm b/lib/I18N/LangTags.pm

index c8a64d3..0bdc65f 100644 (file)
--- a/lib/I18N/LangTags.pm
+++ b/lib/I18N/LangTags.pm
@@ -1,23 +1,28 @@
  
-# Time-stamp: "2001-05-25 07:36:55 MDT"
+# Time-stamp: "2004-10-06 23:26:33 ADT"
  # Sean M. Burke <sburke@cpan.org>
  
  require 5.000;
  package I18N::LangTags;
  use strict;
-use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug
+use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
  require Exporter;
-# $Debug = 0;
  @ISA = qw(Exporter);
  @EXPORT = qw();
  @EXPORT_OK = qw(is_language_tag same_language_tag
                  extract_language_tags super_languages
                  similarity_language_tag is_dialect_of
                  locale2language_tag alternate_language_tags
-                encode_language_tag
+                encode_language_tag panic_languages
+                implicate_supers
+                implicate_supers_strictly
                 );
+%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
+
+$VERSION = "0.35";
+
+sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
  
-$VERSION = "0.21";
  
  =head1 NAME
  
@@ -25,16 +30,20 @@ I18N::LangTags - functions for dealing with RFC3066-style language tags
  
  =head1 SYNOPSIS
  
-    use I18N::LangTags qw(is_language_tag same_language_tag
-                          extract_language_tags super_languages
-                          similarity_language_tag is_dialect_of
-                          locale2language_tag alternate_language_tags
-                          encode_language_tag
-                         );
+  use I18N::LangTags();
+
+...or specify whichever of those functions you want to import, like so:
+
+  use I18N::LangTags qw(implicate_supers similarity_language_tag);
+
+All the exportable functions are listed below -- you're free to import
+only some, or none at all.  By default, none are imported.  If you
+say:
  
-...or whatever of those functions you want to import.  Those are
-all the exportable functions -- you're free to import only some,
-or none at all.  By default, none are imported.
+    use I18N::LangTags qw(:ALL)
+
+...then all are exported.  (This saves you from having to use
+something less obvious like C<use I18N::LangTags qw(/./)>.)
  
  If you don't import any of these functions, assume a C<&I18N::LangTags::>
  in front of all the function names in the following examples.
@@ -92,7 +101,7 @@ sub is_language_tag {
    my($tag) = lc($_[0]);
  
    return 0 if $tag eq "i" or $tag eq "x";
-  # Bad degenerate cases the following
+  # Bad degenerate cases that the following
    #  regexp would erroneously let pass
  
    return $tag =~ 
@@ -166,6 +175,8 @@ representing the same language-form.
        (all-English is not the SAME as US English)
     same_language_tag('x-kadara', 'x-kadar')   is FALSE
        (these are totally unrelated tags)
+   same_language_tag('no-bok',    'nb')       is TRUE
+      (no-bok is a legacy tag for nb (Norwegian Bokmal))
  
  C<same_language_tag> works by just seeing whether
  C<encode_language_tag($lang1)> is the same as
@@ -221,7 +232,9 @@ without regard to case and to x/i- alternation.
  sub similarity_language_tag {
    my $lang1 = &encode_language_tag($_[0]);
    my $lang2 = &encode_language_tag($_[1]);
-
+   # And encode_language_tag takes care of the whole
+   #  no-nyn==nn, i-hakka==zh-hakka, etc, things
+   
    # NB: (i-sil-...)?  (i-sgn-...)?
  
    return undef if !defined($lang1) and !defined($lang2);
@@ -245,7 +258,7 @@ sub similarity_language_tag {
  
  =item * the function is_dialect_of($lang1, $lang2)
  
-Returns true iff language tag $lang1 represents a subdialect of
+Returns true iff language tag $lang1 represents a subform of
  language tag $lang2.
  
  B<Get the order right!  It doesn't work the other way around!>
@@ -262,13 +275,18 @@ B<Get the order right!  It doesn't work the other way around!>
  
     is_dialect_of('fr', 'en-CA')            is FALSE
  
-   is_dialect_of('en',    'en'   )            is TRUE
-   is_dialect_of('en-US', 'en-US')            is TRUE
+   is_dialect_of('en',    'en'   )         is TRUE
+   is_dialect_of('en-US', 'en-US')         is TRUE
       (B<Note:> these are degenerate cases)
  
     is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
       (the x/i thing doesn't matter, nor does case)
  
+   is_dialect_of('nn', 'no')               is TRUE
+     (because 'nn' (New Norse) is aliased to 'no-nyn',
+      as a special legacy case, and 'no-nyn' is a
+      subform of 'no' (Norwegian))
+
  =cut
  
  sub is_dialect_of {
@@ -324,6 +342,13 @@ carefully.
  sub super_languages {
    my $lang1 = $_[0];
    return() unless defined($lang1) && &is_language_tag($lang1);
+
+  # a hack for those annoying new (2001) tags:
+  $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
+  $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
+  $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
+   # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
+
    my @l1_subtags = split('-', $lang1);
  
    ## Changes in the language tagging standards may have to be reflected here.
@@ -378,7 +403,8 @@ sub locale2language_tag {
    return $lang if &is_language_tag($lang); # like "en"
  
    $lang =~ tr<_><->;  # "en_US" -> en-US
-  $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
+  $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s;  # "en_US.ISO8859-1" -> en-US
+   # it_IT.utf8@euro => it-IT
  
    return $lang if &is_language_tag($lang);
  
@@ -463,7 +489,7 @@ lookup in %greetings fails.  That's the Wrong Thing.
  You could instead do lookups on $wanted with:
  
            use I18N::LangTags qw(same_language_tag);
-          my $repsonse = '';
+          my $response = '';
            foreach my $l2 (keys %greetings) {
              if(same_language_tag($wanted, $l2)) {
                $response = $greetings{$l2};
@@ -501,15 +527,36 @@ sub encode_language_tag {
  
    ## Changes in the language tagging standards may have to be reflected here.
  
-  my($tag) = uc($_[0]); # smash case
+  my($tag) = $_[0] || return undef;
    return undef unless &is_language_tag($tag);
-   # If it's not a language tag, its encoding is undef
+
+  # For the moment, these legacy variances are few enough that
+  #  we can just handle them here with regexps.
+  $tag =~ s/^iw\b/he/i; # Hebrew
+  $tag =~ s/^in\b/id/i; # Indonesian
+  $tag =~ s/^cre\b/cr/i; # Cree
+  $tag =~ s/^jw\b/jv/i; # Javanese
+  $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
+  $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
+  $tag =~ s/^ji\b/yi/i;  # Yiddish
+  # SMB 2003 -- Hm.  There's a bunch of new XXX->YY variances now,
+  #  but maybe they're all so obscure I can ignore them.   "Obscure"
+  #  meaning either that the language is obscure, and/or that the
+  #  XXX form was extant so briefly that it's unlikely it was ever
+  #  used.  I hope.
+  #
+  # These go FROM the simplex to complex form, to get
+  #  similarity-comparison right.  And that's okay, since
+  #  similarity_language_tag is the only thing that
+  #  analyzes our output.
+  $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
+  $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
+  $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
  
    $tag =~ s/^[xiXI]-//s;
     # Just lop off any leading "x/i-"
-   # Or I suppose I could do s/^[xiXI]-/_/s or something.
  
-  return "~$tag";
+  return "~" . uc($tag);
  }
  
  #--------------------------------------------------------------------------
@@ -517,40 +564,266 @@ sub encode_language_tag {
  =item * the function alternate_language_tags($lang1)
  
  This function, if given a language tag, returns all language tags that
-are alternate forms of this language tag.  (There is little
-alternation in the C<current> language tagging formalism, but
-extensions to the formalism are under consideration which could add a
-great deal of alternation.)
-
-Examples from the current formalism:
-
-          alternate_language_tags('en')           is   ()
-          alternate_language_tags('x-mingo-tom')  is   ('i-mingo-tom')
-          alternate_language_tags('x-klikitat')   is   ('i-klikitat')
-          alternate_language_tags('i-klikitat')   is   ('x-klikitat')
-
-This function returns undef if given anything other than a formally
+are alternate forms of this language tag.  (I.e., tags which refer to
+the same language.)  This is meant to handle legacy tags caused by
+the minor changes in language tag standards over the years; and
+the x-/i- alternation is also dealt with.
+
+Note that this function does I<not> try to equate new (and never-used,
+and unusable)
+ISO639-2 three-letter tags to old (and still in use) ISO639-1
+two-letter equivalents -- like "ara" -> "ar" -- because
+"ara" has I<never> been in use as an Internet language tag,
+and RFC 3066 stipulates that it never should be, since a shorter
+tag ("ar") exists.
+
+Examples:
+
+          alternate_language_tags('no-bok')       is ('nb')
+          alternate_language_tags('nb')           is ('no-bok')
+          alternate_language_tags('he')           is ('iw')
+          alternate_language_tags('iw')           is ('he')
+          alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
+          alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
+          alternate_language_tags('en')           is ()
+          alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
+          alternate_language_tags('x-klikitat')   is ('i-klikitat')
+          alternate_language_tags('i-klikitat')   is ('x-klikitat')
+
+This function returns empty-list if given anything other than a formally
  valid language tag.
  
  =cut
  
  my %alt = qw( i x   x i   I X   X I );
  sub alternate_language_tags {
-  ## Changes in the language tagging standards may have to be reflected here.
    my $tag = $_[0];
    return() unless &is_language_tag($tag);
  
- # might as well preserve case
+  my @em; # push 'em real goood!
+
+  # For the moment, these legacy variances are few enough that
+  #  we can just handle them here with regexps.
+  
+  if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
+  } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
+
+  } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
+  } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
+
+  } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
+  } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
+
+  } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
+  } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
  
-  if($tag =~ /^([XIxi])(-.+)/) {
-    # This handles all the alternation that exists CURRENTLY
-    return($alt{$1} . $2);
+  } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
+  } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
+
+  } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
+  } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
+
+  } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
+  } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
+  
+  } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
+  } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
    }
-  return();
+
+  push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
+  return @em;
  }
  
  ###########################################################################
  
+{
+  # Init %Panic...
+  
+  my @panic = (  # MUST all be lowercase!
+   # Only large ("national") languages make it in this list.
+   #  If you, as a user, are so bizarre that the /only/ language
+   #  you claim to accept is Galician, then no, we won't do you
+   #  the favor of providing Catalan as a panic-fallback for
+   #  you.  Because if I start trying to add "little languages" in
+   #  here, I'll just go crazy.
+
+   # Scandinavian lgs.  All based on opinion and hearsay.
+   'sv' => [qw(nb no da nn)],
+   'da' => [qw(nb no sv nn)], # I guess
+   [qw(no nn nb)], [qw(no nn nb sv da)],
+   'is' => [qw(da sv no nb nn)],
+   'fo' => [qw(da is no nb nn sv)], # I guess
+   
+   # I think this is about the extent of tolerable intelligibility
+   #  among large modern Romance languages.
+   'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
+   'ca' => [qw(es pt it fr)],
+   'es' => [qw(ca it fr pt)],
+   'it' => [qw(es fr ca pt)],
+   'fr' => [qw(es it ca pt)],
+   
+   # Also assume that speakers of the main Indian languages prefer
+   #  to read/hear Hindi over English
+   [qw(
+     as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
+   )] => 'hi',
+    # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
+    # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
+    # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
+   'hi' => [qw(bn pa as or)],
+   # I welcome finer data for the other Indian languages.
+   #  E.g., what should Oriya's list be, besides just Hindi?
+   
+   # And the panic languages for English is, of course, nil!
+
+   # My guesses at Slavic intelligibility:
+   ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
+   'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
+   'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
+
+   'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
+
+   'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
+
+   #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
+
+  );
+  my($k,$v);
+  while(@panic) {
+    ($k,$v) = splice(@panic,0,2);
+    foreach my $k (ref($k) ? @$k : $k) {
+      foreach my $v (ref($v) ? @$v : $v) {
+        push @{$Panic{$k} ||= []}, $v unless $k eq $v;
+      }
+    }
+  }
+}
+
+=item * the function @langs = panic_languages(@accept_languages)
+
+This function takes a list of 0 or more language
+tags that constitute a given user's Accept-Language list, and
+returns a list of tags for I<other> (non-super)
+languages that are probably acceptable to the user, to be
+used I<if all else fails>.
+
+For example, if a user accepts only 'ca' (Catalan) and
+'es' (Spanish), and the documents/interfaces you have
+available are just in German, Italian, and Chinese, then
+the user will most likely want the Italian one (and not
+the Chinese or German one!), instead of getting
+nothing.  So C<panic_languages('ca', 'es')> returns
+a list containing 'it' (Italian).
+
+English ('en') is I<always> in the return list, but
+whether it's at the very end or not depends
+on the input languages.  This function works by consulting
+an internal table that stipulates what common
+languages are "close" to each other.
+
+A useful construct you might consider using is:
+
+  @fallbacks = super_languages(@accept_languages);
+  push @fallbacks, panic_languages(
+    @accept_languages, @fallbacks,
+  );
+
+=cut
+
+sub panic_languages {
+  # When in panic or in doubt, run in circles, scream, and shout!
+  my(@out, %seen);
+  foreach my $t (@_) {
+    next unless $t;
+    next if $seen{$t}++; # so we don't return it or hit it again
+    # push @out, super_languages($t); # nah, keep that separate
+    push @out, @{ $Panic{lc $t} || next };
+  }
+  return grep !$seen{$_}++,  @out, 'en';
+}
+
+#---------------------------------------------------------------------------
+#---------------------------------------------------------------------------
+
+=item * the function implicate_supers( ...languages... )
+
+This takes a list of strings (which are presumed to be language-tags;
+strings that aren't, are ignored); and after each one, this function
+inserts super-ordinate forms that don't already appear in the list.
+The original list, plus these insertions, is returned.
+
+In other words, it takes this:
+
+  pt-br de-DE en-US fr pt-br-janeiro
+
+and returns this:
+
+  pt-br pt de-DE de en-US en fr pt-br-janeiro
+
+This function is most useful in the idiom
+
+  implicate_supers( I18N::LangTags::Detect::detect() );
+
+(See L<I18N::LangTags::Detect>.)
+
+
+=item * the function implicate_supers_strictly( ...languages... )
+
+This works like C<implicate_supers> except that the implicated
+forms are added to the end of the return list.
+
+In other words, implicate_supers_strictly takes a list of strings
+(which are presumed to be language-tags; strings that aren't, are
+ignored) and after the whole given list, it inserts the super-ordinate forms 
+of all given tags, minus any tags that already appear in the input list.
+
+In other words, it takes this:
+
+  pt-br de-DE en-US fr pt-br-janeiro
+
+and returns this:
+
+  pt-br de-DE en-US fr pt-br-janeiro pt de en
+
+The reason this function has "_strictly" in its name is that when
+you're processing an Accept-Language list according to the RFCs, if
+you interpret the RFCs quite strictly, then you would use
+implicate_supers_strictly, but for normal use (i.e., common-sense use,
+as far as I'm concerned) you'd use implicate_supers.
+
+=cut
+
+sub implicate_supers {
+  my @languages = grep is_language_tag($_), @_;
+  my %seen_encoded;
+  foreach my $lang (@languages) {
+    $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
+  }
+
+  my(@output_languages);
+  foreach my $lang (@languages) {
+    push @output_languages, $lang;
+    foreach my $s ( I18N::LangTags::super_languages($lang) ) {
+      # Note that super_languages returns the longest first.
+      last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
+      push @output_languages, $s;
+    }
+  }
+  return uniq( @output_languages );
+
+}
+
+sub implicate_supers_strictly {
+  my @tags = grep is_language_tag($_), @_;
+  return uniq( @_,   map super_languages($_), @_ );
+}
+
+
+
+###########################################################################
+1;
+__END__
+
  =back
  
  =head1 ABOUT LOWERCASING
@@ -569,6 +842,8 @@ language tags with their ASCII characters shifted into Plane 14.
  
  =head1 SEE ALSO
  
+* L<I18N::LangTags::List|I18N::LangTags::List>
+
  * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
  Identification of Languages".  (Obsoletes RFC 1766)
  
@@ -585,19 +860,16 @@ Continuations".
  * Locale::Codes, in
  C<http://www.perl.com/CPAN/modules/by-module/Locale/>
  
-* ISO 639, "Code for the representation of names of languages",
-C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
-
  * ISO 639-2, "Codes for the representation of names of languages",
-including three-letter codes,
-C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
+including two-letter and three-letter codes,
+C<http://www.loc.gov/standards/iso639-2/langcodes.html>
  
  * The IANA list of registered languages (hopefully up-to-date),
-C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
+C<http://www.iana.org/assignments/language-tags>
  
  =head1 COPYRIGHT
  
-Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
+Copyright (c) 1998+ Sean M. Burke. All rights reserved.
  
  This library is free software; you can redistribute it and/or
  modify it under the same terms as Perl itself.
@@ -613,6 +885,3 @@ Sean M. Burke C<sburke@cpan.org>
  
  =cut
  
-1;
-
-__END__