2 # Time-stamp: "2004-10-06 23:26:33 ADT"
3 # Sean M. Burke <sburke@cpan.org>
6 package I18N::LangTags;
8 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
12 @EXPORT_OK = qw(is_language_tag same_language_tag
13 extract_language_tags super_languages
14 similarity_language_tag is_dialect_of
15 locale2language_tag alternate_language_tags
16 encode_language_tag panic_languages
18 implicate_supers_strictly
20 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
24 sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
29 I18N::LangTags - functions for dealing with RFC3066-style language tags
35 ...or specify whichever of those functions you want to import, like so:
37 use I18N::LangTags qw(implicate_supers similarity_language_tag);
39 All the exportable functions are listed below -- you're free to import
40 only some, or none at all. By default, none are imported. If you
43 use I18N::LangTags qw(:ALL)
45 ...then all are exported. (This saves you from having to use
46 something less obvious like C<use I18N::LangTags qw(/./)>.)
48 If you don't import any of these functions, assume a C<&I18N::LangTags::>
49 in front of all the function names in the following examples.
53 Language tags are a formalism, described in RFC 3066 (obsoleting
54 1766), for declaring what language form (language and possibly
55 dialect) a given chunk of information is in.
57 This library provides functions for common tasks involving language
58 tags as they are needed in a variety of protocols and applications.
60 Please see the "See Also" references for a thorough explanation
61 of how to correctly use language tags.
67 ###########################################################################
69 =item * the function is_language_tag($lang1)
71 Returns true iff $lang1 is a formally valid language tag.
73 is_language_tag("fr") is TRUE
74 is_language_tag("x-jicarilla") is FALSE
75 (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
77 is_language_tag("sgn-US") is TRUE
78 (That's American Sign Language)
80 is_language_tag("i-Klikitat") is TRUE
81 (True without regard to the fact noone has actually
82 registered Klikitat -- it's a formally valid tag)
84 is_language_tag("fr-patois") is TRUE
85 (Formally valid -- altho descriptively weak!)
87 is_language_tag("Spanish") is FALSE
88 is_language_tag("french-patois") is FALSE
89 (No good -- first subtag has to match
90 /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
92 is_language_tag("x-borg-prot2532") is TRUE
93 (Yes, subtags can contain digits, as of RFC3066)
99 ## Changes in the language tagging standards may have to be reflected here.
101 my($tag) = lc($_[0]);
103 return 0 if $tag eq "i" or $tag eq "x";
104 # Bad degenerate cases that the following
105 # regexp would erroneously let pass
111 (?: # Subtags thereafter
113 [a-z0-9]{1,8} # subtag
118 ###########################################################################
120 =item * the function extract_language_tags($whatever)
122 Returns a list of whatever looks like formally valid language tags
123 in $whatever. Not very smart, so don't get too creative with
124 what you want to feed it.
126 extract_language_tags("fr, fr-ca, i-mingo")
127 returns: ('fr', 'fr-ca', 'i-mingo')
129 extract_language_tags("It's like this: I'm in fr -- French!")
130 returns: ('It', 'in', 'fr')
131 (So don't just feed it any old thing.)
133 The output is untainted. If you don't know what tainting is,
134 don't worry about it.
138 sub extract_language_tags {
140 ## Changes in the language tagging standards may have to be reflected here.
143 $_[0] =~ m/(.+)/ # to make for an untainted result
147 return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
152 [iIxX] | [a-zA-Z]{2,3}
154 (?: # Subtags thereafter
156 [a-zA-Z0-9]{1,8} # subtag
163 ###########################################################################
165 =item * the function same_language_tag($lang1, $lang2)
167 Returns true iff $lang1 and $lang2 are acceptable variant tags
168 representing the same language-form.
170 same_language_tag('x-kadara', 'i-kadara') is TRUE
171 (The x/i- alternation doesn't matter)
172 same_language_tag('X-KADARA', 'i-kadara') is TRUE
173 (...and neither does case)
174 same_language_tag('en', 'en-US') is FALSE
175 (all-English is not the SAME as US English)
176 same_language_tag('x-kadara', 'x-kadar') is FALSE
177 (these are totally unrelated tags)
178 same_language_tag('no-bok', 'nb') is TRUE
179 (no-bok is a legacy tag for nb (Norwegian Bokmal))
181 C<same_language_tag> works by just seeing whether
182 C<encode_language_tag($lang1)> is the same as
183 C<encode_language_tag($lang2)>.
185 (Yes, I know this function is named a bit oddly. Call it historic
190 sub same_language_tag {
191 my $el1 = &encode_language_tag($_[0]);
192 return 0 unless defined $el1;
193 # this avoids the problem of
194 # encode_language_tag($lang1) eq and encode_language_tag($lang2)
195 # being true if $lang1 and $lang2 are both undef
197 return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
200 ###########################################################################
202 =item * the function similarity_language_tag($lang1, $lang2)
204 Returns an integer representing the degree of similarity between
205 tags $lang1 and $lang2 (the order of which does not matter), where
206 similarity is the number of common elements on the left,
207 without regard to case and to x/i- alternation.
209 similarity_language_tag('fr', 'fr-ca') is 1
210 (one element in common)
211 similarity_language_tag('fr-ca', 'fr-FR') is 1
212 (one element in common)
214 similarity_language_tag('fr-CA-joual',
216 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
217 (two elements in common)
219 similarity_language_tag('x-kadara', 'i-kadara') is 1
220 (x/i- doesn't matter)
222 similarity_language_tag('en', 'x-kadar') is 0
223 similarity_language_tag('x-kadara', 'x-kadar') is 0
224 (unrelated tags -- no similarity)
226 similarity_language_tag('i-cree-syllabic',
227 'i-cherokee-syllabic') is 0
228 (no B<leftmost> elements in common!)
232 sub similarity_language_tag {
233 my $lang1 = &encode_language_tag($_[0]);
234 my $lang2 = &encode_language_tag($_[1]);
235 # And encode_language_tag takes care of the whole
236 # no-nyn==nn, i-hakka==zh-hakka, etc, things
238 # NB: (i-sil-...)? (i-sgn-...)?
240 return undef if !defined($lang1) and !defined($lang2);
241 return 0 if !defined($lang1) or !defined($lang2);
243 my @l1_subtags = split('-', $lang1);
244 my @l2_subtags = split('-', $lang2);
247 while(@l1_subtags and @l2_subtags) {
248 if(shift(@l1_subtags) eq shift(@l2_subtags)) {
257 ###########################################################################
259 =item * the function is_dialect_of($lang1, $lang2)
261 Returns true iff language tag $lang1 represents a subform of
264 B<Get the order right! It doesn't work the other way around!>
266 is_dialect_of('en-US', 'en') is TRUE
267 (American English IS a dialect of all-English)
269 is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
270 is_dialect_of('fr-CA-joual', 'fr') is TRUE
271 (Joual is a dialect of (a dialect of) French)
273 is_dialect_of('en', 'en-US') is FALSE
274 (all-English is a NOT dialect of American English)
276 is_dialect_of('fr', 'en-CA') is FALSE
278 is_dialect_of('en', 'en' ) is TRUE
279 is_dialect_of('en-US', 'en-US') is TRUE
280 (B<Note:> these are degenerate cases)
282 is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
283 (the x/i thing doesn't matter, nor does case)
285 is_dialect_of('nn', 'no') is TRUE
286 (because 'nn' (New Norse) is aliased to 'no-nyn',
287 as a special legacy case, and 'no-nyn' is a
288 subform of 'no' (Norwegian))
294 my $lang1 = &encode_language_tag($_[0]);
295 my $lang2 = &encode_language_tag($_[1]);
297 return undef if !defined($lang1) and !defined($lang2);
298 return 0 if !defined($lang1) or !defined($lang2);
300 return 1 if $lang1 eq $lang2;
301 return 0 if length($lang1) < length($lang2);
306 (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
309 ###########################################################################
311 =item * the function super_languages($lang1)
313 Returns a list of language tags that are superordinate tags to $lang1
314 -- it gets this by removing subtags from the end of $lang1 until
315 nothing (or just "i" or "x") is left.
317 super_languages("fr-CA-joual") is ("fr-CA", "fr")
319 super_languages("en-AU") is ("en")
321 super_languages("en") is empty-list, ()
323 super_languages("i-cherokee") is empty-list, ()
324 ...not ("i"), which would be illegal as well as pointless.
326 If $lang1 is not a valid language tag, returns empty-list in
327 a list context, undef in a scalar context.
329 A notable and rather unavoidable problem with this method:
330 "x-mingo-tom" has an "x" because the whole tag isn't an
331 IANA-registered tag -- but super_languages('x-mingo-tom') is
332 ('x-mingo') -- which isn't really right, since 'i-mingo' is
333 registered. But this module has no way of knowing that. (But note
334 that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
336 More importantly, you assume I<at your peril> that superordinates of
337 $lang1 are mutually intelligible with $lang1. Consider this
342 sub super_languages {
344 return() unless defined($lang1) && &is_language_tag($lang1);
346 # a hack for those annoying new (2001) tags:
347 $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
348 $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
349 $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
350 # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
352 my @l1_subtags = split('-', $lang1);
354 ## Changes in the language tagging standards may have to be reflected here.
359 foreach my $bit (@l1_subtags) {
361 scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
363 pop @supers if @supers;
364 shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
365 return reverse @supers;
368 ###########################################################################
370 =item * the function locale2language_tag($locale_identifier)
372 This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
373 and maps it to a language tag. If it's not mappable (as with,
374 notably, "C" and "POSIX"), this returns empty-list in a list context,
375 or undef in a scalar context.
377 locale2language_tag("en") is "en"
379 locale2language_tag("en_US") is "en-US"
381 locale2language_tag("en_US.ISO8859-1") is "en-US"
383 locale2language_tag("C") is undef or ()
385 locale2language_tag("POSIX") is undef or ()
387 locale2language_tag("POSIX") is undef or ()
389 I'm not totally sure that locale names map satisfactorily to language
390 tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
392 The output is untainted. If you don't know what tainting is,
393 don't worry about it.
397 sub locale2language_tag {
399 $_[0] =~ m/(.+)/ # to make for an untainted result
403 return $lang if &is_language_tag($lang); # like "en"
405 $lang =~ tr<_><->; # "en_US" -> en-US
406 $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US
407 # it_IT.utf8@euro => it-IT
409 return $lang if &is_language_tag($lang);
414 ###########################################################################
416 =item * the function encode_language_tag($lang1)
418 This function, if given a language tag, returns an encoding of it such
421 * tags representing different languages never get the same encoding.
423 * tags representing the same language always get the same encoding.
425 * an encoding of a formally valid language tag always is a string
426 value that is defined, has length, and is true if considered as a
429 Note that the encoding itself is B<not> a formally valid language tag.
430 Note also that you cannot, currently, go from an encoding back to a
431 language tag that it's an encoding of.
433 Note also that you B<must> consider the encoded value as atomic; i.e.,
434 you should not consider it as anything but an opaque, unanalysable
435 string value. (The internals of the encoding method may change in
436 future versions, as the language tagging standard changes over time.)
438 C<encode_language_tag> returns undef if given anything other than a
439 formally valid language tag.
441 The reason C<encode_language_tag> exists is because different language
442 tags may represent the same language; this is normally treatable with
443 C<same_language_tag>, but consider this situation:
445 You have a data file that expresses greetings in different languages.
446 Its format is "[language tag]=[how to say 'Hello']", like:
452 And suppose you write a program that reads that file and then runs as
453 a daemon, answering client requests that specify a language tag and
454 then expect the string that says how to greet in that language. So an
455 interaction looks like:
457 greeting-client asks: fr
458 greeting-server answers: Bonjour
460 So far so good. But suppose the way you're implementing this is:
463 die unless open(IN, "<in.dat");
466 next unless /^([^=]+)=(.+)/s;
467 my($lang, $expr) = ($1, $2);
468 $greetings{$lang} = $expr;
472 at which point %greetings has the contents:
478 And suppose then that you answer client requests for language $wanted
479 by just looking up $greetings{$wanted}.
481 If the client asks for "fr", that will look up successfully in
482 %greetings, to the value "Bonjour". And if the client asks for
483 "i-mingo", that will look up successfully in %greetings, to the value
486 But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
487 lookup in %greetings fails. That's the Wrong Thing.
489 You could instead do lookups on $wanted with:
491 use I18N::LangTags qw(same_language_tag);
493 foreach my $l2 (keys %greetings) {
494 if(same_language_tag($wanted, $l2)) {
495 $response = $greetings{$l2};
500 But that's rather inefficient. A better way to do it is to start your
503 use I18N::LangTags qw(encode_language_tag);
505 die unless open(IN, "<in.dat");
508 next unless /^([^=]+)=(.+)/s;
509 my($lang, $expr) = ($1, $2);
511 encode_language_tag($lang)
516 and then just answer client requests for language $wanted by just
519 $greetings{encode_language_tag($wanted)}
521 And that does the Right Thing.
525 sub encode_language_tag {
526 # Only similarity_language_tag() is allowed to analyse encodings!
528 ## Changes in the language tagging standards may have to be reflected here.
530 my($tag) = $_[0] || return undef;
531 return undef unless &is_language_tag($tag);
533 # For the moment, these legacy variances are few enough that
534 # we can just handle them here with regexps.
535 $tag =~ s/^iw\b/he/i; # Hebrew
536 $tag =~ s/^in\b/id/i; # Indonesian
537 $tag =~ s/^cre\b/cr/i; # Cree
538 $tag =~ s/^jw\b/jv/i; # Javanese
539 $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
540 $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
541 $tag =~ s/^ji\b/yi/i; # Yiddish
542 # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
543 # but maybe they're all so obscure I can ignore them. "Obscure"
544 # meaning either that the language is obscure, and/or that the
545 # XXX form was extant so briefly that it's unlikely it was ever
548 # These go FROM the simplex to complex form, to get
549 # similarity-comparison right. And that's okay, since
550 # similarity_language_tag is the only thing that
551 # analyzes our output.
552 $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
553 $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
554 $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
556 $tag =~ s/^[xiXI]-//s;
557 # Just lop off any leading "x/i-"
559 return "~" . uc($tag);
562 #--------------------------------------------------------------------------
564 =item * the function alternate_language_tags($lang1)
566 This function, if given a language tag, returns all language tags that
567 are alternate forms of this language tag. (I.e., tags which refer to
568 the same language.) This is meant to handle legacy tags caused by
569 the minor changes in language tag standards over the years; and
570 the x-/i- alternation is also dealt with.
572 Note that this function does I<not> try to equate new (and never-used,
574 ISO639-2 three-letter tags to old (and still in use) ISO639-1
575 two-letter equivalents -- like "ara" -> "ar" -- because
576 "ara" has I<never> been in use as an Internet language tag,
577 and RFC 3066 stipulates that it never should be, since a shorter
582 alternate_language_tags('no-bok') is ('nb')
583 alternate_language_tags('nb') is ('no-bok')
584 alternate_language_tags('he') is ('iw')
585 alternate_language_tags('iw') is ('he')
586 alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
587 alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
588 alternate_language_tags('en') is ()
589 alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
590 alternate_language_tags('x-klikitat') is ('i-klikitat')
591 alternate_language_tags('i-klikitat') is ('x-klikitat')
593 This function returns empty-list if given anything other than a formally
598 my %alt = qw( i x x i I X X I );
599 sub alternate_language_tags {
601 return() unless &is_language_tag($tag);
603 my @em; # push 'em real goood!
605 # For the moment, these legacy variances are few enough that
606 # we can just handle them here with regexps.
608 if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
609 } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
611 } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
612 } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
614 } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
615 } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
617 } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
618 } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
620 } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
621 } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
623 } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
624 } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
626 } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
627 } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
629 } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
630 } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
633 push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
637 ###########################################################################
642 my @panic = ( # MUST all be lowercase!
643 # Only large ("national") languages make it in this list.
644 # If you, as a user, are so bizarre that the /only/ language
645 # you claim to accept is Galician, then no, we won't do you
646 # the favor of providing Catalan as a panic-fallback for
647 # you. Because if I start trying to add "little languages" in
648 # here, I'll just go crazy.
650 # Scandinavian lgs. All based on opinion and hearsay.
651 'sv' => [qw(nb no da nn)],
652 'da' => [qw(nb no sv nn)], # I guess
653 [qw(no nn nb)], [qw(no nn nb sv da)],
654 'is' => [qw(da sv no nb nn)],
655 'fo' => [qw(da is no nb nn sv)], # I guess
657 # I think this is about the extent of tolerable intelligibility
658 # among large modern Romance languages.
659 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
660 'ca' => [qw(es pt it fr)],
661 'es' => [qw(ca it fr pt)],
662 'it' => [qw(es fr ca pt)],
663 'fr' => [qw(es it ca pt)],
665 # Also assume that speakers of the main Indian languages prefer
666 # to read/hear Hindi over English
668 as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
670 # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
671 # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
672 # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
673 'hi' => [qw(bn pa as or)],
674 # I welcome finer data for the other Indian languages.
675 # E.g., what should Oriya's list be, besides just Hindi?
677 # And the panic languages for English is, of course, nil!
679 # My guesses at Slavic intelligibility:
680 ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
681 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
682 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
684 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
686 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
688 #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
693 ($k,$v) = splice(@panic,0,2);
694 foreach my $k (ref($k) ? @$k : $k) {
695 foreach my $v (ref($v) ? @$v : $v) {
696 push @{$Panic{$k} ||= []}, $v unless $k eq $v;
702 =item * the function @langs = panic_languages(@accept_languages)
704 This function takes a list of 0 or more language
705 tags that constitute a given user's Accept-Language list, and
706 returns a list of tags for I<other> (non-super)
707 languages that are probably acceptable to the user, to be
708 used I<if all else fails>.
710 For example, if a user accepts only 'ca' (Catalan) and
711 'es' (Spanish), and the documents/interfaces you have
712 available are just in German, Italian, and Chinese, then
713 the user will most likely want the Italian one (and not
714 the Chinese or German one!), instead of getting
715 nothing. So C<panic_languages('ca', 'es')> returns
716 a list containing 'it' (Italian).
718 English ('en') is I<always> in the return list, but
719 whether it's at the very end or not depends
720 on the input languages. This function works by consulting
721 an internal table that stipulates what common
722 languages are "close" to each other.
724 A useful construct you might consider using is:
726 @fallbacks = super_languages(@accept_languages);
727 push @fallbacks, panic_languages(
728 @accept_languages, @fallbacks,
733 sub panic_languages {
734 # When in panic or in doubt, run in circles, scream, and shout!
738 next if $seen{$t}++; # so we don't return it or hit it again
739 # push @out, super_languages($t); # nah, keep that separate
740 push @out, @{ $Panic{lc $t} || next };
742 return grep !$seen{$_}++, @out, 'en';
745 #---------------------------------------------------------------------------
746 #---------------------------------------------------------------------------
748 =item * the function implicate_supers( ...languages... )
750 This takes a list of strings (which are presumed to be language-tags;
751 strings that aren't, are ignored); and after each one, this function
752 inserts super-ordinate forms that don't already appear in the list.
753 The original list, plus these insertions, is returned.
755 In other words, it takes this:
757 pt-br de-DE en-US fr pt-br-janeiro
761 pt-br pt de-DE de en-US en fr pt-br-janeiro
763 This function is most useful in the idiom
765 implicate_supers( I18N::LangTags::Detect::detect() );
767 (See L<I18N::LangTags::Detect>.)
770 =item * the function implicate_supers_strictly( ...languages... )
772 This works like C<implicate_supers> except that the implicated
773 forms are added to the end of the return list.
775 In other words, implicate_supers_strictly takes a list of strings
776 (which are presumed to be language-tags; strings that aren't, are
777 ignored) and after the whole given list, it inserts the super-ordinate forms
778 of all given tags, minus any tags that already appear in the input list.
780 In other words, it takes this:
782 pt-br de-DE en-US fr pt-br-janeiro
786 pt-br de-DE en-US fr pt-br-janeiro pt de en
788 The reason this function has "_strictly" in its name is that when
789 you're processing an Accept-Language list according to the RFCs, if
790 you interpret the RFCs quite strictly, then you would use
791 implicate_supers_strictly, but for normal use (i.e., common-sense use,
792 as far as I'm concerned) you'd use implicate_supers.
796 sub implicate_supers {
797 my @languages = grep is_language_tag($_), @_;
799 foreach my $lang (@languages) {
800 $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
803 my(@output_languages);
804 foreach my $lang (@languages) {
805 push @output_languages, $lang;
806 foreach my $s ( I18N::LangTags::super_languages($lang) ) {
807 # Note that super_languages returns the longest first.
808 last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
809 push @output_languages, $s;
812 return uniq( @output_languages );
816 sub implicate_supers_strictly {
817 my @tags = grep is_language_tag($_), @_;
818 return uniq( @_, map super_languages($_), @_ );
823 ###########################################################################
829 =head1 ABOUT LOWERCASING
831 I've considered making all the above functions that output language
832 tags return all those tags strictly in lowercase. Having all your
833 language tags in lowercase does make some things easier. But you
834 might as well just lowercase as you like, or call
835 C<encode_language_tag($lang1)> where appropriate.
837 =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
839 In some future version of I18N::LangTags, I plan to include support
840 for RFC2482-style language tags -- which are basically just normal
841 language tags with their ASCII characters shifted into Plane 14.
845 * L<I18N::LangTags::List|I18N::LangTags::List>
847 * RFC 3066, C<http://www.ietf.org/rfc/rfc3066.txt>, "Tags for the
848 Identification of Languages". (Obsoletes RFC 1766)
850 * RFC 2277, C<http://www.ietf.org/rfc/rfc2277.txt>, "IETF Policy on
851 Character Sets and Languages".
853 * RFC 2231, C<http://www.ietf.org/rfc/rfc2231.txt>, "MIME Parameter
854 Value and Encoded Word Extensions: Character Sets, Languages, and
857 * RFC 2482, C<http://www.ietf.org/rfc/rfc2482.txt>,
858 "Language Tagging in Unicode Plain Text".
861 C<http://www.perl.com/CPAN/modules/by-module/Locale/>
863 * ISO 639-2, "Codes for the representation of names of languages",
864 including two-letter and three-letter codes,
865 C<http://www.loc.gov/standards/iso639-2/php/code_list.php>
867 * The IANA list of registered languages (hopefully up-to-date),
868 C<http://www.iana.org/assignments/language-tags>
872 Copyright (c) 1998+ Sean M. Burke. All rights reserved.
874 This library is free software; you can redistribute it and/or
875 modify it under the same terms as Perl itself.
877 The programs and documentation in this dist are distributed in
878 the hope that they will be useful, but without any warranty; without
879 even the implied warranty of merchantability or fitness for a
884 Sean M. Burke C<sburke@cpan.org>