cpan/Encode/encoding.pm

   1 # $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $
   2 package encoding;
   3 our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
   4
   5 use Encode;
   6 use strict;
   7 use warnings;
   8
   9 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
  10
  11 BEGIN {
  12     if ( ord("A") == 193 ) {
  13         require Carp;
  14         Carp::croak("encoding: pragma does not support EBCDIC platforms");
  15     }
  16 }
  17
  18 our $HAS_PERLIO = 0;
  19 eval { require PerlIO::encoding };
  20 unless ($@) {
  21     $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
  22 }
  23
  24 sub _exception {
  25     my $name = shift;
  26     $] > 5.008 and return 0;    # 5.8.1 or higher then no
  27     my %utfs = map { $_ => 1 }
  28       qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
  29       UTF-32 UTF-32BE UTF-32LE);
  30     $utfs{$name} or return 0;    # UTFs or no
  31     require Config;
  32     Config->import();
  33     our %Config;
  34     return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
  35 }
  36
  37 sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
  38
  39 sub _get_locale_encoding {
  40     my $locale_encoding;
  41
  42     # I18N::Langinfo isn't available everywhere
  43     eval {
  44         require I18N::Langinfo;
  45         I18N::Langinfo->import(qw(langinfo CODESET));
  46         $locale_encoding = langinfo( CODESET() );
  47     };
  48
  49     my $country_language;
  50
  51     no warnings 'uninitialized';
  52
  53     if ( (not $locale_encoding) && in_locale() ) {
  54         if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
  55             ( $country_language, $locale_encoding ) = ( $1, $2 );
  56         }
  57         elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
  58             ( $country_language, $locale_encoding ) = ( $1, $2 );
  59         }
  60
  61         # LANGUAGE affects only LC_MESSAGES only on glibc
  62     }
  63     elsif ( not $locale_encoding ) {
  64         if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
  65             || $ENV{LANG} =~ /\butf-?8\b/i )
  66         {
  67             $locale_encoding = 'utf8';
  68         }
  69
  70         # Could do more heuristics based on the country and language
  71         # parts of LC_ALL and LANG (the parts before the dot (if any)),
  72         # since we have Locale::Country and Locale::Language available.
  73         # TODO: get a database of Language -> Encoding mappings
  74         # (the Estonian database at http://www.eki.ee/letter/
  75         # would be excellent!) --jhi
  76     }
  77     if (   defined $locale_encoding
  78         && lc($locale_encoding) eq 'euc'
  79         && defined $country_language )
  80     {
  81         if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
  82             $locale_encoding = 'euc-jp';
  83         }
  84         elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
  85             $locale_encoding = 'euc-kr';
  86         }
  87         elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
  88             $locale_encoding = 'euc-cn';
  89         }
  90         elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
  91             $locale_encoding = 'euc-tw';
  92         }
  93         else {
  94             require Carp;
  95             Carp::croak(
  96                 "encoding: Locale encoding '$locale_encoding' too ambiguous"
  97             );
  98         }
  99     }
 100
 101     return $locale_encoding;
 102 }
 103
 104 sub import {
 105     if ($] >= 5.020) {
 106         Carp::croak("encoding: pragma has been removed");
 107     }
 108     if ($] >= 5.017) {
 109         warnings::warnif("deprecated",
 110                          "Use of the encoding pragma is deprecated")
 111     }
 112     my $class = shift;
 113     my $name  = shift;
 114     if (!$name){
 115         require Carp;
 116         Carp::croak("encoding: no encoding specified.");
 117     }
 118     if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
 119         my $caller = caller();
 120         {
 121             no strict 'refs';
 122             *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
 123         }
 124         return;
 125     }
 126     $name = _get_locale_encoding() if $name eq ':locale';
 127     my %arg = @_;
 128     $name = $ENV{PERL_ENCODING} unless defined $name;
 129     my $enc = find_encoding($name);
 130     unless ( defined $enc ) {
 131         require Carp;
 132         Carp::croak("encoding: Unknown encoding '$name'");
 133     }
 134     $name = $enc->name;    # canonize
 135     unless ( $arg{Filter} ) {
 136         DEBUG and warn "_exception($name) = ", _exception($name);
 137         _exception($name) or ${^ENCODING} = $enc;
 138         $HAS_PERLIO or return 1;
 139     }
 140     else {
 141         defined( ${^ENCODING} ) and undef ${^ENCODING};
 142
 143         # implicitly 'use utf8'
 144         require utf8;      # to fetch $utf8::hint_bits;
 145         $^H |= $utf8::hint_bits;
 146         eval {
 147             require Filter::Util::Call;
 148             Filter::Util::Call->import;
 149             filter_add(
 150                 sub {
 151                     my $status = filter_read();
 152                     if ( $status > 0 ) {
 153                         $_ = $enc->decode( $_, 1 );
 154                         DEBUG and warn $_;
 155                     }
 156                     $status;
 157                 }
 158             );
 159         };
 160         $@ eq '' and DEBUG and warn "Filter installed";
 161     }
 162     defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
 163     for my $h (qw(STDIN STDOUT)) {
 164         if ( $arg{$h} ) {
 165             unless ( defined find_encoding( $arg{$h} ) ) {
 166                 require Carp;
 167                 Carp::croak(
 168                     "encoding: Unknown encoding for $h, '$arg{$h}'");
 169             }
 170             eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
 171         }
 172         else {
 173             unless ( exists $arg{$h} ) {
 174                 eval {
 175                     no warnings 'uninitialized';
 176                     binmode( $h, ":raw :encoding($name)" );
 177                 };
 178             }
 179         }
 180         if ($@) {
 181             require Carp;
 182             Carp::croak($@);
 183         }
 184     }
 185     return 1;    # I doubt if we need it, though
 186 }
 187
 188 sub unimport {
 189     no warnings;
 190     undef ${^ENCODING};
 191     if ($HAS_PERLIO) {
 192         binmode( STDIN,  ":raw" );
 193         binmode( STDOUT, ":raw" );
 194     }
 195     else {
 196         binmode(STDIN);
 197         binmode(STDOUT);
 198     }
 199     if ( $INC{"Filter/Util/Call.pm"} ) {
 200         eval { filter_del() };
 201     }
 202 }
 203
 204 1;
 205 __END__
 206
 207 =pod
 208
 209 =head1 NAME
 210
 211 encoding - allows you to write your script in non-ascii or non-utf8
 212
 213 =head1 WARNING
 214
 215 This module is deprecated under perl 5.18.  It uses a mechanism provided by
 216 perl that is deprecated under 5.18 and higher, and may be removed in a
 217 future version.
 218
 219 The easiest and the best alternative is to write your script in UTF-8
 220 and declear:
 221
 222   use utf8; # not use encoding ':utf8';
 223
 224 See L<perluniintro> and L<utf8> for details.
 225
 226 =head1 SYNOPSIS
 227
 228   use encoding "greek";  # Perl like Greek to you?
 229   use encoding "euc-jp"; # Jperl!
 230
 231   # or you can even do this if your shell supports your native encoding
 232
 233   perl -Mencoding=latin2 -e'...' # Feeling centrally European?
 234   perl -Mencoding=euc-kr -e'...' # Or Korean?
 235
 236   # more control
 237
 238   # A simple euc-cn => utf-8 converter
 239   use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
 240
 241   # "no encoding;" supported (but not scoped!)
 242   no encoding;
 243
 244   # an alternate way, Filter
 245   use encoding "euc-jp", Filter=>1;
 246   # now you can use kanji identifiers -- in euc-jp!
 247
 248   # switch on locale -
 249   # note that this probably means that unless you have a complete control
 250   # over the environments the application is ever going to be run, you should
 251   # NOT use the feature of encoding pragma allowing you to write your script
 252   # in any recognized encoding because changing locale settings will wreck
 253   # the script; you can of course still use the other features of the pragma.
 254   use encoding ':locale';
 255
 256 =head1 ABSTRACT
 257
 258 Let's start with a bit of history: Perl 5.6.0 introduced Unicode
 259 support.  You could apply C<substr()> and regexes even to complex CJK
 260 characters -- so long as the script was written in UTF-8.  But back
 261 then, text editors that supported UTF-8 were still rare and many users
 262 instead chose to write scripts in legacy encodings, giving up a whole
 263 new feature of Perl 5.6.
 264
 265 Rewind to the future: starting from perl 5.8.0 with the B<encoding>
 266 pragma, you can write your script in any encoding you like (so long
 267 as the C<Encode> module supports it) and still enjoy Unicode support.
 268 This pragma achieves that by doing the following:
 269
 270 =over
 271
 272 =item *
 273
 274 Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
 275 the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
 276 C<tr///> and C<DATA> pseudo-filehandle are also converted.
 277
 278 =item *
 279
 280 Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
 281  specified.
 282
 283 =back
 284
 285 =head2 Literal Conversions
 286
 287 You can write code in EUC-JP as follows:
 288
 289   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 290                #<-char-><-char->   # 4 octets
 291   s/\bCamel\b/$Rakuda/;
 292
 293 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 294 the code in UTF-8:
 295
 296   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 297   s/\bCamel\b/$Rakuda/;
 298
 299 =head2 PerlIO layers for C<STD(IN|OUT)>
 300
 301 The B<encoding> pragma also modifies the filehandle layers of
 302 STDIN and STDOUT to the specified encoding.  Therefore,
 303
 304   use encoding "euc-jp";
 305   my $message = "Camel is the symbol of perl.\n";
 306   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 307   $message =~ s/\bCamel\b/$Rakuda/;
 308   print $message;
 309
 310 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
 311 not "\x{99F1}\x{99DD} is the symbol of perl.\n".
 312
 313 You can override this by giving extra arguments; see below.
 314
 315 =head2 Implicit upgrading for byte strings
 316
 317 By default, if strings operating under byte semantics and strings
 318 with Unicode character data are concatenated, the new string will
 319 be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
 320
 321 The B<encoding> pragma changes this to use the specified encoding
 322 instead.  For example:
 323
 324     use encoding 'utf8';
 325     my $string = chr(20000); # a Unicode string
 326     utf8::encode($string);   # now it's a UTF-8 encoded byte string
 327     # concatenate with another Unicode string
 328     print length($string . chr(20000));
 329
 330 Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
 331 C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
 332 is three octets when interpreted as Latin-1.
 333
 334 =head2 Side effects
 335
 336 If the C<encoding> pragma is in scope then the lengths returned are
 337 calculated from the length of C<$/> in Unicode characters, which is not
 338 always the same as the length of C<$/> in the native encoding.
 339
 340 This pragma affects utf8::upgrade, but not utf8::downgrade.
 341
 342 =head1 FEATURES THAT REQUIRE 5.8.1
 343
 344 Some of the features offered by this pragma requires perl 5.8.1.  Most
 345 of these are done by Inaba Hiroto.  Any other features and changes
 346 are good for 5.8.0.
 347
 348 =over
 349
 350 =item "NON-EUC" doublebyte encodings
 351
 352 Because perl needs to parse script before applying this pragma, such
 353 encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
 354 \x5c) in the second byte fails because the second byte may
 355 accidentally escape the quoting character that follows.  Perl 5.8.1
 356 or later fixes this problem.
 357
 358 =item tr//
 359
 360 C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
 361 See the section below for details.
 362
 363 =item DATA pseudo-filehandle
 364
 365 Another feature that was overlooked was C<DATA>.
 366
 367 =back
 368
 369 =head1 USAGE
 370
 371 =over 4
 372
 373 =item use encoding [I<ENCNAME>] ;
 374
 375 Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
 376 exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
 377 ":encoding(I<ENCNAME>)".
 378
 379 Note that STDERR WILL NOT be changed.
 380
 381 Also note that non-STD file handles remain unaffected.  Use C<use
 382 open> or C<binmode> to change layers of those.
 383
 384 If no encoding is specified, the environment variable L<PERL_ENCODING>
 385 is consulted.  If no encoding can be found, the error C<Unknown encoding
 386 'I<ENCNAME>'> will be thrown.
 387
 388 =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
 389
 390 You can also individually set encodings of STDIN and STDOUT via the
 391 C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
 392 first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
 393 completely off.
 394
 395 When ${^UNICODE} exists and non-zero, these options will completely
 396 ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
 397 L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
 398 details (perl 5.8.1 and later).
 399
 400 =item use encoding I<ENCNAME> Filter=E<gt>1;
 401
 402 This turns the encoding pragma into a source filter.  While the
 403 default approach just decodes interpolated literals (in qq() and
 404 qr()), this will apply a source filter to the entire source code.  See
 405 L</"The Filter Option"> below for details.
 406
 407 =item no encoding;
 408
 409 Unsets the script encoding. The layers of STDIN, STDOUT are
 410 reset to ":raw" (the default unprocessed raw stream of bytes).
 411
 412 =back
 413
 414 =head1 The Filter Option
 415
 416 The magic of C<use encoding> is not applied to the names of
 417 identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
 418 is a single Han ideograph) work, you still need to write your script
 419 in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
 420
 421 What does this mean?  Your source code behaves as if it is written in
 422 UTF-8 with 'use utf8' in effect.  So even if your editor only supports
 423 Shift_JIS, for example, you can still try examples in Chapter 15 of
 424 C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
 425 identifiers.
 426
 427 This option is significantly slower and (as of this writing) non-ASCII
 428 identifiers are not very stable WITHOUT this option and with the
 429 source code written in UTF-8.
 430
 431 =head2 Filter-related changes at Encode version 1.87
 432
 433 =over
 434
 435 =item *
 436
 437 The Filter option now sets STDIN and STDOUT like non-filter options.
 438 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
 439 non-filter version.
 440
 441 =item *
 442
 443 C<use utf8> is implicitly declared so you no longer have to C<use
 444 utf8> to C<${"\x{4eba}"}++>.
 445
 446 =back
 447
 448 =head1 CAVEATS
 449
 450 =head2 NOT SCOPED
 451
 452 The pragma is a per script, not a per block lexical.  Only the last
 453 C<use encoding> or C<no encoding> matters, and it affects
 454 B<the whole script>.  However, the <no encoding> pragma is supported and
 455 B<use encoding> can appear as many times as you want in a given script.
 456 The multiple use of this pragma is discouraged.
 457
 458 By the same reason, the use this pragma inside modules is also
 459 discouraged (though not as strongly discouraged as the case above.
 460 See below).
 461
 462 If you still have to write a module with this pragma, be very careful
 463 of the load order.  See the codes below;
 464
 465   # called module
 466   package Module_IN_BAR;
 467   use encoding "bar";
 468   # stuff in "bar" encoding here
 469   1;
 470
 471   # caller script
 472   use encoding "foo"
 473   use Module_IN_BAR;
 474   # surprise! use encoding "bar" is in effect.
 475
 476 The best way to avoid this oddity is to use this pragma RIGHT AFTER
 477 other modules are loaded.  i.e.
 478
 479   use Module_IN_BAR;
 480   use encoding "foo";
 481
 482 =head2 DO NOT MIX MULTIPLE ENCODINGS
 483
 484 Notice that only literals (string or regular expression) having only
 485 legacy code points are affected: if you mix data like this
 486
 487     \xDF\x{100}
 488
 489 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 490 encoding.  In other words, this will match in "greek":
 491
 492     "\xDF" =~ /\x{3af}/
 493
 494 but this will not
 495
 496     "\xDF\x{100}" =~ /\x{3af}\x{100}/
 497
 498 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
 499 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
 500 LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
 501 should not be mixing your legacy data and Unicode in the same string.
 502
 503 This pragma also affects encoding of the 0x80..0xFF code point range:
 504 normally characters in that range are left as eight-bit bytes (unless
 505 they are combined with characters with code points 0x100 or larger,
 506 in which case all characters need to become UTF-8 encoded), but if
 507 the C<encoding> pragma is present, even the 0x80..0xFF range always
 508 gets UTF-8 encoded.
 509
 510 After all, the best thing about this pragma is that you don't have to
 511 resort to \x{....} just to spell your name in a native encoding.
 512 So feel free to put your strings in your encoding in quotes and
 513 regexes.
 514
 515 =head2 tr/// with ranges
 516
 517 The B<encoding> pragma works by decoding string literals in
 518 C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
 519 does not apply to C<tr///>.  Therefore,
 520
 521   use encoding 'euc-jp';
 522   #....
 523   $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
 524   #           -------- -------- -------- --------
 525
 526 Does not work as
 527
 528   $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
 529
 530 =over
 531
 532 =item Legend of characters above
 533
 534   utf8     euc-jp   charnames::viacode()
 535   -----------------------------------------
 536   \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
 537   \x{3093} \xA4\xF3 HIRAGANA LETTER N
 538   \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
 539   \x{30f3} \xA5\xF3 KATAKANA LETTER N
 540
 541 =back
 542
 543 This counterintuitive behavior has been fixed in perl 5.8.1.
 544
 545 =head3 workaround to tr///;
 546
 547 In perl 5.8.0, you can work around as follows;
 548
 549   use encoding 'euc-jp';
 550   #  ....
 551   eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
 552
 553 Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
 554 is the same as classic idiom that makes C<tr///> 'interpolate'.
 555
 556    tr/$from/$to/;            # wrong!
 557    eval qq{ tr/$from/$to/ }; # workaround.
 558
 559 Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
 560 C<tr///> not being decoded was obviously against the will of Perl5
 561 Porters so it has been fixed in Perl 5.8.1 or later.
 562
 563 =head1 EXAMPLE - Greekperl
 564
 565     use encoding "iso 8859-7";
 566
 567     # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
 568
 569     $a = "\xDF";
 570     $b = "\x{100}";
 571
 572     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 573
 574     $c = $a . $b;
 575
 576     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 577
 578     # chr() is affected, and ...
 579
 580     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 581
 582     # ... ord() is affected by the encoding pragma ...
 583
 584     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 585
 586     # ... as are eq and cmp ...
 587
 588     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 589     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 590
 591     # ... but pack/unpack C are not affected, in case you still
 592     # want to go back to your native encoding
 593
 594     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 595
 596 =head1 KNOWN PROBLEMS
 597
 598 =over
 599
 600 =item literals in regex that are longer than 127 bytes
 601
 602 For native multibyte encodings (either fixed or variable length),
 603 the current implementation of the regular expressions may introduce
 604 recoding errors for regular expression literals longer than 127 bytes.
 605
 606 =item EBCDIC
 607
 608 The encoding pragma is not supported on EBCDIC platforms.
 609 (Porters who are willing and able to remove this limitation are
 610 welcome.)
 611
 612 =item format
 613
 614 This pragma doesn't work well with format because PerlIO does not
 615 get along very well with it.  When format contains non-ascii
 616 characters it prints funny or gets "wide character warnings".
 617 To understand it, try the code below.
 618
 619   # Save this one in utf8
 620   # replace *non-ascii* with a non-ascii string
 621   my $camel;
 622   format STDOUT =
 623   *non-ascii*@>>>>>>>
 624   $camel
 625   .
 626   $camel = "*non-ascii*";
 627   binmode(STDOUT=>':encoding(utf8)'); # bang!
 628   write;              # funny
 629   print $camel, "\n"; # fine
 630
 631 Without binmode this happens to work but without binmode, print()
 632 fails instead of write().
 633
 634 At any rate, the very use of format is questionable when it comes to
 635 unicode characters since you have to consider such things as character
 636 width (i.e. double-width for ideographs) and directions (i.e. BIDI for
 637 Arabic and Hebrew).
 638
 639 =item Thread safety
 640
 641 C<use encoding ...> is not thread-safe (i.e., do not use in threaded
 642 applications).
 643
 644 =back
 645
 646 =head2 The Logic of :locale
 647
 648 The logic of C<:locale> is as follows:
 649
 650 =over 4
 651
 652 =item 1.
 653
 654 If the platform supports the langinfo(CODESET) interface, the codeset
 655 returned is used as the default encoding for the open pragma.
 656
 657 =item 2.
 658
 659 If 1. didn't work but we are under the locale pragma, the environment
 660 variables LC_ALL and LANG (in that order) are matched for encodings
 661 (the part after C<.>, if any), and if any found, that is used
 662 as the default encoding for the open pragma.
 663
 664 =item 3.
 665
 666 If 1. and 2. didn't work, the environment variables LC_ALL and LANG
 667 (in that order) are matched for anything looking like UTF-8, and if
 668 any found, C<:utf8> is used as the default encoding for the open
 669 pragma.
 670
 671 =back
 672
 673 If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
 674 contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
 675 the default encoding of your STDIN, STDOUT, and STDERR, and of
 676 B<any subsequent file open>, is UTF-8.
 677
 678 =head1 HISTORY
 679
 680 This pragma first appeared in Perl 5.8.0.  For features that require
 681 5.8.1 and better, see above.
 682
 683 The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
 684
 685 =head1 SEE ALSO
 686
 687 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
 688
 689 Ch. 15 of C<Programming Perl (3rd Edition)>
 690 by Larry Wall, Tom Christiansen, Jon Orwant;
 691 O'Reilly & Associates; ISBN 0-596-00027-8
 692
 693 =cut