cpan/Encode/encoding.pm

   1 # $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $
   2 package encoding;
   3 our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
   4
   5 use Encode;
   6 use strict;
   7 use warnings;
   8
   9 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
  10
  11 BEGIN {
  12     if ( ord("A") == 193 ) {
  13         require Carp;
  14         Carp::croak("encoding: pragma does not support EBCDIC platforms");
  15     }
  16 }
  17
  18 our $HAS_PERLIO = 0;
  19 eval { require PerlIO::encoding };
  20 unless ($@) {
  21     $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
  22 }
  23
  24 sub _exception {
  25     my $name = shift;
  26     $] > 5.008 and return 0;    # 5.8.1 or higher then no
  27     my %utfs = map { $_ => 1 }
  28       qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
  29       UTF-32 UTF-32BE UTF-32LE);
  30     $utfs{$name} or return 0;    # UTFs or no
  31     require Config;
  32     Config->import();
  33     our %Config;
  34     return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
  35 }
  36
  37 sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
  38
  39 sub _get_locale_encoding {
  40     my $locale_encoding;
  41
  42     # I18N::Langinfo isn't available everywhere
  43     eval {
  44         require I18N::Langinfo;
  45         I18N::Langinfo->import(qw(langinfo CODESET));
  46         $locale_encoding = langinfo( CODESET() );
  47     };
  48
  49     my $country_language;
  50
  51     no warnings 'uninitialized';
  52
  53     if ( (not $locale_encoding) && in_locale() ) {
  54         if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
  55             ( $country_language, $locale_encoding ) = ( $1, $2 );
  56         }
  57         elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
  58             ( $country_language, $locale_encoding ) = ( $1, $2 );
  59         }
  60
  61         # LANGUAGE affects only LC_MESSAGES only on glibc
  62     }
  63     elsif ( not $locale_encoding ) {
  64         if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
  65             || $ENV{LANG} =~ /\butf-?8\b/i )
  66         {
  67             $locale_encoding = 'utf8';
  68         }
  69
  70         # Could do more heuristics based on the country and language
  71         # parts of LC_ALL and LANG (the parts before the dot (if any)),
  72         # since we have Locale::Country and Locale::Language available.
  73         # TODO: get a database of Language -> Encoding mappings
  74         # (the Estonian database at http://www.eki.ee/letter/
  75         # would be excellent!) --jhi
  76     }
  77     if (   defined $locale_encoding
  78         && lc($locale_encoding) eq 'euc'
  79         && defined $country_language )
  80     {
  81         if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
  82             $locale_encoding = 'euc-jp';
  83         }
  84         elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
  85             $locale_encoding = 'euc-kr';
  86         }
  87         elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
  88             $locale_encoding = 'euc-cn';
  89         }
  90         elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
  91             $locale_encoding = 'euc-tw';
  92         }
  93         else {
  94             require Carp;
  95             Carp::croak(
  96                 "encoding: Locale encoding '$locale_encoding' too ambiguous"
  97             );
  98         }
  99     }
 100
 101     return $locale_encoding;
 102 }
 103
 104 sub import {
 105     if ($] >= 5.017) {
 106         warnings::warnif("deprecated",
 107                          "Use of the encoding pragma is deprecated")
 108     }
 109     my $class = shift;
 110     my $name  = shift;
 111     if (!$name){
 112         require Carp;
 113         Carp::croak("encoding: no encoding specified.");
 114     }
 115     if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
 116         my $caller = caller();
 117         {
 118             no strict 'refs';
 119             *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
 120         }
 121         return;
 122     }
 123     $name = _get_locale_encoding() if $name eq ':locale';
 124     my %arg = @_;
 125     $name = $ENV{PERL_ENCODING} unless defined $name;
 126     my $enc = find_encoding($name);
 127     unless ( defined $enc ) {
 128         require Carp;
 129         Carp::croak("encoding: Unknown encoding '$name'");
 130     }
 131     $name = $enc->name;    # canonize
 132     unless ( $arg{Filter} ) {
 133         DEBUG and warn "_exception($name) = ", _exception($name);
 134         if (! _exception($name)) {
 135             if (!$^V || $^V lt v5.21.7) {
 136                 ${^ENCODING} = $enc;
 137             }
 138             else {
 139                 # Starting with 5.21.7, this pragma uses a shadow variable
 140                 # designed explicitly for it, ${^E_NCODING}, to enforce
 141                 # lexical scope; instead of ${^ENCODING}.
 142                 $^H{'encoding'} = 1;
 143                 ${^E_NCODING} = $enc;
 144             }
 145         }
 146         $HAS_PERLIO or return 1;
 147     }
 148     else {
 149         defined( ${^ENCODING} ) and undef ${^ENCODING};
 150         undef ${^E_NCODING} if $^V && $^V ge v5.21.7;
 151
 152         # implicitly 'use utf8'
 153         require utf8;      # to fetch $utf8::hint_bits;
 154         $^H |= $utf8::hint_bits;
 155         eval {
 156             require Filter::Util::Call;
 157             Filter::Util::Call->import;
 158             filter_add(
 159                 sub {
 160                     my $status = filter_read();
 161                     if ( $status > 0 ) {
 162                         $_ = $enc->decode( $_, 1 );
 163                         DEBUG and warn $_;
 164                     }
 165                     $status;
 166                 }
 167             );
 168         };
 169         $@ eq '' and DEBUG and warn "Filter installed";
 170     }
 171     defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
 172     for my $h (qw(STDIN STDOUT)) {
 173         if ( $arg{$h} ) {
 174             unless ( defined find_encoding( $arg{$h} ) ) {
 175                 require Carp;
 176                 Carp::croak(
 177                     "encoding: Unknown encoding for $h, '$arg{$h}'");
 178             }
 179             eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
 180         }
 181         else {
 182             unless ( exists $arg{$h} ) {
 183                 eval {
 184                     no warnings 'uninitialized';
 185                     binmode( $h, ":raw :encoding($name)" );
 186                 };
 187             }
 188         }
 189         if ($@) {
 190             require Carp;
 191             Carp::croak($@);
 192         }
 193     }
 194     return 1;    # I doubt if we need it, though
 195 }
 196
 197 sub unimport {
 198     no warnings;
 199     undef ${^ENCODING};
 200     undef ${^E_NCODING} if $^V && $^V ge v5.21.7;
 201     if ($HAS_PERLIO) {
 202         binmode( STDIN,  ":raw" );
 203         binmode( STDOUT, ":raw" );
 204     }
 205     else {
 206         binmode(STDIN);
 207         binmode(STDOUT);
 208     }
 209     if ( $INC{"Filter/Util/Call.pm"} ) {
 210         eval { filter_del() };
 211     }
 212 }
 213
 214 1;
 215 __END__
 216
 217 =pod
 218
 219 =head1 NAME
 220
 221 encoding - allows you to write your script in non-ASCII and non-UTF-8
 222
 223 =head1 WARNING
 224
 225 This module has been deprecated since perl v5.18.  See L</DESCRIPTION> and
 226 L</BUGS>.
 227
 228 =head1 SYNOPSIS
 229
 230   use encoding "greek";  # Perl like Greek to you?
 231   use encoding "euc-jp"; # Jperl!
 232
 233   # or you can even do this if your shell supports your native encoding
 234
 235   perl -Mencoding=latin2 -e'...' # Feeling centrally European?
 236   perl -Mencoding=euc-kr -e'...' # Or Korean?
 237
 238   # more control
 239
 240   # A simple euc-cn => utf-8 converter
 241   use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
 242
 243   # "no encoding;" supported
 244   no encoding;
 245
 246   # an alternate way, Filter
 247   use encoding "euc-jp", Filter=>1;
 248   # now you can use kanji identifiers -- in euc-jp!
 249
 250   # encode based on the current locale - specialized purposes only;
 251   # fraught with danger!!
 252   use encoding ':locale';
 253
 254 =head1 DESCRIPTION
 255
 256 This pragma is used to enable a Perl script to be written in encodings that
 257 aren't strictly ASCII nor UTF-8.  It translates all or portions of the Perl
 258 program script from a given encoding into UTF-8, and changes the PerlIO layers
 259 of C<STDIN> and C<STDOUT> to the encoding specified.
 260
 261 This pragma dates from the days when UTF-8-enabled editors were uncommon.  But
 262 that was long ago, and the need for it is greatly diminished.  That, coupled
 263 with the fact that it doesn't work with threads, along with other problems,
 264 (see L</BUGS>) have led to its being deprecated.  It is planned to remove this
 265 pragma in a future Perl version.  New code should be written in UTF-8, and the
 266 C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
 267 Old code should be converted to UTF-8, via something like the recipe in the
 268 L</SYNOPSIS> (though this simple approach may require manual adjustments
 269 afterwards).
 270
 271 The only legitimate use of this pragma is almost certainly just one per file,
 272 near the top, with file scope, as the file is likely going to only be written
 273 in one encoding.  Further restrictions apply in Perls before v5.22 (see
 274 L</Prior to Perl v5.22>).
 275
 276 There are two basic modes of operation (plus turning it off):
 277
 278 =over 4
 279
 280 =item C<use encoding ['I<ENCNAME>'] ;>
 281
 282 This is the normal operation.  It translates various literals encountered in
 283 the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
 284 converts character code points.  This is used when the script is a combination
 285 of ASCII (for the variable names and punctuation, I<etc>), but the literal
 286 data is in the specified encoding.
 287
 288 I<ENCNAME> is optional.  If omitted, the encoding specified in the environment
 289 variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used.  If this isn't
 290 set, or the resolved-to encoding is not known to C<L<Encode>>, the error
 291 C<Unknown encoding 'I<ENCNAME>'> will be thrown.
 292
 293 Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
 294 name C<:locale>.  This is for very specialized applications, and is documented
 295 in L</The C<:locale> sub-pragma> below.
 296
 297 The literals that are converted are C<q//, qq//, qr//, qw//, qx//>, and
 298 starting in v5.8.1, C<tr///>.  Operations that do conversions include C<chr>,
 299 C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.
 300
 301 Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
 302 encoding into UTF-8.
 303
 304 For example, you can write code in EUC-JP as follows:
 305
 306   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 307                #<-char-><-char->   # 4 octets
 308   s/\bCamel\b/$Rakuda/;
 309
 310 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 311 that code in UTF-8:
 312
 313   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 314   s/\bCamel\b/$Rakuda/;
 315
 316 See L</EXAMPLE> below for a more complete example.
 317
 318 Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
 319 PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
 320 Therefore,
 321
 322   use encoding "euc-jp";
 323   my $message = "Camel is the symbol of perl.\n";
 324   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 325   $message =~ s/\bCamel\b/$Rakuda/;
 326   print $message;
 327
 328 will print
 329
 330  "\xF1\xD1\xF1\xCC is the symbol of perl.\n"
 331
 332 not
 333
 334  "\x{99F1}\x{99DD} is the symbol of perl.\n"
 335
 336 You can override this by giving extra arguments; see below.
 337
 338 Note that C<STDERR> WILL NOT be changed, regardless.
 339
 340 Also note that non-STD file handles remain unaffected.  Use C<use
 341 open> or C<binmode> to change the layers of those.
 342
 343 =item C<use encoding I<ENCNAME> Filter=E<gt>1;>
 344
 345 This operates as above, but the C<Filter> argument with a non-zero
 346 value causes the entire script, and not just literals, to be translated from
 347 the encoding into UTF-8.  This allows identifiers in the source to be in that
 348 encoding as well.  (Problems may occur if the encoding is not a superset of
 349 ASCII; imagine all your semi-colons being translated into something
 350 different.)  One can use this form to make
 351
 352  ${"\x{4eba}"}++
 353
 354 work.  (This is equivalent to C<$I<human>++>, where I<human> is a single Han
 355 ideograph).
 356
 357 This effectively means that your source code behaves as if it were written in
 358 UTF-8 with C<use utf8> in effect.  So even if your editor only supports
 359 Shift_JIS, for example, you can still try examples in Chapter 15 of
 360 C<Programming Perl, 3rd Ed.>.
 361
 362 This option is significantly slower than the other one.
 363
 364 =item C<no encoding;>
 365
 366 Unsets the script encoding. The layers of C<STDIN>, C<STDOUT> are
 367 reset to "C<:raw>" (the default unprocessed raw stream of bytes).
 368
 369 =back
 370
 371 =head1 OPTIONS
 372
 373 =head2 Setting C<STDIN> and/or C<STDOUT> individually
 374
 375 The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
 376 the pragma:
 377
 378  use encoding 'euc-tw', STDIN => 'greek'  ...;
 379
 380 In this case, you cannot omit the first I<ENCNAME>.  C<< STDIN => undef >>
 381 turns the I/O transcoding completely off for that filehandle.
 382
 383 When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
 384 these options will be completely ignored.  See L<perlvar/C<${^UNICODE}>> and
 385 L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.
 386
 387 =head2 The C<:locale> sub-pragma
 388
 389 Starting in v5.8.6, the encoding name may be C<:locale>.  This means that the
 390 encoding is taken from the current locale, and not hard-coded by the pragma.
 391 Since a script really can only be encoded in exactly one encoding, this option
 392 is dangerous.  It makes sense only if the script itself is written in ASCII,
 393 and all the possible locales that will be in use when the script is executed
 394 are supersets of ASCII.  That means that the script itself doesn't get
 395 changed, but the I/O handles have the specified encoding added, and the
 396 operations like C<chr> and C<ord> use that encoding.
 397
 398 The logic of finding which locale C<:locale> uses is as follows:
 399
 400 =over 4
 401
 402 =item 1.
 403
 404 If the platform supports the C<langinfo(CODESET)> interface, the codeset
 405 returned is used as the default encoding for the open pragma.
 406
 407 =item 2.
 408
 409 If 1. didn't work but we are under the locale pragma, the environment
 410 variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
 411 (the part after "C<.>", if any), and if any found, that is used
 412 as the default encoding for the open pragma.
 413
 414 =item 3.
 415
 416 If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
 417 (in that order) are matched for anything looking like UTF-8, and if
 418 any found, C<:utf8> is used as the default encoding for the open
 419 pragma.
 420
 421 =back
 422
 423 If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
 424 contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
 425 the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
 426 B<any subsequent file open>, is UTF-8.
 427
 428 =head1 CAVEATS
 429
 430 =head2 SIDE EFFECTS
 431
 432 =over
 433
 434 =item *
 435
 436 If the C<encoding> pragma is in scope then the lengths returned by C<chomp> are
 437 calculated from the length of C<$/> in Unicode characters, which is not
 438 always the same as the length of C<$/> in the native encoding.
 439
 440 =item *
 441
 442 Without this pragma, if strings operating under byte semantics and strings
 443 with Unicode character data are concatenated, the new string will
 444 be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
 445
 446 The B<encoding> pragma changes this to use the specified encoding
 447 instead.  For example:
 448
 449     use encoding 'utf8';
 450     my $string = chr(20000); # a Unicode string
 451     utf8::encode($string);   # now it's a UTF-8 encoded byte string
 452     # concatenate with another Unicode string
 453     print length($string . chr(20000));
 454
 455 Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
 456 C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
 457 is three octets when interpreted as Latin-1.
 458
 459 =back
 460
 461 =head2 DO NOT MIX MULTIPLE ENCODINGS
 462
 463 Notice that only literals (string or regular expression) having only
 464 legacy code points are affected: if you mix data like this
 465
 466     \x{100}\xDF
 467     \xDF\x{100}
 468
 469 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 470 encoding.  In other words, this will match in "greek":
 471
 472     "\xDF" =~ /\x{3af}/
 473
 474 but this will not
 475
 476     "\xDF\x{100}" =~ /\x{3af}\x{100}/
 477
 478 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
 479 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
 480 LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
 481 should not be mixing your legacy data and Unicode in the same string.
 482
 483 This pragma also affects encoding of the 0x80..0xFF code point range:
 484 normally characters in that range are left as eight-bit bytes (unless
 485 they are combined with characters with code points 0x100 or larger,
 486 in which case all characters need to become UTF-8 encoded), but if
 487 the C<encoding> pragma is present, even the 0x80..0xFF range always
 488 gets UTF-8 encoded.
 489
 490 After all, the best thing about this pragma is that you don't have to
 491 resort to \x{....} just to spell your name in a native encoding.
 492 So feel free to put your strings in your encoding in quotes and
 493 regexes.
 494
 495 =head2 Prior to Perl v5.22
 496
 497 The pragma was a per script, not a per block lexical.  Only the last
 498 C<use encoding> or C<no encoding> mattered, and it affected
 499 B<the whole script>.  However, the C<no encoding> pragma was supported and
 500 C<use encoding> could appear as many times as you want in a given script
 501 (though only the last was effective).
 502
 503 Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
 504 were affected.  This leads to spooky, incorrect action at a distance that is
 505 hard to debug.
 506
 507 This means you would have to be very careful of the load order:
 508
 509   # called module
 510   package Module_IN_BAR;
 511   use encoding "bar";
 512   # stuff in "bar" encoding here
 513   1;
 514
 515   # caller script
 516   use encoding "foo";
 517   use Module_IN_BAR;
 518   # surprise! use encoding "bar" is in effect.
 519
 520 The best way to avoid this oddity is to use this pragma RIGHT AFTER
 521 other modules are loaded.  i.e.
 522
 523   use Module_IN_BAR;
 524   use encoding "foo";
 525
 526 =head2 Prior to Encode version 1.87
 527
 528 =over
 529
 530 =item *
 531
 532 C<STDIN> and C<STDOUT> were not set under the filter option.
 533 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work as in
 534 the non-filter version.
 535
 536 =item *
 537
 538 C<use utf8> wasn't implicitly declared so you have to C<use utf8> to do
 539
 540  ${"\x{4eba}"}++
 541
 542 =back
 543
 544 =head2 Prior to Perl v5.8.1
 545
 546 =over
 547
 548 =item "NON-EUC" doublebyte encodings
 549
 550 Because perl needs to parse the script before applying this pragma, such
 551 encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
 552 C<\x5c>) in the second byte fail because the second byte may
 553 accidentally escape the quoting character that follows.
 554
 555 =item C<tr///>
 556
 557 The B<encoding> pragma works by decoding string literals in
 558 C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl v5.8.0, this
 559 does not apply to C<tr///>.  Therefore,
 560
 561   use encoding 'euc-jp';
 562   #....
 563   $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
 564   #           -------- -------- -------- --------
 565
 566 Does not work as
 567
 568   $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
 569
 570 =over
 571
 572 =item Legend of characters above
 573
 574   utf8     euc-jp   charnames::viacode()
 575   -----------------------------------------
 576   \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
 577   \x{3093} \xA4\xF3 HIRAGANA LETTER N
 578   \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
 579   \x{30f3} \xA5\xF3 KATAKANA LETTER N
 580
 581 =back
 582
 583 This counterintuitive behavior has been fixed in perl v5.8.1.
 584
 585 In perl v5.8.0, you can work around this as follows:
 586
 587   use encoding 'euc-jp';
 588   #  ....
 589   eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
 590
 591 Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
 592 this is the same as the classic idiom that makes C<tr///> 'interpolate':
 593
 594    tr/$from/$to/;            # wrong!
 595    eval qq{ tr/$from/$to/ }; # workaround.
 596
 597 =back
 598
 599 =head1 EXAMPLE - Greekperl
 600
 601     use encoding "iso 8859-7";
 602
 603     # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
 604
 605     $a = "\xDF";
 606     $b = "\x{100}";
 607
 608     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 609
 610     $c = $a . $b;
 611
 612     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 613
 614     # chr() is affected, and ...
 615
 616     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 617
 618     # ... ord() is affected by the encoding pragma ...
 619
 620     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 621
 622     # ... as are eq and cmp ...
 623
 624     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 625     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 626
 627     # ... but pack/unpack C are not affected, in case you still
 628     # want to go back to your native encoding
 629
 630     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 631
 632 =head1 BUGS
 633
 634 =over
 635
 636 =item Thread safety
 637
 638 C<use encoding ...> is not thread-safe (i.e., do not use in threaded
 639 applications).
 640
 641 =item Can't be used by more than one module in a single program.
 642
 643 Only one encoding is allowed.  If you combine modules in a program that have
 644 different encodings, only one will be actually used.
 645
 646 =item Other modules using C<STDIN> and C<STDOUT> get the encoded stream
 647
 648 They may be expecting something completely different.
 649
 650 =item literals in regex that are longer than 127 bytes
 651
 652 For native multibyte encodings (either fixed or variable length),
 653 the current implementation of the regular expressions may introduce
 654 recoding errors for regular expression literals longer than 127 bytes.
 655
 656 =item EBCDIC
 657
 658 The encoding pragma is not supported on EBCDIC platforms.
 659
 660 =item C<format>
 661
 662 This pragma doesn't work well with C<format> because PerlIO does not
 663 get along very well with it.  When C<format> contains non-ASCII
 664 characters it prints funny or gets "wide character warnings".
 665 To understand it, try the code below.
 666
 667   # Save this one in utf8
 668   # replace *non-ascii* with a non-ascii string
 669   my $camel;
 670   format STDOUT =
 671   *non-ascii*@>>>>>>>
 672   $camel
 673   .
 674   $camel = "*non-ascii*";
 675   binmode(STDOUT=>':encoding(utf8)'); # bang!
 676   write;              # funny
 677   print $camel, "\n"; # fine
 678
 679 Without the binmode call, write() happens to work, but then print()
 680 fails instead.
 681
 682 At any rate, the very use of C<format> is questionable when it comes to
 683 unicode characters since you have to consider such things as character
 684 width (i.e. double-width for ideographs) and directions (i.e. BIDI for
 685 Arabic and Hebrew).
 686
 687 =item See also L</CAVEATS>
 688
 689 =back
 690
 691 =head1 HISTORY
 692
 693 This pragma first appeared in Perl v5.8.0.  It has been enhanced in later
 694 releases as specified above.
 695
 696 =head1 SEE ALSO
 697
 698 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
 699
 700 Ch. 15 of C<Programming Perl (3rd Edition)>
 701 by Larry Wall, Tom Christiansen, Jon Orwant;
 702 O'Reilly & Associates; ISBN 0-596-00027-8
 703
 704 =cut