1 # $Id: encoding.pm,v 2.12 2013/04/26 18:30:46 dankogai Exp $
3 our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
9 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
12 if ( ord("A") == 193 ) {
14 Carp::croak("encoding: pragma does not support EBCDIC platforms");
19 eval { require PerlIO::encoding };
21 $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
26 $] > 5.008 and return 0; # 5.8.1 or higher then no
27 my %utfs = map { $_ => 1 }
28 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
29 UTF-32 UTF-32BE UTF-32LE);
30 $utfs{$name} or return 0; # UTFs or no
34 return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
37 sub in_locale { $^H & ( $locale::hint_bits || 0 ) } # bitwise AND of the $^H hints with $locale::hint_bits; 0 when $locale::hint_bits is unset
39 sub _get_locale_encoding {
42 # I18N::Langinfo isn't available everywhere
44 require I18N::Langinfo;
45 I18N::Langinfo->import(qw(langinfo CODESET));
46 $locale_encoding = langinfo( CODESET() );
51 no warnings 'uninitialized';
53 if ( (not $locale_encoding) && in_locale() ) {
54 if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
55 ( $country_language, $locale_encoding ) = ( $1, $2 );
57 elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
58 ( $country_language, $locale_encoding ) = ( $1, $2 );
61 # LANGUAGE affects only LC_MESSAGES only on glibc
63 elsif ( not $locale_encoding ) {
64 if ( $ENV{LC_ALL} =~ /\butf-?8\b/i
65 || $ENV{LANG} =~ /\butf-?8\b/i )
67 $locale_encoding = 'utf8';
70 # Could do more heuristics based on the country and language
71 # parts of LC_ALL and LANG (the parts before the dot (if any)),
72 # since we have Locale::Country and Locale::Language available.
73 # TODO: get a database of Language -> Encoding mappings
74 # (the Estonian database at http://www.eki.ee/letter/
75 # would be excellent!) --jhi
77 if ( defined $locale_encoding
78 && lc($locale_encoding) eq 'euc'
79 && defined $country_language )
81 if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
82 $locale_encoding = 'euc-jp';
84 elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
85 $locale_encoding = 'euc-kr';
87 elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
88 $locale_encoding = 'euc-cn';
90 elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
91 $locale_encoding = 'euc-tw';
96 "encoding: Locale encoding '$locale_encoding' too ambiguous"
101 return $locale_encoding;
106 warnings::warnif("deprecated",
107 "Use of the encoding pragma is deprecated")
113 Carp::croak("encoding: no encoding specified.");
115 if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm
116 my $caller = caller();
119 *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
123 $name = _get_locale_encoding() if $name eq ':locale';
125 $name = $ENV{PERL_ENCODING} unless defined $name;
126 my $enc = find_encoding($name);
127 unless ( defined $enc ) {
129 Carp::croak("encoding: Unknown encoding '$name'");
131 $name = $enc->name; # canonize
132 unless ( $arg{Filter} ) {
133 DEBUG and warn "_exception($name) = ", _exception($name);
134 if (! _exception($name)) {
135 if (!$^V || $^V lt v5.21.7) {
139 # Starting with 5.21.7, this pragma uses a shadow variable
140 # designed explicitly for it, ${^E_NCODING}, to enforce
141 # lexical scope; instead of ${^ENCODING}.
143 ${^E_NCODING} = $enc;
146 $HAS_PERLIO or return 1;
149 defined( ${^ENCODING} ) and undef ${^ENCODING};
150 undef ${^E_NCODING} if $^V && $^V ge v5.21.7;
152 # implicitly 'use utf8'
153 require utf8; # to fetch $utf8::hint_bits;
154 $^H |= $utf8::hint_bits;
156 require Filter::Util::Call;
157 Filter::Util::Call->import;
160 my $status = filter_read();
162 $_ = $enc->decode( $_, 1 );
169 $@ eq '' and DEBUG and warn "Filter installed";
171 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
172 for my $h (qw(STDIN STDOUT)) {
174 unless ( defined find_encoding( $arg{$h} ) ) {
177 "encoding: Unknown encoding for $h, '$arg{$h}'");
179 eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
182 unless ( exists $arg{$h} ) {
184 no warnings 'uninitialized';
185 binmode( $h, ":raw :encoding($name)" );
194 return 1; # I doubt if we need it, though
200 undef ${^E_NCODING} if $^V && $^V ge v5.21.7;
202 binmode( STDIN, ":raw" );
203 binmode( STDOUT, ":raw" );
209 if ( $INC{"Filter/Util/Call.pm"} ) {
210 eval { filter_del() };
221 encoding - allows you to write your script in non-ASCII and non-UTF-8
225 This module has been deprecated since perl v5.18. See L</DESCRIPTION> and
230 use encoding "greek"; # Perl like Greek to you?
231 use encoding "euc-jp"; # Jperl!
233 # or you can even do this if your shell supports your native encoding
235 perl -Mencoding=latin2 -e'...' # Feeling centrally European?
236 perl -Mencoding=euc-kr -e'...' # Or Korean?
240 # A simple euc-cn => utf-8 converter
241 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
243 # "no encoding;" supported
246 # an alternate way, Filter
247 use encoding "euc-jp", Filter=>1;
248 # now you can use kanji identifiers -- in euc-jp!
250 # encode based on the current locale - specialized purposes only;
251 # fraught with danger!!
252 use encoding ':locale';
256 This pragma is used to enable a Perl script to be written in encodings that
257 aren't strictly ASCII nor UTF-8. It translates all or portions of the Perl
258 program script from a given encoding into UTF-8, and changes the PerlIO layers
259 of C<STDIN> and C<STDOUT> to the encoding specified.
261 This pragma dates from the days when UTF-8-enabled editors were uncommon. But
262 that was long ago, and the need for it is greatly diminished. That, coupled
263 with the fact that it doesn't work with threads, along with other problems,
264 (see L</BUGS>) have led to its being deprecated. It is planned to remove this
265 pragma in a future Perl version. New code should be written in UTF-8, and the
266 C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
267 Old code should be converted to UTF-8, via something like the recipe in the
268 L</SYNOPSIS> (though this simple approach may require manual adjustments
271 The only legitimate use of this pragma is almost certainly just one per file,
272 near the top, with file scope, as the file is likely going to only be written
273 in one encoding. Further restrictions apply in Perls before v5.22 (see
274 L</Prior to Perl v5.22>).
276 There are two basic modes of operation (plus turning it off):
280 =item C<use encoding ['I<ENCNAME>'] ;>
282 This is the normal operation. It translates various literals encountered in
283 the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
284 converts character code points. This is used when the script is a combination
285 of ASCII (for the variable names and punctuation, I<etc>), but the literal
286 data is in the specified encoding.
288 I<ENCNAME> is optional. If omitted, the encoding specified in the environment
289 variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used. If this isn't
290 set, or the resolved-to encoding is not known to C<L<Encode>>, the error
291 C<Unknown encoding 'I<ENCNAME>'> will be thrown.
293 Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
294 name C<:locale>. This is for very specialized applications, and is documented
295 in L</The C<:locale> sub-pragma> below.
297 The literals that are converted are C<q//, qq//, qr//, qw//, qx//>, and
298 starting in v5.8.1, C<tr///>. Operations that do conversions include C<chr>,
299 C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.
301 Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
304 For example, you can write code in EUC-JP as follows:
306 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
307 #<-char-><-char-> # 4 octets
308 s/\bCamel\b/$Rakuda/;
310 And with C<use encoding "euc-jp"> in effect, it is the same thing as
313 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
314 s/\bCamel\b/$Rakuda/;
316 See L</EXAMPLE> below for a more complete example.
318 Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
319 PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
322 use encoding "euc-jp";
323 my $message = "Camel is the symbol of perl.\n";
324 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
325 $message =~ s/\bCamel\b/$Rakuda/;
330 "\xF1\xD1\xF1\xCC is the symbol of perl.\n"
334 "\x{99F1}\x{99DD} is the symbol of perl.\n"
336 You can override this by giving extra arguments; see below.
338 Note that C<STDERR> WILL NOT be changed, regardless.
340 Also note that non-STD file handles remain unaffected. Use C<use
341 open> or C<binmode> to change the layers of those.
343 =item C<use encoding I<ENCNAME> Filter=E<gt>1;>
345 This operates as above, but the C<Filter> argument with a non-zero
346 value causes the entire script, and not just literals, to be translated from
347 the encoding into UTF-8. This allows identifiers in the source to be in that
348 encoding as well. (Problems may occur if the encoding is not a superset of
349 ASCII; imagine all your semi-colons being translated into something
350 different.) One can use this form to make
354 work. (This is equivalent to C<$I<human>++>, where I<human> is a single Han
357 This effectively means that your source code behaves as if it were written in
358 UTF-8 with C<use utf8> in effect. So even if your editor only supports
359 Shift_JIS, for example, you can still try examples in Chapter 15 of
360 C<Programming Perl, 3rd Ed.>.
362 This option is significantly slower than the other one.
364 =item C<no encoding;>
366 Unsets the script encoding. The layers of C<STDIN> and C<STDOUT> are
367 reset to "C<:raw>" (the default unprocessed raw stream of bytes).
373 =head2 Setting C<STDIN> and/or C<STDOUT> individually
375 The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
378 use encoding 'euc-tw', STDIN => 'greek' ...;
380 In this case, you cannot omit the first I<ENCNAME>. C<< STDIN => undef >>
381 turns the I/O transcoding completely off for that filehandle.
383 When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
384 these options will be completely ignored. See L<perlvar/C<${^UNICODE}>> and
385 L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.
387 =head2 The C<:locale> sub-pragma
389 Starting in v5.8.6, the encoding name may be C<:locale>. This means that the
390 encoding is taken from the current locale, and not hard-coded by the pragma.
391 Since a script really can only be encoded in exactly one encoding, this option
392 is dangerous. It makes sense only if the script itself is written in ASCII,
393 and all the possible locales that will be in use when the script is executed
394 are supersets of ASCII. That means that the script itself doesn't get
395 changed, but the I/O handles have the specified encoding added, and the
396 operations like C<chr> and C<ord> use that encoding.
398 The logic of finding which locale C<:locale> uses is as follows:
404 If the platform supports the C<langinfo(CODESET)> interface, the codeset
405 returned is used as the default encoding for the open pragma.
409 If 1. didn't work but we are under the locale pragma, the environment
410 variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
411 (the part after "C<.>", if any), and if any found, that is used
412 as the default encoding for the open pragma.
416 If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
417 (in that order) are matched for anything looking like UTF-8, and if
418 any found, C<:utf8> is used as the default encoding for the open
423 If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
424 contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
425 the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
426 B<any subsequent file open>, is UTF-8.
436 If the C<encoding> pragma is in scope then the lengths returned are
437 calculated from the length of C<$/> in Unicode characters, which is not
438 always the same as the length of C<$/> in the native encoding.
442 Without this pragma, if strings operating under byte semantics and strings
443 with Unicode character data are concatenated, the new string will
444 be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
446 The B<encoding> pragma changes this to use the specified encoding
447 instead. For example:
450 my $string = chr(20000); # a Unicode string
451 utf8::encode($string); # now it's a UTF-8 encoded byte string
452 # concatenate with another Unicode string
453 print length($string . chr(20000));
455 Will print C<2>, because C<$string> is upgraded as UTF-8. Without
456 C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
457 is three octets when interpreted as Latin-1.
461 =head2 DO NOT MIX MULTIPLE ENCODINGS
463 Notice that only literals (string or regular expression) having only
464 legacy code points are affected: if you mix data like this
469 the data is assumed to be in (Latin 1 and) Unicode, not in your native
470 encoding. In other words, this will match in "greek":
476 "\xDF\x{100}" =~ /\x{3af}\x{100}/
478 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
479 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
480 LETTER IOTA WITH TONOS) because of the C<\x{100}> in the same string. You
481 should not be mixing your legacy data and Unicode in the same string.
483 This pragma also affects encoding of the 0x80..0xFF code point range:
484 normally characters in that range are left as eight-bit bytes (unless
485 they are combined with characters with code points 0x100 or larger,
486 in which case all characters need to become UTF-8 encoded), but if
487 the C<encoding> pragma is present, even the 0x80..0xFF range always
490 After all, the best thing about this pragma is that you don't have to
491 resort to \x{....} just to spell your name in a native encoding.
492 So feel free to put your strings in your encoding in quotes and
495 =head2 Prior to Perl v5.22
497 The pragma was a per script, not a per block lexical. Only the last
498 C<use encoding> or C<no encoding> mattered, and it affected
499 B<the whole script>. However, the C<no encoding> pragma was supported and
500 C<use encoding> could appear as many times as you want in a given script
501 (though only the last was effective).
503 Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
504 were affected. This leads to spooky, incorrect action at a distance that is
507 This means you would have to be very careful of the load order:
510 package Module_IN_BAR;
512 # stuff in "bar" encoding here
518 # surprise! use encoding "bar" is in effect.
520 The best way to avoid this oddity is to use this pragma RIGHT AFTER
521 other modules are loaded. i.e.
526 =head2 Prior to Encode version 1.87
532 C<STDIN> and C<STDOUT> were not set under the filter option.
533 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work like
538 C<use utf8> wasn't implicitly declared so you had to C<use utf8> to do
544 =head2 Prior to Perl v5.8.1
548 =item "NON-EUC" doublebyte encodings
550 Because perl needs to parse the script before applying this pragma, such
551 encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
552 C<\x5c>) in the second byte fail because the second byte may
553 accidentally escape the quoting character that follows.
557 The B<encoding> pragma works by decoding string literals in
558 C<q//, qq//, qr//, qw//, qx//> and so forth. In perl v5.8.0, this
559 does not apply to C<tr///>. Therefore,
561 use encoding 'euc-jp';
563 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
564 # -------- -------- -------- --------
568 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
572 =item Legend of characters above
574 utf8 euc-jp charnames::viacode()
575 -----------------------------------------
576 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
577 \x{3093} \xA4\xF3 HIRAGANA LETTER N
578 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
579 \x{30f3} \xA5\xF3 KATAKANA LETTER N
583 This counterintuitive behavior has been fixed in perl v5.8.1.
585 In perl v5.8.0, you can work around this as follows:
587 use encoding 'euc-jp';
589 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
591 Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
592 this is the same as the classic idiom that makes C<tr///> 'interpolate':
594 tr/$from/$to/; # wrong!
595 eval qq{ tr/$from/$to/ }; # workaround.
599 =head1 EXAMPLE - Greekperl
601 use encoding "iso 8859-7";
603 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
608 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
612 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
614 # chr() is affected, and ...
616 print "mega\n" if ord(chr(0xdf)) == 0x3af;
618 # ... ord() is affected by the encoding pragma ...
620 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
622 # ... as are eq and cmp ...
624 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
625 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
627 # ... but pack/unpack C are not affected, in case you still
628 # want to go back to your native encoding
630 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
638 C<use encoding ...> is not thread-safe (i.e., do not use in threaded
641 =item Can't be used by more than one module in a single program.
643 Only one encoding is allowed. If you combine modules in a program that have
644 different encodings, only one will be actually used.
646 =item Other modules using C<STDIN> and C<STDOUT> get the encoded stream
648 They may be expecting something completely different.
650 =item literals in regex that are longer than 127 bytes
652 For native multibyte encodings (either fixed or variable length),
653 the current implementation of the regular expressions may introduce
654 recoding errors for regular expression literals longer than 127 bytes.
658 The encoding pragma is not supported on EBCDIC platforms.
662 This pragma doesn't work well with C<format> because PerlIO does not
663 get along very well with it. When C<format> contains non-ASCII
664 characters it prints funny or gets "wide character warnings".
665 To understand it, try the code below.
667 # Save this one in utf8
668 # replace *non-ascii* with a non-ascii string
674 $camel = "*non-ascii*";
675 binmode(STDOUT=>':encoding(utf8)'); # bang!
677 print $camel, "\n"; # fine
679 Without binmode this happens to work, but then print()
680 fails instead of write().
682 At any rate, the very use of C<format> is questionable when it comes to
683 Unicode characters since you have to consider such things as character
684 width (e.g. double-width for ideographs) and directions (e.g. BIDI for
687 =item See also L</CAVEATS>
693 This pragma first appeared in Perl v5.8.0. It has been enhanced in later
694 releases as specified above.
698 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
700 Ch. 15 of C<Programming Perl (3rd Edition)>
701 by Larry Wall, Tom Christiansen, Jon Orwant;
702 O'Reilly & Associates; ISBN 0-596-00027-8