This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Win32 0.34
[perl5.git] / ext / Encode / encoding.pm
CommitLineData
# $Id: encoding.pm,v 2.6 2007/04/22 14:56:12 dankogai Exp $
package encoding;
our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };

use Encode;
use strict;
use warnings;

# Debug switch.  Declared with an empty prototype and a constant body so
# that it can be used as a bareword in "DEBUG and warn ..." statements.
sub DEBUG () { 0 }

# Refuse to load when "A" has ordinal 193, which this module treats as
# the sign of an EBCDIC platform.
BEGIN {
    if ( ord("A") == 193 ) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }
}

# $HAS_PERLIO stays 0 unless PerlIO::encoding can be loaded, in which case
# it records whether that module is version 0.02 or later.  The eval block
# returns 1 on success so that we test its result rather than inspecting $@.
our $HAS_PERLIO = 0;
if ( eval { require PerlIO::encoding; 1 } ) {
    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
b2704119 23
# _exception($name)
#
# Returns 1 when ${^ENCODING} should be left unset for the encoding called
# $name, and 0 otherwise.  The result is 1 only if all of these hold:
#
#   * $] is not greater than 5.008 (i.e. perl is not newer than 5.8.0),
#   * $name is one of the UTF encodings listed in %utfs, and
#   * $Config{perl_patchlevel} is false.
sub _exception {
    my $name = shift;
    $] > 5.008 and return 0;    # 5.8.1 or higher then no

    my %utfs = map { $_ => 1 }
      qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
      UTF-32 UTF-32BE UTF-32LE);
    $utfs{$name} or return 0;    # UTFs or no

    # Config is loaded lazily because it is only needed on perl 5.8.0.
    require Config;
    Config->import();
    our %Config;
    return $Config{perl_patchlevel} ? 0 : 1;    # maintperl then no
}
fa6f41cf 36
# in_locale()
#
# Returns the bitwise AND of $^H and $locale::hint_bits (0 is used when
# that variable is unset), so the result is true only while those hint
# bits are set.
# NOTE(review): $locale::hint_bits is presumably provided by locale.pm for
# "use locale" -- confirm against that module.
sub in_locale { $^H & ( $locale::hint_bits || 0 ) }

# _get_locale_encoding()
#
# Derives an encoding name from the locale and returns it, or returns undef
# when nothing could be determined.
#
#   1. If I18N::Langinfo loads, use langinfo(CODESET).
#   2. If step 1 gave nothing and in_locale() is true, use the part after
#      the "." in $ENV{LC_ALL}, or else in $ENV{LANG}.
#   3. If step 1 gave nothing and in_locale() is false, use 'utf8' when
#      LC_ALL or LANG contains "utf8" / "utf-8" (case-insensitive).
#
# When the result is a bare "euc" and step 2 captured a language/country
# part, the name is refined to euc-jp, euc-kr, euc-cn or euc-tw; croaks if
# the language/country part matches none of those.
sub _get_locale_encoding {
    my $locale_encoding;

    # I18N::Langinfo isn't available everywhere
    eval {
        require I18N::Langinfo;
        I18N::Langinfo->import(qw(langinfo CODESET));
        $locale_encoding = langinfo( CODESET() );
    };

    my $country_language;

    no warnings 'uninitialized';

    # "not" binds more loosely than "&&": without the inner parentheses the
    # test reads not( $locale_encoding && in_locale() ), which ignores
    # in_locale() whenever no encoding was found and makes the elsif below
    # unreachable.
    if ( ( not $locale_encoding ) && in_locale() ) {
        if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/ ) {
            ( $country_language, $locale_encoding ) = ( $1, $2 );
        }
        elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.]+)$/ ) {
            ( $country_language, $locale_encoding ) = ( $1, $2 );
        }

        # LANGUAGE affects only LC_MESSAGES, and only on glibc
    }
    elsif ( not $locale_encoding ) {
        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
            || $ENV{LANG} =~ /\butf-?8\b/i )
        {
            $locale_encoding = 'utf8';
        }

        # Could do more heuristics based on the country and language
        # parts of LC_ALL and LANG (the parts before the dot (if any)),
        # since we have Locale::Country and Locale::Language available.
        # TODO: get a database of Language -> Encoding mappings
        # (the Estonian database at http://www.eki.ee/letter/
        # would be excellent!) --jhi
    }

    # NOTE(review): in the patterns below "^" applies only to the first
    # alternative and "$" only to the second; left as-is, confirm intent.
    if (   defined $locale_encoding
        && lc($locale_encoding) eq 'euc'
        && defined $country_language )
    {
        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
            $locale_encoding = 'euc-jp';
        }
        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
            $locale_encoding = 'euc-kr';
        }
        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
            $locale_encoding = 'euc-cn';
        }
        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
            $locale_encoding = 'euc-tw';
        }
        else {
            require Carp;
            Carp::croak(
                "encoding: Locale encoding '$locale_encoding' too ambiguous"
            );
        }
    }

    return $locale_encoding;
}
103
3ef515df
JH
# import( $class, $name, %arg )
#
# Runs for "use encoding ...".
#
#   $name  Encoding name.  ':_get_locale_encoding' only installs
#          _get_locale_encoding() into the calling package and returns
#          (used by lib/open.pm); ':locale' takes the name from
#          _get_locale_encoding(); undef falls back to $ENV{PERL_ENCODING}.
#   %arg   Filter => true    install a source filter that decodes each
#                            chunk of source, instead of setting ${^ENCODING}.
#          STDIN / STDOUT    encoding for that handle; a key that exists
#                            with a false value leaves the handle alone.
#
# Croaks when an encoding name cannot be resolved or when applying a layer
# raises an error.  Returns 1, or nothing for the export-only form.
sub import {
    my $class = shift;
    my $name  = shift;

    # $name is undef for a bare "use encoding;", so check definedness
    # before comparing to avoid "uninitialized" warnings.
    if ( defined $name && $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        {
            no strict 'refs';
            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        }
        return;
    }
    $name = _get_locale_encoding() if defined $name && $name eq ':locale';
    my %arg = @_;
    $name = $ENV{PERL_ENCODING} unless defined $name;
    my $enc = defined $name ? find_encoding($name) : undef;
    unless ( defined $enc ) {
        require Carp;
        Carp::croak( "encoding: Unknown encoding '"
              . ( defined $name ? $name : '' )
              . "'" );
    }
    $name = $enc->name;    # canonize
    unless ( $arg{Filter} ) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }
    else {
        defined( ${^ENCODING} ) and undef ${^ENCODING};

        # implicitly 'use utf8'
        require utf8;    # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(
                sub {
                    my $status = filter_read();
                    if ( $status > 0 ) {
                        $_ = $enc->decode( $_, 1 );
                        DEBUG and warn $_;
                    }
                    $status;
                }
            );
        };
        $@ eq '' and DEBUG and warn "Filter installed";
    }
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)) {
        my $layer;
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            $layer = ":raw :encoding($arg{$h})";
        }
        elsif ( !exists $arg{$h} ) {
            $layer = ":raw :encoding($name)";
        }
        else {
            next;    # key present but false: leave this handle untouched
        }

        # $@ is inspected only directly after the eval that may have set
        # it, so an error left over from the Filter eval above cannot be
        # mistaken for a binmode failure.
        eval {
            no warnings 'uninitialized';
            binmode( $h, $layer );
        };
        if ($@) {
            require Carp;
            Carp::croak($@);
        }
    }
    return 1;    # I doubt if we need it, though
}
176
# unimport()
#
# Runs for "no encoding;".  Clears ${^ENCODING}, calls binmode on STDIN and
# STDOUT (with the ":raw" layer when $HAS_PERLIO is true, with no layer
# argument otherwise) and, if Filter::Util::Call has been loaded, calls
# filter_del().
sub unimport {
    no warnings;
    undef ${^ENCODING};
    if ($HAS_PERLIO) {
        binmode( STDIN,  ":raw" );
        binmode( STDOUT, ":raw" );
    }
    else {
        binmode(STDIN);
        binmode(STDOUT);
    }
    if ( $INC{"Filter/Util/Call.pm"} ) {
        eval { filter_del() };    # any error from filter_del() is discarded
    }
}
192
1931;
194__END__
85982a32 195
3ef515df
JH
196=pod
197
198=head1 NAME
199
0ab8f81e 200encoding - allows you to write your script in non-ascii or non-utf8
3ef515df
JH
201
202=head1 SYNOPSIS
203
962111ca 204 use encoding "greek"; # Perl like Greek to you?
3ef515df
JH
205 use encoding "euc-jp"; # Jperl!
206
962111ca 207 # or you can even do this if your shell supports your native encoding
3ef515df 208
962111ca 209 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e 210 perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df 211
3ef515df
JH
212 # more control
213
962111ca 214 # A simple euc-cn => utf-8 converter
6d1c0808 215 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df
JH
216
217 # "no encoding;" supported (but not scoped!)
218 no encoding;
219
aae85ceb
DK
220 # an alternate way, Filter
221 use encoding "euc-jp", Filter=>1;
aae85ceb
DK
222 # now you can use kanji identifiers -- in euc-jp!
223
b1aeb384
JH
224 # switch on locale -
225 # note that this probably means that unless you have a complete control
226 # over the environments the application is ever going to be run, you should
227 # NOT use the feature of encoding pragma allowing you to write your script
228 # in any recognized encoding because changing locale settings will wreck
229 # the script; you can of course still use the other features of the pragma.
230 use encoding ':locale';
231
3ef515df
JH
232=head1 ABSTRACT
233
962111ca
JH
234Let's start with a bit of history: Perl 5.6.0 introduced Unicode
235support. You could apply C<substr()> and regexes even to complex CJK
236characters -- so long as the script was written in UTF-8. But back
0ab8f81e
JH
237then, text editors that supported UTF-8 were still rare and many users
238instead chose to write scripts in legacy encodings, giving up a whole
239new feature of Perl 5.6.
3ef515df 240
0ab8f81e 241Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca
JH
242pragma, you can write your script in any encoding you like (so long
243as the C<Encode> module supports it) and still enjoy Unicode support.
0f29a567 244This pragma achieves that by doing the following:
05ef2f67
JH
245
246=over
247
248=item *
249
250Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
251the encoding specified to utf8. In Perl 5.8.1 and later, literals in
252C<tr///> and C<DATA> pseudo-filehandle are also converted.
253
254=item *
255
256Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
257 specified.
258
259=back
260
261=head2 Literal Conversions
262
0ab8f81e 263You can write code in EUC-JP as follows:
3ef515df
JH
264
265 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
266 #<-char-><-char-> # 4 octets
267 s/\bCamel\b/$Rakuda/;
268
269And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 270the code in UTF-8:
3ef515df 271
32b9ed1f 272 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df
JH
273 s/\bCamel\b/$Rakuda/;
274
05ef2f67
JH
275=head2 PerlIO layers for C<STD(IN|OUT)>
276
277The B<encoding> pragma also modifies the filehandle layers of
4b291ae6 278STDIN and STDOUT to the specified encoding. Therefore,
3ef515df
JH
279
280 use encoding "euc-jp";
281 my $message = "Camel is the symbol of perl.\n";
282 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
283 $message =~ s/\bCamel\b/$Rakuda/;
284 print $message;
285
962111ca
JH
286Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
287not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 288
0ab8f81e 289You can override this by giving extra arguments; see below.
3ef515df 290
990e18f7
AT
291=head2 Implicit upgrading for byte strings
292
293By default, if strings operating under byte semantics and strings
294with Unicode character data are concatenated, the new string will
295be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
296
297The B<encoding> pragma changes this to use the specified encoding
298instead. For example:
299
300 use encoding 'utf8';
301 my $string = chr(20000); # a Unicode string
302 utf8::encode($string); # now it's a UTF-8 encoded byte string
303 # concatenate with another Unicode string
304 print length($string . chr(20000));
305
306Will print C<2>, because C<$string> is upgraded as UTF-8. Without
307C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
308is three octets when interpreted as Latin-1.
309
2575c402
JW
310=head2 Side effects
311
312If the C<encoding> pragma is in scope then the lengths returned are
313calculated from the length of C<$/> in Unicode characters, which is not
314always the same as the length of C<$/> in the native encoding.
315
316This pragma affects utf8::upgrade, but not utf8::downgrade.
317
51e4e64d
NC
318=head2 Side effects
319
320If the C<encoding> pragma is in scope then the lengths returned are
321calculated from the length of C<$/> in Unicode characters, which is not
322always the same as the length of C<$/> in the native encoding.
323
324This pragma affects utf8::upgrade, but not utf8::downgrade.
325
44b3b9c7
SP
326=head2 Side effects
327
328If the C<encoding> pragma is in scope then the lengths returned are
329calculated from the length of C<$/> in Unicode characters, which is not
330always the same as the length of C<$/> in the native encoding.
331
332This pragma affects utf8::upgrade, but not utf8::downgrade.
333
05ef2f67
JH
334=head1 FEATURES THAT REQUIRE 5.8.1
335
336Some of the features offered by this pragma require perl 5.8.1. Most
0f29a567 337of these are done by Inaba Hiroto. Any other features and changes
05ef2f67
JH
338are good for 5.8.0.
339
340=over
341
342=item "NON-EUC" doublebyte encodings
343
0f29a567 344Because perl needs to parse script before applying this pragma, such
05ef2f67
JH
345encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
346\x5c) in the second byte fail because the second byte may
0f29a567 347accidentally escape the quoting character that follows. Perl 5.8.1
05ef2f67
JH
348or later fixes this problem.
349
350=item tr//
351
352C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
353See the section below for details.
354
355=item DATA pseudo-filehandle
356
357Another feature that was overlooked was C<DATA>.
358
359=back
360
3ef515df
JH
361=head1 USAGE
362
363=over 4
364
365=item use encoding [I<ENCNAME>] ;
366
05ef2f67
JH
367Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
368exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
369":encoding(I<ENCNAME>)".
370
371Note that STDERR WILL NOT be changed.
372
373Also note that non-STD file handles remain unaffected. Use C<use
374open> or C<binmode> to change layers of those.
3ef515df
JH
375
376If no encoding is specified, the environment variable C<PERL_ENCODING>
962111ca
JH
377is consulted. If no encoding can be found, the error C<Unknown encoding
378'I<ENCNAME>'> will be thrown.
3ef515df 379
aae85ceb 380=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 381
0ab8f81e 382You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f
A
383C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
384first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 385completely off.
3ef515df 386
05ef2f67
JH
387When ${^UNICODE} exists and is non-zero, these options will be completely
388ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
389L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
390details (perl 5.8.1 and later).
391
151b5d36
JH
392=item use encoding I<ENCNAME> Filter=E<gt>1;
393
394This turns the encoding pragma into a source filter. While the
395default approach just decodes interpolated literals (in qq() and
396qr()), this will apply a source filter to the entire source code. See
05ef2f67 397L</"The Filter Option"> below for details.
151b5d36 398
3ef515df
JH
399=item no encoding;
400
05ef2f67 401Unsets the script encoding. The layers of STDIN, STDOUT are
962111ca 402reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df
JH
403
404=back
405
151b5d36
JH
406=head1 The Filter Option
407
408The magic of C<use encoding> is not applied to the names of
409identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
410is a single Han ideograph) work, you still need to write your script
411in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
412
151b5d36
JH
413What does this mean? Your source code behaves as if it is written in
414UTF-8 with 'use utf8' in effect. So even if your editor only supports
415Shift_JIS, for example, you can still try examples in Chapter 15 of
416C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
417identifiers.
418
419This option is significantly slower and (as of this writing) non-ASCII
420identifiers are not very stable WITHOUT this option and with the
421source code written in UTF-8.
422
423=head2 Filter-related changes at Encode version 1.87
424
425=over
426
427=item *
428
429The Filter option now sets STDIN and STDOUT like non-filter options.
430And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
431non-filter version.
432
433=item *
434
435C<use utf8> is implicitly declared so you no longer have to C<use
436utf8> to C<${"\x{4eba}"}++>.
437
438=back
439
3ef515df
JH
440=head1 CAVEATS
441
442=head2 NOT SCOPED
443
444The pragma is a per script, not a per block lexical. Only the last
621b0f8d
DK
445C<use encoding> or C<no encoding> matters, and it affects
446B<the whole script>. However, the C<no encoding> pragma is supported and
447B<use encoding> can appear as many times as you want in a given script.
448The multiple use of this pragma is discouraged.
449
0f29a567 450For the same reason, the use of this pragma inside modules is also
3c4b39be 451discouraged (though not as strongly discouraged as the case above.
0f29a567 452See below).
05ef2f67
JH
453
454If you still have to write a module with this pragma, be very careful
455of the load order. See the code below:
456
457 # called module
458 package Module_IN_BAR;
459 use encoding "bar";
460 # stuff in "bar" encoding here
461 1;
462
463 # caller script
464 use encoding "foo";
465 use Module_IN_BAR;
466 # surprise! use encoding "bar" is in effect.
467
468The best way to avoid this oddity is to use this pragma RIGHT AFTER
469other modules are loaded. i.e.
470
471 use Module_IN_BAR;
472 use encoding "foo";
3ef515df
JH
473
474=head2 DO NOT MIX MULTIPLE ENCODINGS
475
476Notice that only literals (string or regular expression) having only
477legacy code points are affected: if you mix data like this
478
d1256cb1 479 \xDF\x{100}
3ef515df
JH
480
481the data is assumed to be in (Latin 1 and) Unicode, not in your native
482encoding. In other words, this will match in "greek":
483
d1256cb1 484 "\xDF" =~ /\x{3af}/
3ef515df
JH
485
486but this will not
487
d1256cb1 488 "\xDF\x{100}" =~ /\x{3af}\x{100}/
3ef515df 489
962111ca
JH
490since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
491the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
492LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
493should not be mixing your legacy data and Unicode in the same string.
3ef515df
JH
494
495This pragma also affects encoding of the 0x80..0xFF code point range:
496normally characters in that range are left as eight-bit bytes (unless
497they are combined with characters with code points 0x100 or larger,
498in which case all characters need to become UTF-8 encoded), but if
499the C<encoding> pragma is present, even the 0x80..0xFF range always
500gets UTF-8 encoded.
501
502After all, the best thing about this pragma is that you don't have to
0ab8f81e
JH
503resort to \x{....} just to spell your name in a native encoding.
504So feel free to put your strings in your encoding in quotes and
505regexes.
3ef515df 506
151b5d36 507=head2 tr/// with ranges
4b291ae6
DK
508
509The B<encoding> pragma works by decoding string literals in
151b5d36 510C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6
DK
511does not apply to C<tr///>. Therefore,
512
513 use encoding 'euc-jp';
514 #....
515 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
516 # -------- -------- -------- --------
517
518Does not work as
519
520 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
521
522=over
523
524=item Legend of characters above
525
526 utf8 euc-jp charnames::viacode()
527 -----------------------------------------
528 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
529 \x{3093} \xA4\xF3 HIRAGANA LETTER N
530 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
531 \x{30f3} \xA5\xF3 KATAKANA LETTER N
532
533=back
534
05ef2f67 535This counterintuitive behavior has been fixed in perl 5.8.1.
151b5d36 536
4b291ae6
DK
537=head3 workaround to tr///;
538
ce16148b 539In perl 5.8.0, you can work around it as follows:
4b291ae6
DK
540
541 use encoding 'euc-jp';
151b5d36 542 # ....
4b291ae6
DK
543 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
544
ce16148b 545Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
4b291ae6
DK
546it is the same as the classic idiom that makes C<tr///> 'interpolate'.
547
548 tr/$from/$to/; # wrong!
549 eval qq{ tr/$from/$to/ }; # workaround.
550
551Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
552C<tr///> not being decoded was obviously against the will of Perl5
05ef2f67 553Porters so it has been fixed in Perl 5.8.1 or later.
aae85ceb 554
3ef515df
JH
555=head1 EXAMPLE - Greekperl
556
557 use encoding "iso 8859-7";
558
0ab8f81e 559 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df
JH
560
561 $a = "\xDF";
562 $b = "\x{100}";
563
564 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
565
566 $c = $a . $b;
567
568 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
569
570 # chr() is affected, and ...
571
572 print "mega\n" if ord(chr(0xdf)) == 0x3af;
573
574 # ... ord() is affected by the encoding pragma ...
575
576 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
577
578 # ... as are eq and cmp ...
579
580 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
581 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
582
583 # ... but pack/unpack C are not affected, in case you still
0ab8f81e 584 # want to go back to your native encoding
3ef515df
JH
585
586 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
587
588=head1 KNOWN PROBLEMS
589
151b5d36
JH
590=over
591
0f29a567 592=item literals in regex that are longer than 127 bytes
151b5d36 593
0ab8f81e 594For native multibyte encodings (either fixed or variable length),
3ef515df 595the current implementation of the regular expressions may introduce
0ab8f81e 596recoding errors for regular expression literals longer than 127 bytes.
3ef515df 597
05ef2f67 598=item EBCDIC
151b5d36 599
3ef515df 600The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e
JH
601(Porters who are willing and able to remove this limitation are
602welcome.)
3ef515df 603
05ef2f67
JH
604=item format
605
606This pragma doesn't work well with format because PerlIO does not
607get along very well with it. When format contains non-ascii
608characters it prints funny or gets "wide character warnings".
609To understand it, try the code below.
610
611 # Save this one in utf8
612 # replace *non-ascii* with a non-ascii string
613 my $camel;
614 format STDOUT =
615 *non-ascii*@>>>>>>>
616 $camel
617 .
618 $camel = "*non-ascii*";
619 binmode(STDOUT=>':encoding(utf8)'); # bang!
620 write; # funny
621 print $camel, "\n"; # fine
622
623Without binmode this happens to work but without binmode, print()
624fails instead of write().
625
626At any rate, the very use of format is questionable when it comes to
627unicode characters since you have to consider such things as character
628width (i.e. double-width for ideographs) and directions (i.e. BIDI for
629Arabic and Hebrew).
630
7303322e
RGS
631=item Thread safety
632
633C<use encoding ...> is not thread-safe (i.e., do not use in threaded
634applications).
635
151b5d36
JH
636=back
637
b1aeb384
JH
638=head2 The Logic of :locale
639
640The logic of C<:locale> is as follows:
641
642=over 4
643
644=item 1.
645
646If the platform supports the langinfo(CODESET) interface, the codeset
647returned is used as the default encoding for the open pragma.
648
649=item 2.
650
651If 1. didn't work but we are under the locale pragma, the environment
652variables LC_ALL and LANG (in that order) are matched for encodings
653(the part after C<.>, if any), and if any found, that is used
654as the default encoding for the open pragma.
655
656=item 3.
657
658If 1. and 2. didn't work, the environment variables LC_ALL and LANG
659(in that order) are matched for anything looking like UTF-8, and if
660any found, C<:utf8> is used as the default encoding for the open
661pragma.
662
663=back
664
665If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
666contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
667the default encoding of your STDIN, STDOUT, and STDERR, and of
668B<any subsequent file open>, is UTF-8.
669
05ef2f67
JH
670=head1 HISTORY
671
672This pragma first appeared in Perl 5.8.0. For features that require
6735.8.1 and better, see above.
674
b1aeb384
JH
675The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
676
3ef515df
JH
677=head1 SEE ALSO
678
aae85ceb
DK
679L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
680
681Ch. 15 of C<Programming Perl (3rd Edition)>
682by Larry Wall, Tom Christiansen, Jon Orwant;
683O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df
JH
684
685=cut