This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
From Inaba Hiroto: DATA wasn't properly utf8ed
[perl5.git] / ext / Encode / encoding.pm
CommitLineData
3ef515df 1package encoding;
6be7c101 2our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df
JH
3
4use Encode;
046f36bf 5use strict;
151b5d36 6our $DEBUG = 0;
3ef515df
JH
7
8BEGIN {
9 if (ord("A") == 193) {
10 require Carp;
10c5ecbb 11 Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df
JH
12 }
13}
14
0ab8f81e
JH
15our $HAS_PERLIO = 0;
16eval { require PerlIO::encoding };
17unless ($@){
18 $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
19}
b2704119 20
151b5d36
JH
21sub _exception{
22 my $name = shift;
23 $] > 5.008 and return 0; # 5.8.1 then no
24 my %utfs = map {$_=>1}
25 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
26 UTF-32 UTF-32BE UTF-32LE);
27 $utfs{$name} or return 0; # UTFs or no
28 require Config; Config->import(); our %Config;
29 return $Config{perl_patchlevel} == 0 # maintperl then no
30}
fa6f41cf 31
3ef515df
JH
32sub import {
33 my $class = shift;
34 my $name = shift;
35 my %arg = @_;
36 $name ||= $ENV{PERL_ENCODING};
3ef515df
JH
37 my $enc = find_encoding($name);
38 unless (defined $enc) {
39 require Carp;
10c5ecbb 40 Carp::croak("Unknown encoding '$name'");
3ef515df 41 }
fa6f41cf 42 $name = $enc->name; # canonize
0f7c507f 43 unless ($arg{Filter}) {
151b5d36
JH
44 $DEBUG and warn "_exception($name) = ", _exception($name);
45 _exception($name) or ${^ENCODING} = $enc;
85982a32 46 $HAS_PERLIO or return 1;
aae85ceb
DK
47 }else{
48 defined(${^ENCODING}) and undef ${^ENCODING};
151b5d36
JH
49 # implicitly 'use utf8'
50 require utf8; # to fetch $utf8::hint_bits;
51 $^H |= $utf8::hint_bits;
aae85ceb
DK
52 eval {
53 require Filter::Util::Call ;
54 Filter::Util::Call->import ;
aae85ceb 55 filter_add(sub{
151b5d36
JH
56 my $status = filter_read();
57 if ($status > 0){
58 # $DEBUG and warn $_;
aae85ceb 59 $_ = $enc->decode($_, 1);
151b5d36 60 $DEBUG and warn $_;
aae85ceb
DK
61 }
62 $status ;
63 });
64 };
6be7c101 65 } $DEBUG and warn "Filter installed";
151b5d36
JH
66 for my $h (qw(STDIN STDOUT)){
67 if ($arg{$h}){
68 unless (defined find_encoding($arg{$h})) {
69 require Carp;
70 Carp::croak("Unknown encoding for $h, '$arg{$h}'");
71 }
6be7c101 72 eval { binmode($h, ":raw :encoding($arg{$h})") };
151b5d36
JH
73 }else{
74 unless (exists $arg{$h}){
75 eval {
76 no warnings 'uninitialized';
6be7c101 77 binmode($h, ":raw :encoding($name)");
151b5d36
JH
78 };
79 }
80 }
81 if ($@){
82 require Carp;
83 Carp::croak($@);
84 }
3ef515df
JH
85 }
86 return 1; # I doubt if we need it, though
87}
88
89sub unimport{
90 no warnings;
91 undef ${^ENCODING};
621b0f8d
DK
92 if ($HAS_PERLIO){
93 binmode(STDIN, ":raw");
94 binmode(STDOUT, ":raw");
95 }else{
6be7c101
JH
96 binmode(STDIN);
97 binmode(STDOUT);
621b0f8d 98 }
aae85ceb
DK
99 if ($INC{"Filter/Util/Call.pm"}){
100 eval { filter_del() };
101 }
3ef515df
JH
102}
103
1041;
105__END__
85982a32 106
3ef515df
JH
107=pod
108
109=head1 NAME
110
0ab8f81e 111encoding - allows you to write your script in non-ascii or non-utf8
3ef515df
JH
112
113=head1 SYNOPSIS
114
962111ca 115 use encoding "greek"; # Perl like Greek to you?
3ef515df
JH
116 use encoding "euc-jp"; # Jperl!
117
962111ca 118 # or you can even do this if your shell supports your native encoding
3ef515df 119
962111ca 120 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e 121 perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df 122
3ef515df
JH
123 # more control
124
962111ca 125 # A simple euc-cn => utf-8 converter
6d1c0808 126 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df
JH
127
128 # "no encoding;" supported (but not scoped!)
129 no encoding;
130
aae85ceb
DK
131 # an alternate way, Filter
132 use encoding "euc-jp", Filter=>1;
aae85ceb
DK
133 # now you can use kanji identifiers -- in euc-jp!
134
3ef515df
JH
135=head1 ABSTRACT
136
962111ca
JH
137Let's start with a bit of history: Perl 5.6.0 introduced Unicode
138support. You could apply C<substr()> and regexes even to complex CJK
139characters -- so long as the script was written in UTF-8. But back
0ab8f81e
JH
140then, text editors that supported UTF-8 were still rare and many users
141instead chose to write scripts in legacy encodings, giving up a whole
142new feature of Perl 5.6.
3ef515df 143
0ab8f81e 144Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca
JH
145pragma, you can write your script in any encoding you like (so long
146as the C<Encode> module supports it) and still enjoy Unicode support.
0ab8f81e 147You can write code in EUC-JP as follows:
3ef515df
JH
148
149 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
150 #<-char-><-char-> # 4 octets
151 s/\bCamel\b/$Rakuda/;
152
153And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 154the code in UTF-8:
3ef515df 155
32b9ed1f 156 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df
JH
157 s/\bCamel\b/$Rakuda/;
158
962111ca 159The B<encoding> pragma also modifies the filehandle disciplines of
4b291ae6 160STDIN and STDOUT to the specified encoding. Therefore,
3ef515df
JH
161
162 use encoding "euc-jp";
163 my $message = "Camel is the symbol of perl.\n";
164 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
165 $message =~ s/\bCamel\b/$Rakuda/;
166 print $message;
167
962111ca
JH
168Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
169not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 170
0ab8f81e 171You can override this by giving extra arguments; see below.
3ef515df
JH
172
173=head1 USAGE
174
175=over 4
176
177=item use encoding [I<ENCNAME>] ;
178
0ab8f81e
JH
179Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
180STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
181will not be changed.
3ef515df
JH
182
183If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca
JH
184is consulted. If no encoding can be found, the error C<Unknown encoding
185'I<ENCNAME>'> will be thrown.
3ef515df
JH
186
187Note that non-STD file handles remain unaffected. Use C<use open> or
188C<binmode> to change disciplines of those.
189
aae85ceb 190=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 191
0ab8f81e 192You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f
A
193C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
194first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 195completely off.
3ef515df 196
151b5d36
JH
197=item use encoding I<ENCNAME> Filter=E<gt>1;
198
199This turns the encoding pragma into a source filter. While the
200default approach just decodes interpolated literals (in qq() and
201qr()), this will apply a source filter to the entire source code. See
202L</"The Filter Option"> below for details
203
3ef515df
JH
204=item no encoding;
205
0ab8f81e 206Unsets the script encoding. The disciplines of STDIN, STDOUT are
962111ca 207reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df
JH
208
209=back
210
151b5d36
JH
211=head1 The Filter Option
212
213The magic of C<use encoding> is not applied to the names of
214identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
215is a single Han ideograph) work, you still need to write your script
216in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
217
218
219What does this mean? Your source code behaves as if it is written in
220UTF-8 with 'use utf8' in effect. So even if your editor only supports
221Shift_JIS, for example, you can still try examples in Chapter 15 of
222C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
223identifiers.
224
225This option is significantly slower and (as of this writing) non-ASCII
226identifiers are not very stable WITHOUT this option and with the
227source code written in UTF-8.
228
229=head2 Filter-related changes at Encode version 1.87
230
231=over
232
233=item *
234
235The Filter option now sets STDIN and STDOUT like non-filter options.
236And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
237non-filter version.
238
239=item *
240
241C<use utf8> is implicitly declared so you no longer have to C<use
242utf8> to C<${"\x{4eba}"}++>.
243
244=back
245
3ef515df
JH
246=head1 CAVEATS
247
248=head2 NOT SCOPED
249
250The pragma is a per script, not a per block lexical. Only the last
621b0f8d
DK
251C<use encoding> or C<no encoding> matters, and it affects
252B<the whole script>. However, the <no encoding> pragma is supported and
253B<use encoding> can appear as many times as you want in a given script.
254The multiple use of this pragma is discouraged.
255
256Because of this nature, the use of this pragma inside the module is
257strongly discouraged (because the influence of this pragma lasts not
258only for the module but the script that uses). But if you have to,
259make sure you say C<no encoding> at the end of the module so you
260contain the influence of the pragma within the module.
3ef515df
JH
261
262=head2 DO NOT MIX MULTIPLE ENCODINGS
263
264Notice that only literals (string or regular expression) having only
265legacy code points are affected: if you mix data like this
266
267 \xDF\x{100}
268
269the data is assumed to be in (Latin 1 and) Unicode, not in your native
270encoding. In other words, this will match in "greek":
271
272 "\xDF" =~ /\x{3af}/
273
274but this will not
275
276 "\xDF\x{100}" =~ /\x{3af}\x{100}/
277
962111ca
JH
278since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
279the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
280LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
281should not be mixing your legacy data and Unicode in the same string.
3ef515df
JH
282
283This pragma also affects encoding of the 0x80..0xFF code point range:
284normally characters in that range are left as eight-bit bytes (unless
285they are combined with characters with code points 0x100 or larger,
286in which case all characters need to become UTF-8 encoded), but if
287the C<encoding> pragma is present, even the 0x80..0xFF range always
288gets UTF-8 encoded.
289
290After all, the best thing about this pragma is that you don't have to
0ab8f81e
JH
291resort to \x{....} just to spell your name in a native encoding.
292So feel free to put your strings in your encoding in quotes and
293regexes.
3ef515df 294
151b5d36
JH
295=head2 format doesn't work well
296
297This pragma doesn't work well with format because PerlIO does not
298get along very well with it. When format contains non-ascii
299characters it prints funny or gets "wide character warnings".
300To understand it, try the code below.
301
302 # Save this one in utf8
303 # replace *non-ascii* with a non-ascii string
304 my $camel;
305 format STDOUT =
306 *non-ascii*@>>>>>>>
307 $camel
308 .
309 $camel = "*non-ascii*";
310 binmode(STDOUT=>':encoding(utf8)'); # bang!
311 write; # funny
312 print $camel, "\n"; # fine
313
314Without binmode this happens to work but without binmode, print()
315fails instead of write().
316
317At any rate, the very use of format is questionable when it comes to
318unicode characters since you have to consider such things as character
319width (i.e. double-width for ideographs) and directions (i.e. BIDI for
320Arabic and Hebrew).
321
322=head2 tr/// with ranges
4b291ae6
DK
323
324The B<encoding> pragma works by decoding string literals in
151b5d36 325C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6
DK
326does not apply to C<tr///>. Therefore,
327
328 use encoding 'euc-jp';
329 #....
330 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
331 # -------- -------- -------- --------
332
333Does not work as
334
335 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
336
337=over
338
339=item Legend of characters above
340
341 utf8 euc-jp charnames::viacode()
342 -----------------------------------------
343 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
344 \x{3093} \xA4\xF3 HIRAGANA LETTER N
345 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
346 \x{30f3} \xA5\xF3 KATAKANA LETTER N
347
348=back
349
151b5d36
JH
350This counterintuitive behavior has been fixed in perl 5.8.1 and up
351by INABA Hirohito.
352
4b291ae6
DK
353=head3 workaround to tr///;
354
151b5d36 355In perl 5.8.0, you can work aroud as follows;
4b291ae6
DK
356
357 use encoding 'euc-jp';
151b5d36 358 # ....
4b291ae6
DK
359 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
360
361Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
362is the same as classic idiom that makes C<tr///> 'interpolate'.
363
364 tr/$from/$to/; # wrong!
365 eval qq{ tr/$from/$to/ }; # workaround.
366
367Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
368C<tr///> not being decoded was obviously against the will of Perl5
151b5d36 369Porters so it has been fixed.
aae85ceb 370
3ef515df
JH
371=head1 EXAMPLE - Greekperl
372
373 use encoding "iso 8859-7";
374
0ab8f81e 375 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df
JH
376
377 $a = "\xDF";
378 $b = "\x{100}";
379
380 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
381
382 $c = $a . $b;
383
384 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
385
386 # chr() is affected, and ...
387
388 print "mega\n" if ord(chr(0xdf)) == 0x3af;
389
390 # ... ord() is affected by the encoding pragma ...
391
392 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
393
394 # ... as are eq and cmp ...
395
396 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
397 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
398
399 # ... but pack/unpack C are not affected, in case you still
0ab8f81e 400 # want to go back to your native encoding
3ef515df
JH
401
402 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
403
404=head1 KNOWN PROBLEMS
405
151b5d36
JH
406=over
407
408=item *
409
0ab8f81e 410For native multibyte encodings (either fixed or variable length),
3ef515df 411the current implementation of the regular expressions may introduce
0ab8f81e 412recoding errors for regular expression literals longer than 127 bytes.
3ef515df 413
151b5d36
JH
414=item *
415
3ef515df 416The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e
JH
417(Porters who are willing and able to remove this limitation are
418welcome.)
3ef515df 419
151b5d36
JH
420=back
421
3ef515df
JH
422=head1 SEE ALSO
423
aae85ceb
DK
424L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
425
426Ch. 15 of C<Programming Perl (3rd Edition)>
427by Larry Wall, Tom Christiansen, Jon Orwant;
428O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df
JH
429
430=cut