This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Comment tweak from Philip.
[perl5.git] / ext / Encode / encoding.pm
CommitLineData
3ef515df 1package encoding;
6d1c0808 2our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df
JH
3
4use Encode;
046f36bf 5use strict;
3ef515df
JH
6
7BEGIN {
8 if (ord("A") == 193) {
9 require Carp;
10 Carp::croak "encoding pragma does not support EBCDIC platforms";
11 }
12}
13
85982a32
JH
14our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"};
15$HAS_PERLIO or binmode(STDIN);
b2704119 16
3ef515df
JH
17sub import {
18 my $class = shift;
19 my $name = shift;
20 my %arg = @_;
21 $name ||= $ENV{PERL_ENCODING};
22
23 my $enc = find_encoding($name);
24 unless (defined $enc) {
25 require Carp;
26 Carp::croak "Unknown encoding '$name'";
27 }
aae85ceb
DK
28 unless ($arg{Filter}){
29 ${^ENCODING} = $enc; # this is all you need, actually.
85982a32 30 $HAS_PERLIO or return 1;
aae85ceb
DK
31 for my $h (qw(STDIN STDOUT)){
32 if ($arg{$h}){
b2704119 33 unless (defined find_encoding($arg{$h})) {
aae85ceb
DK
34 require Carp;
35 Carp::croak "Unknown encoding for $h, '$arg{$h}'";
36 }
37 eval qq{ binmode($h, ":encoding($arg{$h})") };
38 }else{
39 unless (exists $arg{$h}){
40 eval qq{ binmode($h, ":encoding($name)") };
41 }
42 }
43 if ($@){
3ef515df 44 require Carp;
aae85ceb 45 Carp::croak($@);
3ef515df 46 }
3ef515df 47 }
aae85ceb
DK
48 }else{
49 defined(${^ENCODING}) and undef ${^ENCODING};
50 eval {
51 require Filter::Util::Call ;
52 Filter::Util::Call->import ;
b2704119
JH
53 binmode(STDIN);
54 binmode(STDOUT);
aae85ceb
DK
55 filter_add(sub{
56 my $status;
57 if (($status = filter_read()) > 0){
58 $_ = $enc->decode($_, 1);
59 # warn $_;
60 }
61 $status ;
62 });
63 };
64 # warn "Filter installed";
3ef515df
JH
65 }
66 return 1; # I doubt if we need it, though
67}
68
69sub unimport{
70 no warnings;
71 undef ${^ENCODING};
b2704119
JH
72 binmode(STDIN);
73 binmode(STDOUT);
aae85ceb
DK
74 if ($INC{"Filter/Util/Call.pm"}){
75 eval { filter_del() };
76 }
3ef515df
JH
77}
78
791;
80__END__
85982a32 81
3ef515df
JH
82=pod
83
84=head1 NAME
85
32b9ed1f 86encoding - allows you to write your script in non-ascii or non-utf8
3ef515df
JH
87
88=head1 SYNOPSIS
89
962111ca 90 use encoding "greek"; # Perl like Greek to you?
3ef515df
JH
91 use encoding "euc-jp"; # Jperl!
92
962111ca 93 # or you can even do this if your shell supports your native encoding
3ef515df 94
962111ca 95 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
6d1c0808 96 perl -Mencoding=euc-ko -e '...'
3ef515df
JH
97
98 # or from the shebang line
99
962111ca 100 #!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights
6d1c0808 101 #!/your/path/to/perl -Mencoding=euc-tw
3ef515df
JH
102
103 # more control
104
962111ca 105 # A simple euc-cn => utf-8 converter
6d1c0808 106 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df
JH
107
108 # "no encoding;" supported (but not scoped!)
109 no encoding;
110
aae85ceb
DK
111 # an alternate way, Filter
112 use encoding "euc-jp", Filter=>1;
113 use utf8;
114 # now you can use kanji identifiers -- in euc-jp!
115
3ef515df
JH
116=head1 ABSTRACT
117
962111ca
JH
118Let's start with a bit of history: Perl 5.6.0 introduced Unicode
119support. You could apply C<substr()> and regexes even to complex CJK
120characters -- so long as the script was written in UTF-8. But back
121then text editors that supported UTF-8 were still rare and many users
122rather chose to write scripts in legacy encodings, given up whole new
123feature of Perl 5.6.
3ef515df 124
962111ca
JH
125Rewind to the future: starting from perl 5.8.0 with B<encoding>
126pragma, you can write your script in any encoding you like (so long
127as the C<Encode> module supports it) and still enjoy Unicode support.
128You can write a code in EUC-JP as follows:
3ef515df
JH
129
130 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
131 #<-char-><-char-> # 4 octets
132 s/\bCamel\b/$Rakuda/;
133
134And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 135the code in UTF-8:
3ef515df 136
32b9ed1f 137 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df
JH
138 s/\bCamel\b/$Rakuda/;
139
962111ca 140The B<encoding> pragma also modifies the filehandle disciplines of
3ef515df
JH
141STDIN, STDOUT, and STDERR to the specified encoding. Therefore,
142
143 use encoding "euc-jp";
144 my $message = "Camel is the symbol of perl.\n";
145 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
146 $message =~ s/\bCamel\b/$Rakuda/;
147 print $message;
148
962111ca
JH
149Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
150not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 151
962111ca 152You can override this by giving extra arguments, see below.
3ef515df
JH
153
154=head1 USAGE
155
156=over 4
157
158=item use encoding [I<ENCNAME>] ;
159
962111ca
JH
160Sets the script encoding to I<ENCNAME> and filehandle disciplines of
161STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will
162not be changed.
3ef515df
JH
163
164If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca
JH
165is consulted. If no encoding can be found, the error C<Unknown encoding
166'I<ENCNAME>'> will be thrown.
3ef515df
JH
167
168Note that non-STD file handles remain unaffected. Use C<use open> or
169C<binmode> to change disciplines of those.
170
aae85ceb 171=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 172
aae85ceb 173You can also individually set encodings of STDIN and STDOUT via
32b9ed1f
A
174C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
175first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 176completely off.
3ef515df
JH
177
178=item no encoding;
179
f2a2953c 180Unsets the script encoding and the disciplines of STDIN, STDOUT are
962111ca 181reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df
JH
182
183=back
184
185=head1 CAVEATS
186
187=head2 NOT SCOPED
188
189The pragma is a per script, not a per block lexical. Only the last
32b9ed1f 190C<use encoding> or C<no encoding> matters, and it affects B<the whole script>.
962111ca
JH
191However, <no encoding> pragma is supported and C<use encoding> can
192appear as many times as you want in a given script. The multiple use
3ef515df
JH
193of this pragma is discouraged.
194
195=head2 DO NOT MIX MULTIPLE ENCODINGS
196
197Notice that only literals (string or regular expression) having only
198legacy code points are affected: if you mix data like this
199
200 \xDF\x{100}
201
202the data is assumed to be in (Latin 1 and) Unicode, not in your native
203encoding. In other words, this will match in "greek":
204
205 "\xDF" =~ /\x{3af}/
206
207but this will not
208
209 "\xDF\x{100}" =~ /\x{3af}\x{100}/
210
962111ca
JH
211since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
212the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
213LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
214should not be mixing your legacy data and Unicode in the same string.
3ef515df
JH
215
216This pragma also affects encoding of the 0x80..0xFF code point range:
217normally characters in that range are left as eight-bit bytes (unless
218they are combined with characters with code points 0x100 or larger,
219in which case all characters need to become UTF-8 encoded), but if
220the C<encoding> pragma is present, even the 0x80..0xFF range always
221gets UTF-8 encoded.
222
223After all, the best thing about this pragma is that you don't have to
962111ca 224resort to \x... just to spell your name in native a encoding. So feel
3ef515df
JH
225free to put your strings in your encoding in quotes and regexes.
226
962111ca 227=head1 Non-ASCII Identifiers and Filter option
aae85ceb 228
962111ca 229The magic of C<use encoding> is not applied to the names of
32b9ed1f 230identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
962111ca
JH
231is a single Han ideograph) work, you still need to write your script
232in UTF-8 or use a source filter.
aae85ceb
DK
233
234In other words, the same restriction as Jperl applies.
235
962111ca 236If you dare to experiment, however, you can try Filter option.
aae85ceb
DK
237
238=over 4
239
240=item use encoding I<ENCNAME> Filter=E<gt>1;
241
242This turns encoding pragma into source filter. While the default
243approach just decodes interpolated literals (in qq() and qr()), this
244will apply source filter to entire source code. In this case, STDIN
245and STDOUT remain untouched.
246
247=back
248
962111ca
JH
249What does this mean? Your source code behaves as if it is written in
250UTF-8. So even if your editor only supports Shift_JIS, for example.
251You can still try examples in Chapter 15 of C<Programming Perl, 3rd
252Ed.> For instance, you can use UTF-8 identifiers.
aae85ceb
DK
253
254This option is significantly slower and (as of this writing) non-ASCII
255identifiers are not very stable WITHOUT this option and with the
256source code written in UTF-8.
257
962111ca
JH
258To make your script in legacy encoding work with minimum effort,
259do not use Filter=E<gt>1.
aae85ceb 260
3ef515df
JH
261=head1 EXAMPLE - Greekperl
262
263 use encoding "iso 8859-7";
264
265 # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
266
267 $a = "\xDF";
268 $b = "\x{100}";
269
270 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
271
272 $c = $a . $b;
273
274 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
275
276 # chr() is affected, and ...
277
278 print "mega\n" if ord(chr(0xdf)) == 0x3af;
279
280 # ... ord() is affected by the encoding pragma ...
281
282 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
283
284 # ... as are eq and cmp ...
285
286 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
287 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
288
289 # ... but pack/unpack C are not affected, in case you still
290 # want back to your native encoding
291
292 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
293
294=head1 KNOWN PROBLEMS
295
296For native multibyte encodings (either fixed or variable length)
297the current implementation of the regular expressions may introduce
298recoding errors for longer regular expression literals than 127 bytes.
299
300The encoding pragma is not supported on EBCDIC platforms.
301(Porters wanted.)
302
303=head1 SEE ALSO
304
aae85ceb
DK
305L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
306
307Ch. 15 of C<Programming Perl (3rd Edition)>
308by Larry Wall, Tom Christiansen, Jon Orwant;
309O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df
JH
310
311=cut