Commit | Line | Data |
---|---|---|
3ef515df | 1 | package encoding; |
fa6f41cf | 2 | our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df JH |
3 | |
4 | use Encode; | |
046f36bf | 5 | use strict; |
3ef515df JH |
6 | |
7 | BEGIN { | |
8 | if (ord("A") == 193) { | |
9 | require Carp; | |
10c5ecbb | 10 | Carp::croak("encoding pragma does not support EBCDIC platforms"); |
3ef515df JH |
11 | } |
12 | } | |
13 | ||
0ab8f81e JH |
14 | our $HAS_PERLIO = 0; |
15 | eval { require PerlIO::encoding }; | |
16 | unless ($@){ | |
17 | $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); | |
18 | } | |
b2704119 | 19 | |
fa6f41cf JH |
20 | my %utfs = map {$_=>1} |
21 | qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE); | |
22 | ||
3ef515df JH |
23 | sub import { |
24 | my $class = shift; | |
25 | my $name = shift; | |
26 | my %arg = @_; | |
27 | $name ||= $ENV{PERL_ENCODING}; | |
28 | ||
29 | my $enc = find_encoding($name); | |
30 | unless (defined $enc) { | |
31 | require Carp; | |
10c5ecbb | 32 | Carp::croak("Unknown encoding '$name'"); |
3ef515df | 33 | } |
fa6f41cf | 34 | $name = $enc->name; # canonize |
0f7c507f | 35 | unless ($arg{Filter}) { |
fa6f41cf | 36 | ${^ENCODING} = $enc unless $] <= 5.008 and $utfs{$name}; |
85982a32 | 37 | $HAS_PERLIO or return 1; |
aae85ceb DK |
38 | for my $h (qw(STDIN STDOUT)){ |
39 | if ($arg{$h}){ | |
b2704119 | 40 | unless (defined find_encoding($arg{$h})) { |
aae85ceb | 41 | require Carp; |
10c5ecbb | 42 | Carp::croak("Unknown encoding for $h, '$arg{$h}'"); |
aae85ceb | 43 | } |
0ab8f81e | 44 | eval { binmode($h, ":encoding($arg{$h})") }; |
aae85ceb DK |
45 | }else{ |
46 | unless (exists $arg{$h}){ | |
0ab8f81e JH |
47 | eval { |
48 | no warnings 'uninitialized'; | |
49 | binmode($h, ":encoding($name)"); | |
50 | }; | |
aae85ceb DK |
51 | } |
52 | } | |
53 | if ($@){ | |
3ef515df | 54 | require Carp; |
aae85ceb | 55 | Carp::croak($@); |
3ef515df | 56 | } |
3ef515df | 57 | } |
aae85ceb DK |
58 | }else{ |
59 | defined(${^ENCODING}) and undef ${^ENCODING}; | |
60 | eval { | |
61 | require Filter::Util::Call ; | |
62 | Filter::Util::Call->import ; | |
b2704119 JH |
63 | binmode(STDIN); |
64 | binmode(STDOUT); | |
aae85ceb DK |
65 | filter_add(sub{ |
66 | my $status; | |
67 | if (($status = filter_read()) > 0){ | |
68 | $_ = $enc->decode($_, 1); | |
69 | # warn $_; | |
70 | } | |
71 | $status ; | |
72 | }); | |
73 | }; | |
74 | # warn "Filter installed"; | |
3ef515df JH |
75 | } |
76 | return 1; # I doubt if we need it, though | |
77 | } | |
78 | ||
79 | sub unimport{ | |
80 | no warnings; | |
81 | undef ${^ENCODING}; | |
621b0f8d DK |
82 | if ($HAS_PERLIO){ |
83 | binmode(STDIN, ":raw"); | |
84 | binmode(STDOUT, ":raw"); | |
85 | }else{ | |
b2704119 JH |
86 | binmode(STDIN); |
87 | binmode(STDOUT); | |
621b0f8d | 88 | } |
aae85ceb DK |
89 | if ($INC{"Filter/Util/Call.pm"}){ |
90 | eval { filter_del() }; | |
91 | } | |
3ef515df JH |
92 | } |
93 | ||
94 | 1; | |
95 | __END__ | |
85982a32 | 96 | |
3ef515df JH |
97 | =pod |
98 | ||
99 | =head1 NAME | |
100 | ||
0ab8f81e | 101 | encoding - allows you to write your script in non-ascii or non-utf8 |
3ef515df JH |
102 | |
103 | =head1 SYNOPSIS | |
104 | ||
962111ca | 105 | use encoding "greek"; # Perl like Greek to you? |
3ef515df JH |
106 | use encoding "euc-jp"; # Jperl! |
107 | ||
962111ca | 108 | # or you can even do this if your shell supports your native encoding |
3ef515df | 109 | |
962111ca | 110 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
0ab8f81e | 111 | perl -Mencoding=euc-kr -e '...' # Or Korean? |
3ef515df | 112 | |
3ef515df JH |
113 | # more control |
114 | ||
962111ca | 115 | # A simple euc-cn => utf-8 converter |
6d1c0808 | 116 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
3ef515df JH |
117 | |
118 | # "no encoding;" supported (but not scoped!) | |
119 | no encoding; | |
120 | ||
aae85ceb DK |
121 | # an alternate way, Filter |
122 | use encoding "euc-jp", Filter=>1; | |
123 | use utf8; | |
124 | # now you can use kanji identifiers -- in euc-jp! | |
125 | ||
3ef515df JH |
126 | =head1 ABSTRACT |
127 | ||
962111ca JH |
128 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
129 | support. You could apply C<substr()> and regexes even to complex CJK | |
130 | characters -- so long as the script was written in UTF-8. But back | |
0ab8f81e JH |
131 | then, text editors that supported UTF-8 were still rare and many users |
132 | instead chose to write scripts in legacy encodings, giving up a whole | |
133 | new feature of Perl 5.6. | |
3ef515df | 134 | |
0ab8f81e | 135 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
962111ca JH |
136 | pragma, you can write your script in any encoding you like (so long |
137 | as the C<Encode> module supports it) and still enjoy Unicode support. | |
0ab8f81e | 138 | You can write code in EUC-JP as follows: |
3ef515df JH |
139 | |
140 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
141 | #<-char-><-char-> # 4 octets | |
142 | s/\bCamel\b/$Rakuda/; | |
143 | ||
144 | And with C<use encoding "euc-jp"> in effect, it is the same thing as | |
962111ca | 145 | the code in UTF-8: |
3ef515df | 146 | |
32b9ed1f | 147 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
3ef515df JH |
148 | s/\bCamel\b/$Rakuda/; |
149 | ||
962111ca | 150 | The B<encoding> pragma also modifies the filehandle disciplines of |
4b291ae6 | 151 | STDIN and STDOUT to the specified encoding. Therefore, |
3ef515df JH |
152 | |
153 | use encoding "euc-jp"; | |
154 | my $message = "Camel is the symbol of perl.\n"; | |
155 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
156 | $message =~ s/\bCamel\b/$Rakuda/; | |
157 | print $message; | |
158 | ||
962111ca JH |
159 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
160 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". | |
3ef515df | 161 | |
0ab8f81e | 162 | You can override this by giving extra arguments; see below. |
3ef515df JH |
163 | |
164 | =head1 USAGE | |
165 | ||
166 | =over 4 | |
167 | ||
168 | =item use encoding [I<ENCNAME>] ; | |
169 | ||
0ab8f81e JH |
170 | Sets the script encoding to I<ENCNAME>. Filehandle disciplines of |
171 | STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR | |
172 | will not be changed. | |
3ef515df JH |
173 | |
174 | If no encoding is specified, the environment variable L<PERL_ENCODING> | |
962111ca JH |
175 | is consulted. If no encoding can be found, the error C<Unknown encoding |
176 | 'I<ENCNAME>'> will be thrown. | |
3ef515df JH |
177 | |
178 | Note that non-STD file handles remain unaffected. Use C<use open> or | |
179 | C<binmode> to change disciplines of those. | |
180 | ||
aae85ceb | 181 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df | 182 | |
0ab8f81e | 183 | You can also individually set encodings of STDIN and STDOUT via the |
32b9ed1f A |
184 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
185 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding | |
aae85ceb | 186 | completely off. |
3ef515df JH |
187 | |
188 | =item no encoding; | |
189 | ||
0ab8f81e | 190 | Unsets the script encoding. The disciplines of STDIN, STDOUT are |
962111ca | 191 | reset to ":raw" (the default unprocessed raw stream of bytes). |
3ef515df JH |
192 | |
193 | =back | |
194 | ||
195 | =head1 CAVEATS | |
196 | ||
197 | =head2 NOT SCOPED | |
198 | ||
199 | The pragma is a per script, not a per block lexical. Only the last | |
621b0f8d DK |
200 | C<use encoding> or C<no encoding> matters, and it affects |
201 | B<the whole script>. However, the <no encoding> pragma is supported and | |
202 | B<use encoding> can appear as many times as you want in a given script. | |
203 | The multiple use of this pragma is discouraged. | |
204 | ||
205 | Because of this nature, the use of this pragma inside the module is | |
206 | strongly discouraged (because the influence of this pragma lasts not | |
207 | only for the module but the script that uses). But if you have to, | |
208 | make sure you say C<no encoding> at the end of the module so you | |
209 | contain the influence of the pragma within the module. | |
3ef515df JH |
210 | |
211 | =head2 DO NOT MIX MULTIPLE ENCODINGS | |
212 | ||
213 | Notice that only literals (string or regular expression) having only | |
214 | legacy code points are affected: if you mix data like this | |
215 | ||
216 | \xDF\x{100} | |
217 | ||
218 | the data is assumed to be in (Latin 1 and) Unicode, not in your native | |
219 | encoding. In other words, this will match in "greek": | |
220 | ||
221 | "\xDF" =~ /\x{3af}/ | |
222 | ||
223 | but this will not | |
224 | ||
225 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ | |
226 | ||
962111ca JH |
227 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
228 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL | |
229 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You | |
230 | should not be mixing your legacy data and Unicode in the same string. | |
3ef515df JH |
231 | |
232 | This pragma also affects encoding of the 0x80..0xFF code point range: | |
233 | normally characters in that range are left as eight-bit bytes (unless | |
234 | they are combined with characters with code points 0x100 or larger, | |
235 | in which case all characters need to become UTF-8 encoded), but if | |
236 | the C<encoding> pragma is present, even the 0x80..0xFF range always | |
237 | gets UTF-8 encoded. | |
238 | ||
239 | After all, the best thing about this pragma is that you don't have to | |
0ab8f81e JH |
240 | resort to \x{....} just to spell your name in a native encoding. |
241 | So feel free to put your strings in your encoding in quotes and | |
242 | regexes. | |
3ef515df | 243 | |
4b291ae6 DK |
244 | =head2 tr/// with ranges remain unaffected |
245 | ||
246 | The B<encoding> pragma works by decoding string literals in | |
247 | C<q//,qq//,qr//,qw///, qx//> and so forth. As of perl 5.8.0, this | |
248 | does not apply to C<tr///>. Therefore, | |
249 | ||
250 | use encoding 'euc-jp'; | |
251 | #.... | |
252 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; | |
253 | # -------- -------- -------- -------- | |
254 | ||
255 | Does not work as | |
256 | ||
257 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; | |
258 | ||
259 | =over | |
260 | ||
261 | =item Legend of characters above | |
262 | ||
263 | utf8 euc-jp charnames::viacode() | |
264 | ----------------------------------------- | |
265 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A | |
266 | \x{3093} \xA4\xF3 HIRAGANA LETTER N | |
267 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A | |
268 | \x{30f3} \xA5\xF3 KATAKANA LETTER N | |
269 | ||
270 | =back | |
271 | ||
272 | =head3 workaround to tr///; | |
273 | ||
274 | You can, however, achieve the same as simply as follows; | |
275 | ||
276 | use encoding 'euc-jp'; | |
277 | # .... | |
278 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; | |
279 | ||
280 | Note the C<tr//> expression is surronded by C<qq{}>. The idea behind | |
281 | is the same as classic idiom that makes C<tr///> 'interpolate'. | |
282 | ||
283 | tr/$from/$to/; # wrong! | |
284 | eval qq{ tr/$from/$to/ }; # workaround. | |
285 | ||
286 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so | |
287 | C<tr///> not being decoded was obviously against the will of Perl5 | |
288 | Porters. In future version of perl, this counter-intuitive behaviour | |
289 | of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary. | |
290 | ||
962111ca | 291 | =head1 Non-ASCII Identifiers and Filter option |
aae85ceb | 292 | |
962111ca | 293 | The magic of C<use encoding> is not applied to the names of |
32b9ed1f | 294 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
962111ca JH |
295 | is a single Han ideograph) work, you still need to write your script |
296 | in UTF-8 or use a source filter. | |
aae85ceb | 297 | |
0ab8f81e | 298 | In other words, the same restriction as with Jperl applies. |
aae85ceb | 299 | |
0ab8f81e | 300 | If you dare to experiment, however, you can try the Filter option. |
aae85ceb DK |
301 | |
302 | =over 4 | |
303 | ||
304 | =item use encoding I<ENCNAME> Filter=E<gt>1; | |
305 | ||
0ab8f81e | 306 | This turns the encoding pragma into a source filter. While the default |
aae85ceb | 307 | approach just decodes interpolated literals (in qq() and qr()), this |
0ab8f81e JH |
308 | will apply a source filter to the entire source code. In this case, |
309 | STDIN and STDOUT remain untouched. | |
aae85ceb DK |
310 | |
311 | =back | |
312 | ||
962111ca | 313 | What does this mean? Your source code behaves as if it is written in |
0ab8f81e JH |
314 | UTF-8. So even if your editor only supports Shift_JIS, for example, |
315 | you can still try examples in Chapter 15 of C<Programming Perl, 3rd | |
316 | Ed.>. For instance, you can use UTF-8 identifiers. | |
aae85ceb DK |
317 | |
318 | This option is significantly slower and (as of this writing) non-ASCII | |
319 | identifiers are not very stable WITHOUT this option and with the | |
320 | source code written in UTF-8. | |
321 | ||
962111ca JH |
322 | To make your script in legacy encoding work with minimum effort, |
323 | do not use Filter=E<gt>1. | |
aae85ceb | 324 | |
3ef515df JH |
325 | =head1 EXAMPLE - Greekperl |
326 | ||
327 | use encoding "iso 8859-7"; | |
328 | ||
0ab8f81e | 329 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
3ef515df JH |
330 | |
331 | $a = "\xDF"; | |
332 | $b = "\x{100}"; | |
333 | ||
334 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf | |
335 | ||
336 | $c = $a . $b; | |
337 | ||
338 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". | |
339 | ||
340 | # chr() is affected, and ... | |
341 | ||
342 | print "mega\n" if ord(chr(0xdf)) == 0x3af; | |
343 | ||
344 | # ... ord() is affected by the encoding pragma ... | |
345 | ||
346 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; | |
347 | ||
348 | # ... as are eq and cmp ... | |
349 | ||
350 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); | |
351 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; | |
352 | ||
353 | # ... but pack/unpack C are not affected, in case you still | |
0ab8f81e | 354 | # want to go back to your native encoding |
3ef515df JH |
355 | |
356 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; | |
357 | ||
358 | =head1 KNOWN PROBLEMS | |
359 | ||
0ab8f81e | 360 | For native multibyte encodings (either fixed or variable length), |
3ef515df | 361 | the current implementation of the regular expressions may introduce |
0ab8f81e | 362 | recoding errors for regular expression literals longer than 127 bytes. |
3ef515df JH |
363 | |
364 | The encoding pragma is not supported on EBCDIC platforms. | |
0ab8f81e JH |
365 | (Porters who are willing and able to remove this limitation are |
366 | welcome.) | |
3ef515df JH |
367 | |
368 | =head1 SEE ALSO | |
369 | ||
aae85ceb DK |
370 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
371 | ||
372 | Ch. 15 of C<Programming Perl (3rd Edition)> | |
373 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
374 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
3ef515df JH |
375 | |
376 | =cut |