Commit | Line | Data |
---|---|---|
3ef515df | 1 | package encoding; |
b2704119 | 2 | our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df JH |
3 | |
4 | use Encode; | |
046f36bf | 5 | use strict; |
3ef515df JH |
6 | |
7 | BEGIN { | |
8 | if (ord("A") == 193) { | |
9 | require Carp; | |
10 | Carp::croak "encoding pragma does not support EBCDIC platforms"; | |
11 | } | |
12 | } | |
13 | ||
b2704119 JH |
14 | our $HAS_PERLIO_ENCODING; |
15 | ||
16 | eval { require PerlIO::encoding; }; | |
17 | if ($@){ | |
18 | $HAS_PERLIO_ENCODING = 0; | |
19 | }else{ | |
20 | $HAS_PERLIO_ENCODING = 1; | |
21 | binmode(STDIN); | |
22 | } | |
23 | ||
3ef515df JH |
24 | sub import { |
25 | my $class = shift; | |
26 | my $name = shift; | |
27 | my %arg = @_; | |
28 | $name ||= $ENV{PERL_ENCODING}; | |
29 | ||
30 | my $enc = find_encoding($name); | |
31 | unless (defined $enc) { | |
32 | require Carp; | |
33 | Carp::croak "Unknown encoding '$name'"; | |
34 | } | |
aae85ceb DK |
35 | unless ($arg{Filter}){ |
36 | ${^ENCODING} = $enc; # this is all you need, actually. | |
b2704119 | 37 | $HAS_PERLIO_ENCODING or return 1; |
aae85ceb DK |
38 | for my $h (qw(STDIN STDOUT)){ |
39 | if ($arg{$h}){ | |
b2704119 | 40 | unless (defined find_encoding($arg{$h})) { |
aae85ceb DK |
41 | require Carp; |
42 | Carp::croak "Unknown encoding for $h, '$arg{$h}'"; | |
43 | } | |
44 | eval qq{ binmode($h, ":encoding($arg{$h})") }; | |
45 | }else{ | |
46 | unless (exists $arg{$h}){ | |
47 | eval qq{ binmode($h, ":encoding($name)") }; | |
48 | } | |
49 | } | |
50 | if ($@){ | |
3ef515df | 51 | require Carp; |
aae85ceb | 52 | Carp::croak($@); |
3ef515df | 53 | } |
3ef515df | 54 | } |
aae85ceb DK |
55 | }else{ |
56 | defined(${^ENCODING}) and undef ${^ENCODING}; | |
57 | eval { | |
58 | require Filter::Util::Call ; | |
59 | Filter::Util::Call->import ; | |
b2704119 JH |
60 | binmode(STDIN); |
61 | binmode(STDOUT); | |
aae85ceb DK |
62 | filter_add(sub{ |
63 | my $status; | |
64 | if (($status = filter_read()) > 0){ | |
65 | $_ = $enc->decode($_, 1); | |
66 | # warn $_; | |
67 | } | |
68 | $status ; | |
69 | }); | |
70 | }; | |
71 | # warn "Filter installed"; | |
3ef515df JH |
72 | } |
73 | return 1; # I doubt if we need it, though | |
74 | } | |
75 | ||
76 | sub unimport{ | |
77 | no warnings; | |
78 | undef ${^ENCODING}; | |
b2704119 JH |
79 | binmode(STDIN); |
80 | binmode(STDOUT); | |
aae85ceb DK |
81 | if ($INC{"Filter/Util/Call.pm"}){ |
82 | eval { filter_del() }; | |
83 | } | |
3ef515df JH |
84 | } |
85 | ||
86 | 1; | |
87 | __END__ | |
88 | =pod | |
89 | ||
90 | =head1 NAME | |
91 | ||
92 | encoding - allows you to write your script in non-asii or non-utf8 | |
93 | ||
94 | =head1 SYNOPSIS | |
95 | ||
96 | use encoding "euc-jp"; # Jperl! | |
97 | ||
98 | # or you can even do this if your shell supports euc-jp | |
99 | ||
100 | > perl -Mencoding=euc-jp -e '...' | |
101 | ||
102 | # or from the shebang line | |
103 | ||
104 | #!/your/path/to/perl -Mencoding=euc-jp | |
105 | ||
106 | # more control | |
107 | ||
108 | # A simple euc-jp => utf-8 converter | |
109 | use encoding "euc-jp", STDOUT => "utf8"; while(<>){print}; | |
110 | ||
111 | # "no encoding;" supported (but not scoped!) | |
112 | no encoding; | |
113 | ||
aae85ceb DK |
114 | # an alternate way, Filter |
115 | use encoding "euc-jp", Filter=>1; | |
116 | use utf8; | |
117 | # now you can use kanji identifiers -- in euc-jp! | |
118 | ||
3ef515df JH |
119 | =head1 ABSTRACT |
120 | ||
121 | Perl 5.6.0 has introduced Unicode support. You could apply | |
122 | C<substr()> and regexes even to complex CJK characters -- so long as | |
123 | the script was written in UTF-8. But back then text editors that | |
124 | support UTF-8 was still rare and many users rather chose to writer | |
125 | scripts in legacy encodings, given up whole new feature of Perl 5.6. | |
126 | ||
127 | With B<encoding> pragma, you can write your script in any encoding you like | |
128 | (so long as the C<Encode> module supports it) and still enjoy Unicode | |
129 | support. You can write a code in EUC-JP as follows; | |
130 | ||
131 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
132 | #<-char-><-char-> # 4 octets | |
133 | s/\bCamel\b/$Rakuda/; | |
134 | ||
135 | And with C<use encoding "euc-jp"> in effect, it is the same thing as | |
136 | the code in UTF-8 as follow. | |
137 | ||
138 | my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters | |
139 | s/\bCamel\b/$Rakuda/; | |
140 | ||
141 | The B<encoding> pragma also modifies the file handle disciplines of | |
142 | STDIN, STDOUT, and STDERR to the specified encoding. Therefore, | |
143 | ||
144 | use encoding "euc-jp"; | |
145 | my $message = "Camel is the symbol of perl.\n"; | |
146 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
147 | $message =~ s/\bCamel\b/$Rakuda/; | |
148 | print $message; | |
149 | ||
150 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not | |
151 | "\x{99F1}\x{99DD} is the symbol of perl.\n". | |
152 | ||
153 | You can override this by giving extra arguments. See below. | |
154 | ||
155 | =head1 USAGE | |
156 | ||
157 | =over 4 | |
158 | ||
159 | =item use encoding [I<ENCNAME>] ; | |
160 | ||
161 | Sets the script encoding to I<ENCNAME> and file handle disciplines of | |
f2a2953c JH |
162 | STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not |
163 | be changed. | |
3ef515df JH |
164 | |
165 | If no encoding is specified, the environment variable L<PERL_ENCODING> | |
166 | is consulted. If no encoding can be found, C<Unknown encoding 'I<ENCNAME>'> | |
167 | error will be thrown. | |
168 | ||
169 | Note that non-STD file handles remain unaffected. Use C<use open> or | |
170 | C<binmode> to change disciplines of those. | |
171 | ||
aae85ceb | 172 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df | 173 | |
aae85ceb DK |
174 | You can also individually set encodings of STDIN and STDOUT via |
175 | STDI<FH> =E<gt> I<ENCNAME_FH> form. In this case, you cannot omit the | |
176 | first I<ENCNAME>. C<STDI<FH> =E<gt> undef> turns IO transcoding | |
177 | completely off. | |
3ef515df JH |
178 | |
179 | =item no encoding; | |
180 | ||
f2a2953c JH |
181 | Unsets the script encoding and the disciplines of STDIN, STDOUT are |
182 | reset to ":raw". | |
3ef515df JH |
183 | |
184 | =back | |
185 | ||
186 | =head1 CAVEATS | |
187 | ||
188 | =head2 NOT SCOPED | |
189 | ||
190 | The pragma is a per script, not a per block lexical. Only the last | |
191 | C<use encoding> or C<matters, and it affects B<the whole script>. | |
192 | Though <no encoding> pragma is supported and C<use encoding> can | |
193 | appear as many times as you want in a given script, the multiple use | |
194 | of this pragma is discouraged. | |
195 | ||
196 | =head2 DO NOT MIX MULTIPLE ENCODINGS | |
197 | ||
198 | Notice that only literals (string or regular expression) having only | |
199 | legacy code points are affected: if you mix data like this | |
200 | ||
201 | \xDF\x{100} | |
202 | ||
203 | the data is assumed to be in (Latin 1 and) Unicode, not in your native | |
204 | encoding. In other words, this will match in "greek": | |
205 | ||
206 | "\xDF" =~ /\x{3af}/ | |
207 | ||
208 | but this will not | |
209 | ||
210 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ | |
211 | ||
212 | since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> | |
213 | because of the C<\x{100}> on the left. You should not be mixing your | |
214 | legacy data and Unicode in the same string. | |
215 | ||
216 | This pragma also affects encoding of the 0x80..0xFF code point range: | |
217 | normally characters in that range are left as eight-bit bytes (unless | |
218 | they are combined with characters with code points 0x100 or larger, | |
219 | in which case all characters need to become UTF-8 encoded), but if | |
220 | the C<encoding> pragma is present, even the 0x80..0xFF range always | |
221 | gets UTF-8 encoded. | |
222 | ||
223 | After all, the best thing about this pragma is that you don't have to | |
224 | resort to \x... just to spell your name in native encoding. So feel | |
225 | free to put your strings in your encoding in quotes and regexes. | |
226 | ||
aae85ceb DK |
227 | =head1 NON-ASCII Identifiers and Filter option |
228 | ||
229 | The magic of C<use encoding> is not applied to the names of identifiers. | |
230 | In order to make C<${"4eba"}++> ($man++, where man is a single ideograph) | |
231 | work, you still need to write your script in UTF-8 or use a source filter. | |
232 | ||
233 | In other words, the same restriction as Jperl applies. | |
234 | ||
235 | If you dare experiment, however, you can try Fitlter option. | |
236 | ||
237 | =over 4 | |
238 | ||
239 | =item use encoding I<ENCNAME> Filter=E<gt>1; | |
240 | ||
241 | This turns encoding pragma into source filter. While the default | |
242 | approach just decodes interpolated literals (in qq() and qr()), this | |
243 | will apply source filter to entire source code. In this case, STDIN | |
244 | and STDOUT remain untouched. | |
245 | ||
246 | =back | |
247 | ||
248 | What does this mean? Your source code behaves as if it is written | |
249 | in UTF-8. So even if your editor only supports Shift_JIS, for | |
250 | example. You can still try examples in Chapter 15 of | |
251 | C<Programming Perl, 3rd Ed.> For instance, you can use UTF-8 | |
252 | identifiers. | |
253 | ||
254 | This option is significantly slower and (as of this writing) non-ASCII | |
255 | identifiers are not very stable WITHOUT this option and with the | |
256 | source code written in UTF-8. | |
257 | ||
258 | To make your script in legacy encoding work with minimum effort, do | |
259 | not use Filter=E<gt>1 | |
260 | ||
261 | ||
3ef515df JH |
262 | =head1 EXAMPLE - Greekperl |
263 | ||
264 | use encoding "iso 8859-7"; | |
265 | ||
266 | # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. | |
267 | ||
268 | $a = "\xDF"; | |
269 | $b = "\x{100}"; | |
270 | ||
271 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf | |
272 | ||
273 | $c = $a . $b; | |
274 | ||
275 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". | |
276 | ||
277 | # chr() is affected, and ... | |
278 | ||
279 | print "mega\n" if ord(chr(0xdf)) == 0x3af; | |
280 | ||
281 | # ... ord() is affected by the encoding pragma ... | |
282 | ||
283 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; | |
284 | ||
285 | # ... as are eq and cmp ... | |
286 | ||
287 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); | |
288 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; | |
289 | ||
290 | # ... but pack/unpack C are not affected, in case you still | |
291 | # want back to your native encoding | |
292 | ||
293 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; | |
294 | ||
295 | =head1 KNOWN PROBLEMS | |
296 | ||
297 | For native multibyte encodings (either fixed or variable length) | |
298 | the current implementation of the regular expressions may introduce | |
299 | recoding errors for longer regular expression literals than 127 bytes. | |
300 | ||
301 | The encoding pragma is not supported on EBCDIC platforms. | |
302 | (Porters wanted.) | |
303 | ||
304 | =head1 SEE ALSO | |
305 | ||
aae85ceb DK |
306 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
307 | ||
308 | Ch. 15 of C<Programming Perl (3rd Edition)> | |
309 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
310 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
3ef515df JH |
311 | |
312 | =cut |