Commit | Line | Data |
---|---|---|
3ef515df JH |
1 | package encoding; |
2 | our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
3 | ||
4 | use Encode; | |
5 | ||
6 | BEGIN { | |
7 | if (ord("A") == 193) { | |
8 | require Carp; | |
9 | Carp::croak "encoding pragma does not support EBCDIC platforms"; | |
10 | } | |
11 | } | |
12 | ||
13 | sub import { | |
14 | my $class = shift; | |
15 | my $name = shift; | |
16 | my %arg = @_; | |
17 | $name ||= $ENV{PERL_ENCODING}; | |
18 | ||
19 | my $enc = find_encoding($name); | |
20 | unless (defined $enc) { | |
21 | require Carp; | |
22 | Carp::croak "Unknown encoding '$name'"; | |
23 | } | |
24 | ${^ENCODING} = $enc; # this is all you need, actually. | |
25 | ||
26 | # $_OPEN_ORIG = ${^OPEN}; | |
27 | for my $h (qw(STDIN STDOUT STDERR)){ | |
28 | if ($arg{$h}){ | |
29 | unless (defined find_encoding($name)) { | |
30 | require Carp; | |
31 | Carp::croak "Unknown encoding for $fhname, '$arg{$h}'"; | |
32 | } | |
33 | eval qq{ binmode($h, ":encoding($arg{h})") }; | |
34 | }else{ | |
35 | eval qq{ binmode($h, ":encoding($name)") }; | |
36 | } | |
37 | if ($@){ | |
38 | require Carp; | |
39 | Carp::croak($@); | |
40 | } | |
41 | } | |
42 | return 1; # I doubt if we need it, though | |
43 | } | |
44 | ||
45 | sub unimport{ | |
46 | no warnings; | |
47 | undef ${^ENCODING}; | |
48 | binmode(STDIN, ":raw"); | |
49 | binmode(STDOUT, ":raw"); | |
50 | binmode(STDERR, ":raw"); | |
51 | } | |
52 | ||
53 | 1; | |
54 | __END__ | |
55 | =pod | |
56 | ||
57 | =head1 NAME | |
58 | ||
59 | encoding - allows you to write your script in non-asii or non-utf8 | |
60 | ||
61 | =head1 SYNOPSIS | |
62 | ||
63 | use encoding "euc-jp"; # Jperl! | |
64 | ||
65 | # or you can even do this if your shell supports euc-jp | |
66 | ||
67 | > perl -Mencoding=euc-jp -e '...' | |
68 | ||
69 | # or from the shebang line | |
70 | ||
71 | #!/your/path/to/perl -Mencoding=euc-jp | |
72 | ||
73 | # more control | |
74 | ||
75 | # A simple euc-jp => utf-8 converter | |
76 | use encoding "euc-jp", STDOUT => "utf8"; while(<>){print}; | |
77 | ||
78 | # "no encoding;" supported (but not scoped!) | |
79 | no encoding; | |
80 | ||
81 | =head1 ABSTRACT | |
82 | ||
83 | Perl 5.6.0 has introduced Unicode support. You could apply | |
84 | C<substr()> and regexes even to complex CJK characters -- so long as | |
85 | the script was written in UTF-8. But back then text editors that | |
86 | support UTF-8 was still rare and many users rather chose to writer | |
87 | scripts in legacy encodings, given up whole new feature of Perl 5.6. | |
88 | ||
89 | With B<encoding> pragma, you can write your script in any encoding you like | |
90 | (so long as the C<Encode> module supports it) and still enjoy Unicode | |
91 | support. You can write a code in EUC-JP as follows; | |
92 | ||
93 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
94 | #<-char-><-char-> # 4 octets | |
95 | s/\bCamel\b/$Rakuda/; | |
96 | ||
97 | And with C<use encoding "euc-jp"> in effect, it is the same thing as | |
98 | the code in UTF-8 as follow. | |
99 | ||
100 | my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters | |
101 | s/\bCamel\b/$Rakuda/; | |
102 | ||
103 | The B<encoding> pragma also modifies the file handle disciplines of | |
104 | STDIN, STDOUT, and STDERR to the specified encoding. Therefore, | |
105 | ||
106 | use encoding "euc-jp"; | |
107 | my $message = "Camel is the symbol of perl.\n"; | |
108 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
109 | $message =~ s/\bCamel\b/$Rakuda/; | |
110 | print $message; | |
111 | ||
112 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not | |
113 | "\x{99F1}\x{99DD} is the symbol of perl.\n". | |
114 | ||
115 | You can override this by giving extra arguments. See below. | |
116 | ||
117 | =head1 USAGE | |
118 | ||
119 | =over 4 | |
120 | ||
121 | =item use encoding [I<ENCNAME>] ; | |
122 | ||
123 | Sets the script encoding to I<ENCNAME> and file handle disciplines of | |
124 | STDIN, STDOUT, and STDERR are set to ":encoding(I<ENCNAME>)". | |
125 | ||
126 | If no encoding is specified, the environment variable L<PERL_ENCODING> | |
127 | is consulted. If no encoding can be found, C<Unknown encoding 'I<ENCNAME>'> | |
128 | error will be thrown. | |
129 | ||
130 | Note that non-STD file handles remain unaffected. Use C<use open> or | |
131 | C<binmode> to change disciplines of those. | |
132 | ||
133 | =item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ; | |
134 | ||
135 | You can also individually set encodings of STDIN, STDOUT, and STDERR | |
136 | via STDI<FH> => I<ENCNAME_FH> form. In this case, you cannot omit the | |
137 | first I<ENCNAME>. | |
138 | ||
139 | =item no encoding; | |
140 | ||
141 | Unsets the script encoding and the disciplines of STDIN, STDOUT, and | |
142 | STDERR are reset to ":raw". | |
143 | ||
144 | =back | |
145 | ||
146 | =head1 CAVEATS | |
147 | ||
148 | =head2 NOT SCOPED | |
149 | ||
150 | The pragma is a per script, not a per block lexical. Only the last | |
151 | C<use encoding> or C<matters, and it affects B<the whole script>. | |
152 | Though <no encoding> pragma is supported and C<use encoding> can | |
153 | appear as many times as you want in a given script, the multiple use | |
154 | of this pragma is discouraged. | |
155 | ||
156 | =head2 DO NOT MIX MULTIPLE ENCODINGS | |
157 | ||
158 | Notice that only literals (string or regular expression) having only | |
159 | legacy code points are affected: if you mix data like this | |
160 | ||
161 | \xDF\x{100} | |
162 | ||
163 | the data is assumed to be in (Latin 1 and) Unicode, not in your native | |
164 | encoding. In other words, this will match in "greek": | |
165 | ||
166 | "\xDF" =~ /\x{3af}/ | |
167 | ||
168 | but this will not | |
169 | ||
170 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ | |
171 | ||
172 | since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> | |
173 | because of the C<\x{100}> on the left. You should not be mixing your | |
174 | legacy data and Unicode in the same string. | |
175 | ||
176 | This pragma also affects encoding of the 0x80..0xFF code point range: | |
177 | normally characters in that range are left as eight-bit bytes (unless | |
178 | they are combined with characters with code points 0x100 or larger, | |
179 | in which case all characters need to become UTF-8 encoded), but if | |
180 | the C<encoding> pragma is present, even the 0x80..0xFF range always | |
181 | gets UTF-8 encoded. | |
182 | ||
183 | After all, the best thing about this pragma is that you don't have to | |
184 | resort to \x... just to spell your name in native encoding. So feel | |
185 | free to put your strings in your encoding in quotes and regexes. | |
186 | ||
187 | =head1 EXAMPLE - Greekperl | |
188 | ||
189 | use encoding "iso 8859-7"; | |
190 | ||
191 | # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. | |
192 | ||
193 | $a = "\xDF"; | |
194 | $b = "\x{100}"; | |
195 | ||
196 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf | |
197 | ||
198 | $c = $a . $b; | |
199 | ||
200 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". | |
201 | ||
202 | # chr() is affected, and ... | |
203 | ||
204 | print "mega\n" if ord(chr(0xdf)) == 0x3af; | |
205 | ||
206 | # ... ord() is affected by the encoding pragma ... | |
207 | ||
208 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; | |
209 | ||
210 | # ... as are eq and cmp ... | |
211 | ||
212 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); | |
213 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; | |
214 | ||
215 | # ... but pack/unpack C are not affected, in case you still | |
216 | # want back to your native encoding | |
217 | ||
218 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; | |
219 | ||
220 | =head1 KNOWN PROBLEMS | |
221 | ||
222 | For native multibyte encodings (either fixed or variable length) | |
223 | the current implementation of the regular expressions may introduce | |
224 | recoding errors for longer regular expression literals than 127 bytes. | |
225 | ||
226 | The encoding pragma is not supported on EBCDIC platforms. | |
227 | (Porters wanted.) | |
228 | ||
229 | =head1 SEE ALSO | |
230 | ||
231 | L<perlunicode>, L<Encode>, L<open> | |
232 | ||
233 | =cut |