Commit | Line | Data |
---|---|---|
d1edabcf | 1 | package open; |
99ef548b | 2 | use warnings; |
ac27b0f5 | 3 | use Carp; |
16fe6d59 GS |
4 | $open::hint_bits = 0x20000; |
5 | ||
0c4f7ff0 | 6 | our $VERSION = '1.01'; |
b75c8c73 | 7 | |
58d53262 JH |
8 | my $locale_encoding; |
9 | ||
b178108d | 10 | sub in_locale { $^H & ($locale::hint_bits || 0)} |
58d53262 JH |
11 | |
12 | sub _get_locale_encoding { | |
13 | unless (defined $locale_encoding) { | |
276c9210 | 14 | # I18N::Langinfo isn't available everywhere |
9615f2ee BT |
15 | eval { |
16 | require I18N::Langinfo; | |
17 | I18N::Langinfo->import(qw(langinfo CODESET)); | |
ba6ce41c | 18 | $locale_encoding = langinfo(CODESET()); |
9615f2ee | 19 | }; |
11fc5dc3 | 20 | my $country_language; |
a4157ebb JH |
21 | |
22 | no warnings 'uninitialized'; | |
23 | ||
58d53262 | 24 | if (not $locale_encoding && in_locale()) { |
11fc5dc3 JH |
25 | if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { |
26 | ($country_language, $locale_encoding) = ($1, $2); | |
27 | } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { | |
28 | ($country_language, $locale_encoding) = ($1, $2); | |
58d53262 | 29 | } |
1e616cf5 JH |
30 | } elsif (not $locale_encoding) { |
31 | if ($ENV{LC_ALL} =~ /\butf-?8\b/i || | |
32 | $ENV{LANG} =~ /\butf-?8\b/i) { | |
33 | $locale_encoding = 'utf8'; | |
34 | } | |
35 | # Could do more heuristics based on the country and language | |
58d53262 JH |
36 | # parts of LC_ALL and LANG (the parts before the dot (if any)), |
37 | # since we have Locale::Country and Locale::Language available. | |
38 | # TODO: get a database of Language -> Encoding mappings | |
421e5dc3 JH |
39 | # (the Estonian database at http://www.eki.ee/letter/ |
40 | # would be excellent!) --jhi | |
58d53262 | 41 | } |
11fc5dc3 JH |
42 | if (defined $locale_encoding && |
43 | $locale_encoding eq 'euc' && | |
44 | defined $country_language) { | |
56fb2e42 | 45 | if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { |
1e616cf5 | 46 | $locale_encoding = 'euc-jp'; |
5a192dee | 47 | } elsif ($country_language =~ /^ko_KR|korean?$/i) { |
1e616cf5 | 48 | $locale_encoding = 'euc-kr'; |
a4157ebb JH |
49 | } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { |
50 | $locale_encoding = 'euc-cn'; | |
56fb2e42 | 51 | } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { |
011f8d22 | 52 | $locale_encoding = 'euc-tw'; |
11fc5dc3 JH |
53 | } |
54 | croak "Locale encoding 'euc' too ambiguous" | |
55 | if $locale_encoding eq 'euc'; | |
56 | } | |
58d53262 JH |
57 | } |
58 | } | |
59 | ||
16fe6d59 | 60 | sub import { |
dfebf958 NIS |
61 | my ($class,@args) = @_; |
62 | croak("`use open' needs explicit list of disciplines") unless @args; | |
b178108d | 63 | my $std; |
16fe6d59 | 64 | $^H |= $open::hint_bits; |
ba6ce41c | 65 | my ($in,$out) = split(/\0/,(${^OPEN} || "\0"), -1); |
dfebf958 NIS |
66 | while (@args) { |
67 | my $type = shift(@args); | |
1e616cf5 JH |
68 | my $dscp; |
69 | if ($type =~ /^:?(utf8|locale|encoding\(.+\))$/) { | |
70 | $type = 'IO'; | |
71 | $dscp = ":$1"; | |
b178108d JH |
72 | } elsif ($type eq ':std') { |
73 | $std = 1; | |
74 | next; | |
1e616cf5 | 75 | } else { |
725d232a | 76 | $dscp = shift(@args) || ''; |
1e616cf5 | 77 | } |
ac27b0f5 | 78 | my @val; |
1e616cf5 | 79 | foreach my $layer (split(/\s+/,$dscp)) { |
dfebf958 | 80 | $layer =~ s/^://; |
58d53262 JH |
81 | if ($layer eq 'locale') { |
82 | use Encode; | |
83 | _get_locale_encoding() | |
84 | unless defined $locale_encoding; | |
99ef548b | 85 | (warnings::warnif("layer", "Cannot figure out an encoding to use"), last) |
58d53262 | 86 | unless defined $locale_encoding; |
11fc5dc3 JH |
87 | if ($locale_encoding =~ /^utf-?8$/i) { |
88 | $layer = "utf8"; | |
89 | } else { | |
738b23dc | 90 | $layer = "encoding($locale_encoding)"; |
11fc5dc3 | 91 | } |
b178108d | 92 | $std = 1; |
97ed432b | 93 | } else { |
011f8d22 JH |
94 | my $target = $layer; # the layer name itself |
95 | $target =~ s/^(\w+)\(.+\)$/$1/; # strip parameters | |
96 | ||
97 | unless(PerlIO::Layer::->find($target)) { | |
99ef548b | 98 | warnings::warnif("layer", "Unknown discipline layer '$layer'"); |
97ed432b | 99 | } |
ac27b0f5 NIS |
100 | } |
101 | push(@val,":$layer"); | |
102 | if ($layer =~ /^(crlf|raw)$/) { | |
103 | $^H{"open_$type"} = $layer; | |
16fe6d59 | 104 | } |
ac27b0f5 NIS |
105 | } |
106 | if ($type eq 'IN') { | |
107 | $in = join(' ',@val); | |
108 | } | |
109 | elsif ($type eq 'OUT') { | |
110 | $out = join(' ',@val); | |
16fe6d59 | 111 | } |
1e616cf5 | 112 | elsif ($type eq 'IO') { |
f3b00462 JH |
113 | $in = $out = join(' ',@val); |
114 | } | |
16fe6d59 | 115 | else { |
ac27b0f5 | 116 | croak "Unknown discipline class '$type'"; |
16fe6d59 GS |
117 | } |
118 | } | |
a4157ebb | 119 | ${^OPEN} = join("\0",$in,$out) if $in or $out; |
b178108d JH |
120 | if ($std) { |
121 | if ($in) { | |
122 | if ($in =~ /:utf8\b/) { | |
123 | binmode(STDIN, ":utf8"); | |
124 | } elsif ($in =~ /(\w+\(.+\))/) { | |
125 | binmode(STDIN, ":$1"); | |
126 | } | |
127 | } | |
128 | if ($out) { | |
129 | if ($out =~ /:utf8\b/) { | |
130 | binmode(STDOUT, ":utf8"); | |
131 | binmode(STDERR, ":utf8"); | |
132 | } elsif ($out =~ /(\w+\(.+\))/) { | |
133 | binmode(STDOUT, ":$1"); | |
134 | binmode(STDERR, ":$1"); | |
135 | } | |
136 | } | |
137 | } | |
16fe6d59 GS |
138 | } |
139 | ||
140 | 1; | |
141 | __END__ | |
d1edabcf GS |
142 | |
143 | =head1 NAME | |
144 | ||
145 | open - perl pragma to set default disciplines for input and output | |
146 | ||
147 | =head1 SYNOPSIS | |
148 | ||
1e616cf5 JH |
149 | use open IN => ":crlf", OUT => ":raw"; |
150 | use open OUT => ':utf8'; | |
151 | use open IO => ":encoding(iso-8859-7)"; | |
152 | ||
153 | use open IO => ':locale'; | |
725d232a | 154 | |
1e616cf5 JH |
155 | use open ':utf8'; |
156 | use open ':locale'; | |
157 | use open ':encoding(iso-8859-7)'; | |
d1edabcf | 158 | |
b178108d JH |
159 | use open ':std'; |
160 | ||
d1edabcf GS |
161 | =head1 DESCRIPTION |
162 | ||
d151aa0e JH |
163 | Full-fledged support for I/O disciplines is now implemented provided |
164 | Perl is configured to use PerlIO as its IO system (which is now the | |
165 | default). | |
16fe6d59 | 166 | |
7d3b96bb NIS |
167 | The C<open> pragma serves as one of the interfaces to declare default |
168 | "layers" (aka disciplines) for all I/O. | |
169 | ||
170 | The C<open> pragma is used to declare one or more default layers for | |
d151aa0e JH |
171 | I/O operations. Any open(), readpipe() (aka qx//) and similar |
172 | operators found within the lexical scope of this pragma will use the | |
173 | declared defaults. | |
7d3b96bb | 174 | |
1e616cf5 | 175 | With the C<IN> subpragma you can declare the default layers |
d8d29d4f | 176 | of input streams, and with the C<OUT> subpragma you can declare |
1e616cf5 JH |
177 | the default layers of output streams. With the C<IO> subpragma |
178 | you can control both input and output streams simultaneously. | |
179 | ||
180 | If you have a legacy encoding, you can use the C<:encoding(...)> tag. | |
181 | ||
182 | if you want to set your encoding disciplines based on your | |
183 | locale environment variables, you can use the C<:locale> tag. | |
184 | For example: | |
185 | ||
186 | $ENV{LANG} = 'ru_RU.KOI8-R'; | |
dbd62f41 JH |
187 | # the :locale will probe the locale environment variables like LANG |
188 | use open OUT => ':locale'; | |
1e616cf5 | 189 | open(O, ">koi8"); |
23bcb45a | 190 | print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 |
1e616cf5 JH |
191 | close O; |
192 | open(I, "<koi8"); | |
23bcb45a | 193 | printf "%#x\n", ord(<I>), "\n"; # this should print 0xc1 |
1e616cf5 JH |
194 | close I; |
195 | ||
196 | These are equivalent | |
197 | ||
198 | use open ':utf8'; | |
199 | use open IO => ':utf8'; | |
200 | ||
201 | as are these | |
202 | ||
203 | use open ':locale'; | |
204 | use open IO => ':locale'; | |
205 | ||
206 | and these | |
207 | ||
208 | use open ':encoding(iso-8859-7)'; | |
209 | use open IO => ':encoding(iso-8859-7)'; | |
210 | ||
b5d8778e JH |
211 | The matching of encoding names is loose: case does not matter, and |
212 | many encodings have several aliases. See L<Encode::Supported> for | |
213 | details and the list of supported locales. | |
214 | ||
215 | Note that C<:utf8> discipline must always be specified exactly like | |
216 | that, it is not subject to the loose matching of encoding names. | |
217 | ||
d151aa0e JH |
218 | When open() is given an explicit list of layers they are appended to |
219 | the list declared using this pragma. | |
7d3b96bb | 220 | |
b178108d JH |
221 | The C<:std> subpragma on its own has no effect, but if combined with |
222 | the C<:utf8> or C<:encoding> subpragmas, it converts the standard | |
223 | filehandles (STDIN, STDOUT, STDERR) to comply with encoding selected | |
224 | for input/output handles. For example, if both input and out are | |
225 | chosen to be C<:utf8>, a C<:std> will mean that STDIN, STDOUT, and | |
226 | STDERR are also in C<:utf8>. On the other hand, if only output is | |
fb80c70c | 227 | chosen to be in C<< :encoding(koi8r) >>, a C<:std> will cause only the |
b178108d JH |
228 | STDOUT and STDERR to be in C<koi8r>. The C<:locale> subpragma |
229 | implicitly turns on C<:std>. | |
230 | ||
ba9a69eb JH |
231 | The logic of C<:locale> is as follows: |
232 | ||
233 | =over 4 | |
234 | ||
235 | =item 1. | |
236 | ||
237 | If the platform supports the langinfo(CODESET) interface, the codeset | |
238 | returned is used as the default encoding for the open pragma. | |
239 | ||
240 | =item 2. | |
241 | ||
242 | If 1. didn't work but we are under the locale pragma, the environment | |
243 | variables LC_ALL and LANG (in that order) are matched for encodings | |
244 | (the part after C<.>, if any), and if any found, that is used | |
245 | as the default encoding for the open pragma. | |
246 | ||
247 | =item 3. | |
248 | ||
249 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG | |
250 | (in that order) are matched for anything looking like UTF-8, and if | |
251 | any found, C<:utf8> is used as the default encoding for the open | |
252 | pragma. | |
253 | ||
254 | =back | |
255 | ||
b310b053 JH |
256 | If your locale environment variables (LANGUAGE, LC_ALL, LC_CTYPE, LANG) |
257 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), | |
258 | the default encoding of your STDIN, STDOUT, and STDERR, and of | |
259 | B<any subsequent file open>, is UTF-8. | |
260 | ||
7d3b96bb NIS |
261 | Directory handles may also support disciplines in future. |
262 | ||
263 | =head1 NONPERLIO FUNCTIONALITY | |
264 | ||
d151aa0e JH |
265 | If Perl is not built to use PerlIO as its IO system then only the two |
266 | pseudo-disciplines ":raw" and ":crlf" are available. | |
16fe6d59 GS |
267 | |
268 | The ":raw" discipline corresponds to "binary mode" and the ":crlf" | |
269 | discipline corresponds to "text mode" on platforms that distinguish | |
270 | between the two modes when opening files (which is many DOS-like | |
d151aa0e JH |
271 | platforms, including Windows). These two disciplines are no-ops on |
272 | platforms where binmode() is a no-op, but perform their functions | |
273 | everywhere if PerlIO is enabled. | |
7d3b96bb NIS |
274 | |
275 | =head1 IMPLEMENTATION DETAILS | |
d1edabcf | 276 | |
f3b00462 JH |
277 | There is a class method in C<PerlIO::Layer> C<find> which is |
278 | implemented as XS code. It is called by C<import> to validate the | |
279 | layers: | |
0c4f7ff0 NIS |
280 | |
281 | PerlIO::Layer::->find("perlio") | |
282 | ||
f3b00462 JH |
283 | The return value (if defined) is a Perl object, of class |
284 | C<PerlIO::Layer> which is created by the C code in F<perlio.c>. As | |
285 | yet there is nothing useful you can do with the object at the perl | |
286 | level. | |
16fe6d59 | 287 | |
d1edabcf GS |
288 | =head1 SEE ALSO |
289 | ||
1768d7eb JH |
290 | L<perlfunc/"binmode">, L<perlfunc/"open">, L<perlunicode>, L<PerlIO>, |
291 | L<encoding> | |
d1edabcf GS |
292 | |
293 | =cut |