| 1 | package open; |
| 2 | use warnings; |
| 3 | use Carp; |
| 4 | $open::hint_bits = 0x20000; # HINT_LOCALIZE_HH |
| 5 | |
| 6 | our $VERSION = '1.03'; |
| 7 | |
| 8 | my $locale_encoding; |
| 9 | |
| 10 | sub in_locale { $^H & ($locale::hint_bits || 0)} |
| 11 | |
| 12 | sub _get_locale_encoding { |
| 13 | unless (defined $locale_encoding) { |
| 14 | # I18N::Langinfo isn't available everywhere |
| 15 | eval { |
| 16 | require I18N::Langinfo; |
| 17 | I18N::Langinfo->import(qw(langinfo CODESET)); |
| 18 | $locale_encoding = langinfo(CODESET()); |
| 19 | }; |
| 20 | my $country_language; |
| 21 | |
| 22 | no warnings 'uninitialized'; |
| 23 | |
| 24 | if (not $locale_encoding && in_locale()) { |
| 25 | if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { |
| 26 | ($country_language, $locale_encoding) = ($1, $2); |
| 27 | } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { |
| 28 | ($country_language, $locale_encoding) = ($1, $2); |
| 29 | } |
| 30 | # LANGUAGE affects only LC_MESSAGES only on glibc |
| 31 | } elsif (not $locale_encoding) { |
| 32 | if ($ENV{LC_ALL} =~ /\butf-?8\b/i || |
| 33 | $ENV{LANG} =~ /\butf-?8\b/i) { |
| 34 | $locale_encoding = 'utf8'; |
| 35 | } |
| 36 | # Could do more heuristics based on the country and language |
| 37 | # parts of LC_ALL and LANG (the parts before the dot (if any)), |
| 38 | # since we have Locale::Country and Locale::Language available. |
| 39 | # TODO: get a database of Language -> Encoding mappings |
| 40 | # (the Estonian database at http://www.eki.ee/letter/ |
| 41 | # would be excellent!) --jhi |
| 42 | } |
| 43 | if (defined $locale_encoding && |
| 44 | lc($locale_encoding) eq 'euc' && |
| 45 | defined $country_language) { |
| 46 | if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { |
| 47 | $locale_encoding = 'euc-jp'; |
| 48 | } elsif ($country_language =~ /^ko_KR|korean?$/i) { |
| 49 | $locale_encoding = 'euc-kr'; |
| 50 | } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { |
| 51 | $locale_encoding = 'euc-cn'; |
| 52 | } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { |
| 53 | $locale_encoding = 'euc-tw'; |
| 54 | } else { |
| 55 | croak "Locale encoding 'euc' too ambiguous"; |
| 56 | } |
| 57 | } |
| 58 | } |
| 59 | } |
| 60 | |
| 61 | sub import { |
| 62 | my ($class,@args) = @_; |
| 63 | croak("`use open' needs explicit list of PerlIO layers") unless @args; |
| 64 | my $std; |
| 65 | $^H |= $open::hint_bits; |
| 66 | my ($in,$out) = split(/\0/,(${^OPEN} || "\0"), -1); |
| 67 | while (@args) { |
| 68 | my $type = shift(@args); |
| 69 | my $dscp; |
| 70 | if ($type =~ /^:?(utf8|locale|encoding\(.+\))$/) { |
| 71 | $type = 'IO'; |
| 72 | $dscp = ":$1"; |
| 73 | } elsif ($type eq ':std') { |
| 74 | $std = 1; |
| 75 | next; |
| 76 | } else { |
| 77 | $dscp = shift(@args) || ''; |
| 78 | } |
| 79 | my @val; |
| 80 | foreach my $layer (split(/\s+/,$dscp)) { |
| 81 | $layer =~ s/^://; |
| 82 | if ($layer eq 'locale') { |
| 83 | require Encode; |
| 84 | _get_locale_encoding() |
| 85 | unless defined $locale_encoding; |
| 86 | (warnings::warnif("layer", "Cannot figure out an encoding to use"), last) |
| 87 | unless defined $locale_encoding; |
| 88 | if ($locale_encoding =~ /^utf-?8$/i) { |
| 89 | $layer = "utf8"; |
| 90 | } else { |
| 91 | $layer = "encoding($locale_encoding)"; |
| 92 | } |
| 93 | $std = 1; |
| 94 | } else { |
| 95 | my $target = $layer; # the layer name itself |
| 96 | $target =~ s/^(\w+)\(.+\)$/$1/; # strip parameters |
| 97 | |
| 98 | unless(PerlIO::Layer::->find($target,1)) { |
| 99 | warnings::warnif("layer", "Unknown PerlIO layer '$target'"); |
| 100 | } |
| 101 | } |
| 102 | push(@val,":$layer"); |
| 103 | if ($layer =~ /^(crlf|raw)$/) { |
| 104 | $^H{"open_$type"} = $layer; |
| 105 | } |
| 106 | } |
| 107 | if ($type eq 'IN') { |
| 108 | $in = join(' ',@val); |
| 109 | } |
| 110 | elsif ($type eq 'OUT') { |
| 111 | $out = join(' ',@val); |
| 112 | } |
| 113 | elsif ($type eq 'IO') { |
| 114 | $in = $out = join(' ',@val); |
| 115 | } |
| 116 | else { |
| 117 | croak "Unknown PerlIO layer class '$type'"; |
| 118 | } |
| 119 | } |
| 120 | ${^OPEN} = join("\0",$in,$out) if $in or $out; |
| 121 | if ($std) { |
| 122 | if ($in) { |
| 123 | if ($in =~ /:utf8\b/) { |
| 124 | binmode(STDIN, ":utf8"); |
| 125 | } elsif ($in =~ /(\w+\(.+\))/) { |
| 126 | binmode(STDIN, ":$1"); |
| 127 | } |
| 128 | } |
| 129 | if ($out) { |
| 130 | if ($out =~ /:utf8\b/) { |
| 131 | binmode(STDOUT, ":utf8"); |
| 132 | binmode(STDERR, ":utf8"); |
| 133 | } elsif ($out =~ /(\w+\(.+\))/) { |
| 134 | binmode(STDOUT, ":$1"); |
| 135 | binmode(STDERR, ":$1"); |
| 136 | } |
| 137 | } |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | 1; |
| 142 | __END__ |
| 143 | |
| 144 | =head1 NAME |
| 145 | |
| 146 | open - perl pragma to set default PerlIO layers for input and output |
| 147 | |
| 148 | =head1 SYNOPSIS |
| 149 | |
| 150 | use open IN => ":crlf", OUT => ":bytes"; |
| 151 | use open OUT => ':utf8'; |
| 152 | use open IO => ":encoding(iso-8859-7)"; |
| 153 | |
| 154 | use open IO => ':locale'; |
| 155 | |
| 156 | use open ':utf8'; |
| 157 | use open ':locale'; |
| 158 | use open ':encoding(iso-8859-7)'; |
| 159 | |
| 160 | use open ':std'; |
| 161 | |
| 162 | =head1 DESCRIPTION |
| 163 | |
| 164 | Full-fledged support for I/O layers is now implemented provided |
| 165 | Perl is configured to use PerlIO as its IO system (which is now the |
| 166 | default). |
| 167 | |
| 168 | The C<open> pragma serves as one of the interfaces to declare default |
| 169 | "layers" (also known as "disciplines") for all I/O. Any two-argument |
| 170 | open(), readpipe() (aka qx//) and similar operators found within the |
| 171 | lexical scope of this pragma will use the declared defaults. |
| 172 | Three-argument opens are not affected by this pragma since there you |
| 173 | (can) explicitly specify the layers and are supposed to know what you |
| 174 | are doing. |
| 175 | |
| 176 | With the C<IN> subpragma you can declare the default layers |
| 177 | of input streams, and with the C<OUT> subpragma you can declare |
| 178 | the default layers of output streams. With the C<IO> subpragma |
| 179 | you can control both input and output streams simultaneously. |
| 180 | |
| 181 | If you have a legacy encoding, you can use the C<:encoding(...)> tag. |
| 182 | |
| 183 | if you want to set your encoding layers based on your |
| 184 | locale environment variables, you can use the C<:locale> tag. |
| 185 | For example: |
| 186 | |
| 187 | $ENV{LANG} = 'ru_RU.KOI8-R'; |
| 188 | # the :locale will probe the locale environment variables like LANG |
| 189 | use open OUT => ':locale'; |
| 190 | open(O, ">koi8"); |
| 191 | print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 |
| 192 | close O; |
| 193 | open(I, "<koi8"); |
| 194 | printf "%#x\n", ord(<I>), "\n"; # this should print 0xc1 |
| 195 | close I; |
| 196 | |
| 197 | These are equivalent |
| 198 | |
| 199 | use open ':utf8'; |
| 200 | use open IO => ':utf8'; |
| 201 | |
| 202 | as are these |
| 203 | |
| 204 | use open ':locale'; |
| 205 | use open IO => ':locale'; |
| 206 | |
| 207 | and these |
| 208 | |
| 209 | use open ':encoding(iso-8859-7)'; |
| 210 | use open IO => ':encoding(iso-8859-7)'; |
| 211 | |
| 212 | The matching of encoding names is loose: case does not matter, and |
| 213 | many encodings have several aliases. See L<Encode::Supported> for |
| 214 | details and the list of supported locales. |
| 215 | |
| 216 | Note that C<:utf8> PerlIO layer must always be specified exactly like |
| 217 | that, it is not subject to the loose matching of encoding names. |
| 218 | |
| 219 | When open() is given an explicit list of layers they are appended to |
| 220 | the list declared using this pragma. |
| 221 | |
| 222 | The C<:std> subpragma on its own has no effect, but if combined with |
| 223 | the C<:utf8> or C<:encoding> subpragmas, it converts the standard |
| 224 | filehandles (STDIN, STDOUT, STDERR) to comply with encoding selected |
| 225 | for input/output handles. For example, if both input and out are |
| 226 | chosen to be C<:utf8>, a C<:std> will mean that STDIN, STDOUT, and |
| 227 | STDERR are also in C<:utf8>. On the other hand, if only output is |
| 228 | chosen to be in C<< :encoding(koi8r) >>, a C<:std> will cause only the |
| 229 | STDOUT and STDERR to be in C<koi8r>. The C<:locale> subpragma |
| 230 | implicitly turns on C<:std>. |
| 231 | |
| 232 | The logic of C<:locale> is as follows: |
| 233 | |
| 234 | =over 4 |
| 235 | |
| 236 | =item 1. |
| 237 | |
| 238 | If the platform supports the langinfo(CODESET) interface, the codeset |
| 239 | returned is used as the default encoding for the open pragma. |
| 240 | |
| 241 | =item 2. |
| 242 | |
| 243 | If 1. didn't work but we are under the locale pragma, the environment |
| 244 | variables LC_ALL and LANG (in that order) are matched for encodings |
| 245 | (the part after C<.>, if any), and if any found, that is used |
| 246 | as the default encoding for the open pragma. |
| 247 | |
| 248 | =item 3. |
| 249 | |
| 250 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG |
| 251 | (in that order) are matched for anything looking like UTF-8, and if |
| 252 | any found, C<:utf8> is used as the default encoding for the open |
| 253 | pragma. |
| 254 | |
| 255 | =back |
| 256 | |
| 257 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG) |
| 258 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), |
| 259 | the default encoding of your STDIN, STDOUT, and STDERR, and of |
| 260 | B<any subsequent file open>, is UTF-8. |
| 261 | |
| 262 | Directory handles may also support PerlIO layers in the future. |
| 263 | |
| 264 | =head1 NONPERLIO FUNCTIONALITY |
| 265 | |
| 266 | If Perl is not built to use PerlIO as its IO system then only the two |
| 267 | pseudo-layers C<:bytes> and C<:crlf> are available. |
| 268 | |
| 269 | The C<:bytes> layer corresponds to "binary mode" and the C<:crlf> |
| 270 | layer corresponds to "text mode" on platforms that distinguish |
| 271 | between the two modes when opening files (which is many DOS-like |
| 272 | platforms, including Windows). These two layers are no-ops on |
| 273 | platforms where binmode() is a no-op, but perform their functions |
| 274 | everywhere if PerlIO is enabled. |
| 275 | |
| 276 | =head1 IMPLEMENTATION DETAILS |
| 277 | |
| 278 | There is a class method in C<PerlIO::Layer> C<find> which is |
| 279 | implemented as XS code. It is called by C<import> to validate the |
| 280 | layers: |
| 281 | |
| 282 | PerlIO::Layer::->find("perlio") |
| 283 | |
| 284 | The return value (if defined) is a Perl object, of class |
| 285 | C<PerlIO::Layer> which is created by the C code in F<perlio.c>. As |
| 286 | yet there is nothing useful you can do with the object at the perl |
| 287 | level. |
| 288 | |
| 289 | =head1 SEE ALSO |
| 290 | |
| 291 | L<perlfunc/"binmode">, L<perlfunc/"open">, L<perlunicode>, L<PerlIO>, |
| 292 | L<encoding> |
| 293 | |
| 294 | =cut |