ext/Encode/encoding.pm

   1 # $Id: encoding.pm,v 1.47 2003/08/20 11:15:31 dankogai Exp dankogai $
   # Package declaration and version stamp.  $VERSION is derived from the RCS
   # "Revision" keyword: every run of digits is extracted, the first is printed
   # followed by a dot, and each remaining one is zero-padded to two digits
   # (so revision 1.47 yields "1.47").  Keep the assignment on a single line.
   2 package encoding;
   3 our $VERSION = do { my @r = (q$Revision: 1.47 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   4
   5 use Encode;
   6 use strict;
   # Compile-time debug switch.  With an empty prototype and a constant body
   # perl can inline DEBUG, so "DEBUG and warn ..." statements cost nothing
   # while it returns 0.  Change the 0 to 1 to enable the diagnostics.
   7 sub DEBUG () { 0 }
   8
   # Refuse to load on EBCDIC platforms: there "A" has code point 193 instead
   # of the ASCII value 65, and the pragma does not support them.
   BEGIN {
       my $code_point_of_A = ord("A");
       unless ($code_point_of_A != 193) {
           require Carp;
           Carp::croak("encoding pragma does not support EBCDIC platforms");
       }
   }
  15
  # $HAS_PERLIO starts out as 0 and becomes the result of the version
  # comparison only when PerlIO::encoding could be loaded; a failed load
  # leaves it at 0 (and leaves the load error in $@).
  our $HAS_PERLIO = 0;
  eval { require PerlIO::encoding };
  $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02) unless $@;
  21
  # _exception(NAME) - decide whether NAME must NOT be stored in ${^ENCODING}.
  #
  # Returns 1 only when all of the following hold, and 0 otherwise:
  #   * the running perl is not newer than 5.8.0 ($] <= 5.008),
  #   * NAME is one of the listed UTF/UCS encoding names,
  #   * $Config{perl_patchlevel} is false (i.e. not a maintperl build).
  sub _exception {
      my ($enc_name) = @_;

      # 5.8.1 or higher: never an exception.
      return 0 if $] > 5.008;

      # Only the UTF family is affected.
      my %is_utf;
      $is_utf{$_} = 1
          for qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
                 UTF-32 UTF-32BE UTF-32LE);
      return 0 unless $is_utf{$enc_name};

      # A patched (maintperl) 5.8.0 is fine as well.
      require Config;
      Config->import();
      our %Config;
      return 0 if $Config{perl_patchlevel};
      return 1;
  }
  32
  # import - implements "use encoding ENCNAME [, OPTION => VALUE ...]".
  #
  # Arguments (after the class name):
  #   ENCNAME         script encoding; when false, $ENV{PERL_ENCODING} is used
  #   Filter => TRUE  decode the whole source through a source filter (and turn
  #                   on the utf8 hint bits) instead of setting ${^ENCODING}
  #   STDIN  => ENC   encoding for that handle's PerlIO layer.  When the key is
  #   STDOUT => ENC   absent ENCNAME is used; when it is present with a false
  #                   value the handle is left untouched.
  #
  # Croaks with "Unknown encoding ..." when ENCNAME or a per-handle encoding is
  # not known to Encode, and re-throws any error raised while a layer is being
  # applied.  Returns 1 on every non-croaking path.
  sub import {
      my $class = shift;
      my $name  = shift;
      my %arg   = @_;

      $name ||= $ENV{PERL_ENCODING};

      # Never hand undef to Encode; the error text stays the documented one.
      my $enc = defined $name ? find_encoding($name) : undef;
      unless (defined $enc) {
          require Carp;
          my $shown = defined $name ? $name : '';
          Carp::croak("Unknown encoding '$shown'");
      }
      $name = $enc->name;    # canonize

      if ($arg{Filter}) {
          defined(${^ENCODING}) and undef ${^ENCODING};

          # implicitly 'use utf8'
          require utf8;      # to fetch $utf8::hint_bits;
          $^H |= $utf8::hint_bits;

          # Installing the filter stays best-effort, but the outcome is now
          # captured here, so the debug message is truthful and a failure
          # cannot leak through $@ into the handle loop below.
          my $installed = eval {
              require Filter::Util::Call;
              Filter::Util::Call->import;
              filter_add(sub {
                  my $status = filter_read();
                  if ($status > 0) {
                      $_ = $enc->decode($_, 1);
                      DEBUG and warn $_;
                  }
                  $status;
              });
              1;
          };
          DEBUG and warn $installed ? "Filter installed"
                                    : "Filter not installed: $@";
      }
      else {
          DEBUG and warn "_exception($name) = ", _exception($name);
          _exception($name) or ${^ENCODING} = $enc;
          # Without PerlIO::encoding there are no layers to set up.
          $HAS_PERLIO or return 1;
      }

      # A non-zero ${^UNICODE} (the -C switch) overrides the handle settings.
      defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

      for my $h (qw(STDIN STDOUT)) {
          my $layer_enc;
          if ($arg{$h}) {
              unless (defined find_encoding($arg{$h})) {
                  require Carp;
                  Carp::croak("Unknown encoding for $h, '$arg{$h}'");
              }
              $layer_enc = $arg{$h};
          }
          elsif (exists $arg{$h}) {
              # Explicit false value (e.g. STDIN => undef): leave the handle
              # alone.  No eval runs here, so $@ must not be inspected.
              next;
          }
          else {
              $layer_enc = $name;
          }

          # Judge the eval by its own return value rather than by whatever
          # $@ happened to contain before this iteration.
          my $applied = eval {
              no warnings 'uninitialized';
              binmode($h, ":raw :encoding($layer_enc)");
              1;
          };
          unless ($applied) {
              my $err = $@;    # save before require can disturb $@
              require Carp;
              Carp::croak($err);
          }
      }
      return 1; # I doubt if we need it, though
  }
  89
  # unimport - implements "no encoding".
  #
  # Clears ${^ENCODING}, resets STDIN and STDOUT (to the ":raw" layer when
  # PerlIO::encoding is usable, otherwise with a plain binmode), and removes
  # the source filter if Filter::Util::Call has been loaded.  All warnings
  # are disabled for the duration of the call.
  sub unimport {
      no warnings;
      undef ${^ENCODING};

      for my $std_handle (\*STDIN, \*STDOUT) {
          if ($HAS_PERLIO) {
              binmode($std_handle, ":raw");
          }
          else {
              binmode($std_handle);
          }
      }

      # filter_del() only exists once Filter::Util::Call has been imported;
      # any failure while removing the filter is deliberately ignored.
      eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
  }
 104
 105 1; # a module file must end with a true value so that require/use succeeds
 106 __END__
 107
 108 =pod
 109
 110 =head1 NAME
 111
 112 encoding - allows you to write your script in non-ascii or non-utf8
 113
 114 =head1 SYNOPSIS
 115
 116   use encoding "greek";  # Perl like Greek to you?
 117   use encoding "euc-jp"; # Jperl!
 118
 119   # or you can even do this if your shell supports your native encoding
 120
 121   perl -Mencoding=latin2 -e '...' # Feeling centrally European?
 122   perl -Mencoding=euc-kr -e '...' # Or Korean?
 123
 124   # more control
 125
 126   # A simple euc-cn => utf-8 converter
 127   use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
 128
 129   # "no encoding;" supported (but not scoped!)
 130   no encoding;
 131
 132   # an alternate way, Filter
 133   use encoding "euc-jp", Filter=>1;
 134   # now you can use kanji identifiers -- in euc-jp!
 135
 136 =head1 ABSTRACT
 137
 138 Let's start with a bit of history: Perl 5.6.0 introduced Unicode
 139 support.  You could apply C<substr()> and regexes even to complex CJK
 140 characters -- so long as the script was written in UTF-8.  But back
 141 then, text editors that supported UTF-8 were still rare and many users
 142 instead chose to write scripts in legacy encodings, giving up a whole
 143 new feature of Perl 5.6.
 144
 145 Rewind to the future: starting from perl 5.8.0 with the B<encoding>
 146 pragma, you can write your script in any encoding you like (so long
 147 as the C<Encode> module supports it) and still enjoy Unicode support.
 148 This pragma achieves that by doing the following:
 149
 150 =over
 151
 152 =item *
 153
 154 Internally converts all literals (C<q//, qq//, qr//, qw//, qx//>) from
 155 the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
 156 C<tr///> and C<DATA> pseudo-filehandle are also converted.
 157
 158 =item *
 159
 160 Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
 161 specified.
 162
 163 =back
 164
 165 =head2 Literal Conversions
 166
 167 You can write code in EUC-JP as follows:
 168
 169   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 170                #<-char-><-char->   # 4 octets
 171   s/\bCamel\b/$Rakuda/;
 172
 173 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 174 the code in UTF-8:
 175
 176   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 177   s/\bCamel\b/$Rakuda/;
 178
 179 =head2 PerlIO layers for C<STD(IN|OUT)>
 180
 181 The B<encoding> pragma also modifies the filehandle layers of
 182 STDIN and STDOUT to the specified encoding.  Therefore,
 183
 184   use encoding "euc-jp";
 185   my $message = "Camel is the symbol of perl.\n";
 186   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 187   $message =~ s/\bCamel\b/$Rakuda/;
 188   print $message;
 189
 190 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
 191 not "\x{99F1}\x{99DD} is the symbol of perl.\n".
 192
 193 You can override this by giving extra arguments; see below.
 194
 195 =head1 FEATURES THAT REQUIRE 5.8.1
 196
 197 Some of the features offered by this pragma require perl 5.8.1.  Most
 198 of these are done by Inaba Hiroto.  Any other features and changes
 199 are good for 5.8.0.
 200
 201 =over
 202
 203 =item "NON-EUC" doublebyte encodings
 204
 205 Because perl needs to parse script before applying this pragma, such
 206 encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
 207 \x5c) in the second byte fail because the second byte may
 208 accidentally escape the quoting character that follows.  Perl 5.8.1
 209 or later fixes this problem.
 210
 211 =item tr//
 212
 213 C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
 214 See the section below for details.
 215
 216 =item DATA pseudo-filehandle
 217
 218 Another feature that was overlooked was C<DATA>.
 219
 220 =back
 221
 222 =head1 USAGE
 223
 224 =over 4
 225
 226 =item use encoding [I<ENCNAME>] ;
 227
 228 Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
 229 exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to
 230 ":encoding(I<ENCNAME>)".
 231
 232 Note that STDERR WILL NOT be changed.
 233
 234 Also note that non-STD file handles remain unaffected.  Use C<use
 235 open> or C<binmode> to change layers of those.
 236
 237 If no encoding is specified, the environment variable C<PERL_ENCODING>
 238 is consulted.  If no encoding can be found, the error C<Unknown encoding
 239 'I<ENCNAME>'> will be thrown.
 240
 241 =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
 242
 243 You can also individually set encodings of STDIN and STDOUT via the
 244 C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
 245 first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
 246 completely off.
 247
 248 When ${^UNICODE} exists and is non-zero, these options will be completely
 249 ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
 250 L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for details (perl 5.8.1
 251 and later).
 252
 253 =item use encoding I<ENCNAME> Filter=E<gt>1;
 254
 255 This turns the encoding pragma into a source filter.  While the
 256 default approach just decodes interpolated literals (in qq() and
 257 qr()), this will apply a source filter to the entire source code.  See
 258 L</"The Filter Option"> below for details.
 259
 260 =item no encoding;
 261
 262 Unsets the script encoding. The layers of STDIN, STDOUT are
 263 reset to ":raw" (the default unprocessed raw stream of bytes).
 264
 265 =back
 266
 267 =head1 The Filter Option
 268
 269 The magic of C<use encoding> is not applied to the names of
 270 identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
 271 is a single Han ideograph) work, you still need to write your script
 272 in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
 273
 274 What does this mean?  Your source code behaves as if it is written in
 275 UTF-8 with 'use utf8' in effect.  So even if your editor only supports
 276 Shift_JIS, for example, you can still try examples in Chapter 15 of
 277 C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
 278 identifiers.
 279
 280 This option is significantly slower and (as of this writing) non-ASCII
 281 identifiers are not very stable WITHOUT this option and with the
 282 source code written in UTF-8.
 283
 284 =head2 Filter-related changes at Encode version 1.87
 285
 286 =over
 287
 288 =item *
 289
 290 The Filter option now sets STDIN and STDOUT like non-filter options.
 291 And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
 292 non-filter version.
 293
 294 =item *
 295
 296 C<use utf8> is implicitly declared so you no longer have to C<use
 297 utf8> to C<${"\x{4eba}"}++>.
 298
 299 =back
 300
 301 =head1 CAVEATS
 302
 303 =head2 NOT SCOPED
 304
 305 The pragma is a per script, not a per block lexical.  Only the last
 306 C<use encoding> or C<no encoding> matters, and it affects
 307 B<the whole script>.  However, the C<no encoding> pragma is supported and
 308 B<use encoding> can appear as many times as you want in a given script.
 309 The multiple use of this pragma is discouraged.
 310
 311 For the same reason, the use of this pragma inside modules is also
 312 discouraged (though not as strongly discouraged as the case above;
 313 see below).
 314
 315 If you still have to write a module with this pragma, be very careful
 316 of the load order.  See the code below:
 317
 318   # called module
 319   package Module_IN_BAR;
 320   use encoding "bar";
 321   # stuff in "bar" encoding here
 322   1;
 323
 324   # caller script
 325   use encoding "foo";
 326   use Module_IN_BAR;
 327   # surprise! use encoding "bar" is in effect.
 328
 329 The best way to avoid this oddity is to use this pragma RIGHT AFTER
 330 other modules are loaded.  i.e.
 331
 332   use Module_IN_BAR;
 333   use encoding "foo";
 334
 335 =head2 DO NOT MIX MULTIPLE ENCODINGS
 336
 337 Notice that only literals (string or regular expression) having only
 338 legacy code points are affected: if you mix data like this
 339
 340         \xDF\x{100}
 341
 342 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 343 encoding.  In other words, this will match in "greek":
 344
 345         "\xDF" =~ /\x{3af}/
 346
 347 but this will not
 348
 349         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 350
 351 since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
 352 the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
 353 LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
 354 should not be mixing your legacy data and Unicode in the same string.
 355
 356 This pragma also affects encoding of the 0x80..0xFF code point range:
 357 normally characters in that range are left as eight-bit bytes (unless
 358 they are combined with characters with code points 0x100 or larger,
 359 in which case all characters need to become UTF-8 encoded), but if
 360 the C<encoding> pragma is present, even the 0x80..0xFF range always
 361 gets UTF-8 encoded.
 362
 363 After all, the best thing about this pragma is that you don't have to
 364 resort to \x{....} just to spell your name in a native encoding.
 365 So feel free to put your strings in your encoding in quotes and
 366 regexes.
 367
 368 =head2 tr/// with ranges
 369
 370 The B<encoding> pragma works by decoding string literals in
 371 C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl 5.8.0, this
 372 does not apply to C<tr///>.  Therefore,
 373
 374   use encoding 'euc-jp';
 375   #....
 376   $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
 377   #           -------- -------- -------- --------
 378
 379 Does not work as
 380
 381   $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
 382
 383 =over
 384
 385 =item Legend of characters above
 386
 387   utf8     euc-jp   charnames::viacode()
 388   -----------------------------------------
 389   \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
 390   \x{3093} \xA4\xF3 HIRAGANA LETTER N
 391   \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
 392   \x{30f3} \xA5\xF3 KATAKANA LETTER N
 393
 394 =back
 395
 396 This counterintuitive behavior has been fixed in perl 5.8.1.
 397
 398 =head3 workaround to tr///;
 399
 400 In perl 5.8.0, you can work around as follows;
 401
 402   use encoding 'euc-jp';
 403   #  ....
 404   eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
 405
 406 Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
 407 it is the same as the classic idiom that makes C<tr///> 'interpolate'.
 408
 409    tr/$from/$to/;            # wrong!
 410    eval qq{ tr/$from/$to/ }; # workaround.
 411
 412 Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
 413 C<tr///> not being decoded was obviously against the will of Perl5
 414 Porters so it has been fixed in Perl 5.8.1 or later.
 415
 416 =head1 EXAMPLE - Greekperl
 417
 418     use encoding "iso 8859-7";
 419
 420     # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
 421
 422     $a = "\xDF";
 423     $b = "\x{100}";
 424
 425     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 426
 427     $c = $a . $b;
 428
 429     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 430
 431     # chr() is affected, and ...
 432
 433     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 434
 435     # ... ord() is affected by the encoding pragma ...
 436
 437     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 438
 439     # ... as are eq and cmp ...
 440
 441     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 442     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 443
 444     # ... but pack/unpack C are not affected, in case you still
 445     # want to go back to your native encoding
 446
 447     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 448
 449 =head1 KNOWN PROBLEMS
 450
 451 =over
 452
 453 =item literals in regex that are longer than 127 bytes
 454
 455 For native multibyte encodings (either fixed or variable length),
 456 the current implementation of the regular expressions may introduce
 457 recoding errors for regular expression literals longer than 127 bytes.
 458
 459 =item EBCDIC
 460
 461 The encoding pragma is not supported on EBCDIC platforms.
 462 (Porters who are willing and able to remove this limitation are
 463 welcome.)
 464
 465 =item format
 466
 467 This pragma doesn't work well with format because PerlIO does not
 468 get along very well with it.  When format contains non-ascii
 469 characters it prints funny or gets "wide character warnings".
 470 To understand it, try the code below.
 471
 472   # Save this one in utf8
 473   # replace *non-ascii* with a non-ascii string
 474   my $camel;
 475   format STDOUT =
 476   *non-ascii*@>>>>>>>
 477   $camel
 478   .
 479   $camel = "*non-ascii*";
 480   binmode(STDOUT=>':encoding(utf8)'); # bang!
 481   write;              # funny
 482   print $camel, "\n"; # fine
 483
 484 Without binmode this happens to work but without binmode, print()
 485 fails instead of write().
 486
 487 At any rate, the very use of format is questionable when it comes to
 488 unicode characters since you have to consider such things as character
 489 width (i.e. double-width for ideographs) and directions (i.e. BIDI for
 490 Arabic and Hebrew).
 491
 492 =back
 493
 494 =head1 HISTORY
 495
 496 This pragma first appeared in Perl 5.8.0.  For features that require
 497 5.8.1 and better, see above.
 498
 499 =head1 SEE ALSO
 500
 501 L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>
 502
 503 Ch. 15 of C<Programming Perl (3rd Edition)>
 504 by Larry Wall, Tom Christiansen, Jon Orwant;
 505 O'Reilly & Associates; ISBN 0-596-00027-8
 506
 507 =cut