ext/Encode/encoding.pm

   package encoding;
   # $VERSION is built from the digit groups of the RCS "Revision" keyword:
   # the first group is printed as-is, every later group zero-padded to two
   # digits (1.25 -> "1.25").  The assignment is deliberately kept on ONE
   # physical line so that single-line version extractors keep working.
   our $VERSION = do { my @rev = q$Revision: 1.25 $ =~ /\d+/g; sprintf('%d.' . ('%02d' x $#rev), @rev) };
   3
   4 use Encode;
   5 use strict;
   6
   # Compile-time platform guard: refuse to load at all when the letter "A"
   # has code point 193, i.e. when running on an EBCDIC platform.
   BEGIN {
       my $on_ebcdic = (ord("A") == 193);
       if ($on_ebcdic) {
           # Carp is only loaded on the failure path.
           require Carp;
           Carp::croak("encoding pragma does not support EBCDIC platforms");
       }
   }
  13
  # import - activate the pragma.
  #
  # Called as:  use encoding ENCNAME [, STDIN => ENC, STDOUT => ENC, ...];
  #
  # Arguments (after the class name):
  #   $name - encoding of the script; when false, $ENV{PERL_ENCODING} is
  #           used instead.
  #   %arg  - optional per-handle overrides keyed by STDIN, STDOUT, STDERR.
  #
  # Effects:
  #   * ${^ENCODING} is set to the Encode object found for $name.
  #   * STDIN, STDOUT and STDERR each get an ":encoding(...)" layer via
  #     binmode, using the override from %arg when one is given (and true)
  #     and $name otherwise.
  #
  # Croaks with "Unknown encoding '...'" when $name cannot be resolved,
  # with "Unknown encoding for HANDLE, '...'" when an override cannot be
  # resolved, and with the underlying message if binmode throws.
  # Returns 1.
  sub import {
      my $class = shift;
      my $name  = shift;
      my %arg   = @_;
      $name ||= $ENV{PERL_ENCODING};

      my $enc = find_encoding($name);
      unless (defined $enc) {
          require Carp;
          Carp::croak("Unknown encoding '$name'");
      }

      # Real glob references instead of code built from strings, so that a
      # caller-supplied encoding name is never evaluated as Perl source.
      # NOTE(review): the POD says STDERR is not changed by default, yet
      # STDERR is part of this list -- confirm which one is intended.
      my @std_names  = qw(STDIN STDOUT STDERR);
      my %handle_for = (
          STDIN  => \*STDIN,
          STDOUT => \*STDOUT,
          STDERR => \*STDERR,
      );

      # Resolve and validate every layer first, so that a bad argument is
      # reported before any global state has been modified.
      my %layer_for;
      for my $h (@std_names) {
          my $wanted = $name;
          if ($arg{$h}) {
              # Validate the override itself, not the script encoding
              # that was already checked above.
              unless (defined find_encoding($arg{$h})) {
                  require Carp;
                  Carp::croak("Unknown encoding for $h, '$arg{$h}'");
              }
              $wanted = $arg{$h};
          }
          $layer_for{$h} = ":encoding($wanted)";
      }

      ${^ENCODING} = $enc; # this is all you need, actually.

      for my $h (@std_names) {
          # Only exceptions are escalated; a false return from binmode is
          # ignored, exactly as before.
          eval { binmode($handle_for{$h}, $layer_for{$h}) };
          if ($@) {
              require Carp;
              Carp::croak($@);
          }
      }
      return 1; # I doubt if we need it, though
  }
  45
  # unimport - implements "no encoding;".
  #
  # Clears ${^ENCODING} and switches STDIN and STDOUT to the ":raw" layer.
  # STDERR is intentionally not touched.  Warnings are disabled for the
  # whole body.  Returns the result of the binmode call on STDOUT.
  sub unimport {
      no warnings;

      undef ${^ENCODING};

      binmode(STDIN, ":raw");
      # STDERR is left alone on purpose:
      # binmode(STDERR, ":raw");
      return binmode(STDOUT, ":raw");
  }
  54
  55 1;
  56 __END__
  57 =pod
  58
  59 =head1 NAME
  60
  61 encoding - allows you to write your script in non-ascii or non-utf8
  62
  63 =head1 SYNOPSIS
  64
  65   use encoding "euc-jp"; # Jperl!
  66
  67   # or you can even do this if your shell supports euc-jp
  68
  69   > perl -Mencoding=euc-jp -e '...'
  70
  71   # or from the shebang line
  72
  73   #!/your/path/to/perl -Mencoding=euc-jp
  74
  75   # more control
  76
  77   # A simple euc-jp => utf-8 converter
  78   use encoding "euc-jp", STDOUT => "utf8";  while(<>){print};
  79
  80   # "no encoding;" supported (but not scoped!)
  81   no encoding;
  82
  83 =head1 ABSTRACT
  84
  85 Perl 5.6.0 introduced Unicode support.  You could apply
  86 C<substr()> and regexes even to complex CJK characters -- so long as
  87 the script was written in UTF-8.  But back then text editors that
  88 supported UTF-8 were still rare, and many users chose instead to write
  89 scripts in legacy encodings, giving up the whole new feature of Perl 5.6.
  90
  91 With B<encoding> pragma, you can write your script in any encoding you like
  92 (so long as the C<Encode> module supports it) and still enjoy Unicode
  93 support.  You can write code in EUC-JP as follows:
  94
  95   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  96                #<-char-><-char->   # 4 octets
  97   s/\bCamel\b/$Rakuda/;
  98
  99 And with C<use encoding "euc-jp"> in effect, it is the same thing as
 100 the following code in UTF-8:
 101
 102   my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
 103   s/\bCamel\b/$Rakuda/;
 104
 105 The B<encoding> pragma also modifies the file handle disciplines of
 106 STDIN, STDOUT, and STDERR to the specified encoding.  Therefore,
 107
 108   use encoding "euc-jp";
 109   my $message = "Camel is the symbol of perl.\n";
 110   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 111   $message =~ s/\bCamel\b/$Rakuda/;
 112   print $message;
 113
 114 will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not
 115 "\x{99F1}\x{99DD} is the symbol of perl.\n".
 116
 117 You can override this by giving extra arguments.  See below.
 118
 119 =head1 USAGE
 120
 121 =over 4
 122
 123 =item use encoding [I<ENCNAME>] ;
 124
 125 Sets the script encoding to I<ENCNAME> and file handle disciplines of
 126 STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not
 127 be changed.
 128
 129 If no encoding is specified, the environment variable C<PERL_ENCODING>
 130 is consulted.  If no encoding can be found, an C<Unknown encoding 'I<ENCNAME>'>
 131 error will be thrown.
 132
 133 Note that non-STD file handles remain unaffected.  Use C<use open> or
 134 C<binmode> to change disciplines of those.
 135
 136 =item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ;
 137
 138 You can also individually set encodings of STDIN, STDOUT, and STDERR
 139 via STDI<FH> => I<ENCNAME_FH> form.  In this case, you cannot omit the
 140 first I<ENCNAME>.
 141
 142 =item no encoding;
 143
 144 Unsets the script encoding and the disciplines of STDIN, STDOUT are
 145 reset to ":raw".
 146
 147 =back
 148
 149 =head1 CAVEATS
 150
 151 =head2 NOT SCOPED
 152
 153 The pragma is a per-script, not a per-block lexical.  Only the last
 154 C<use encoding> or C<no encoding> matters, and it affects B<the whole script>.
 155 Though the C<no encoding> pragma is supported and C<use encoding> can
 156 appear as many times as you want in a given script, multiple use
 157 of this pragma is discouraged.
 158
 159 =head2 DO NOT MIX MULTIPLE ENCODINGS
 160
 161 Notice that only literals (string or regular expression) having only
 162 legacy code points are affected: if you mix data like this
 163
 164         \xDF\x{100}
 165
 166 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 167 encoding.  In other words, this will match in "greek":
 168
 169         "\xDF" =~ /\x{3af}/
 170
 171 but this will not
 172
 173         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 174
 175 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
 176 because of the C<\x{100}> on the left.  You should not be mixing your
 177 legacy data and Unicode in the same string.
 178
 179 This pragma also affects encoding of the 0x80..0xFF code point range:
 180 normally characters in that range are left as eight-bit bytes (unless
 181 they are combined with characters with code points 0x100 or larger,
 182 in which case all characters need to become UTF-8 encoded), but if
 183 the C<encoding> pragma is present, even the 0x80..0xFF range always
 184 gets UTF-8 encoded.
 185
 186 After all, the best thing about this pragma is that you don't have to
 187 resort to \x... just to spell your name in native encoding.  So feel
 188 free to put your strings in your encoding in quotes and regexes.
 189
 190 =head1 EXAMPLE - Greekperl
 191
 192     use encoding "iso 8859-7";
 193
 194     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
 195
 196     $a = "\xDF";
 197     $b = "\x{100}";
 198
 199     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 200
 201     $c = $a . $b;
 202
 203     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 204
 205     # chr() is affected, and ...
 206
 207     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 208
 209     # ... ord() is affected by the encoding pragma ...
 210
 211     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 212
 213     # ... as are eq and cmp ...
 214
 215     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 216     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 217
 218     # ... but pack/unpack C are not affected, in case you still
 219     # want to go back to your native encoding
 220
 221     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 222
 223 =head1 KNOWN PROBLEMS
 224
 225 For native multibyte encodings (either fixed or variable length)
 226 the current implementation of the regular expressions may introduce
 227 recoding errors for regular expression literals longer than 127 bytes.
 228
 229 The encoding pragma is not supported on EBCDIC platforms.
 230 (Porters wanted.)
 231
 232 =head1 SEE ALSO
 233
 234 L<perlunicode>, L<Encode>, L<open>
 235
 236 =cut