lib/encoding.pm

   1 package encoding;
   2
   3 our $VERSION = '1.00';
   4
   5 use Encode;
   6
   7 sub import {
   8     my ($class, $name) = @_;
   9     $name = $ENV{PERL_ENCODING} if @_ < 2;
  10     $name = "latin1" unless defined $name;
  11     my $enc = find_encoding($name);
  12     unless (defined $enc) {
  13         require Carp;
  14         Carp::croak "Unknown encoding '$name'";
  15     }
  16     ${^ENCODING} = $enc;
  17 }
  18
  19 =pod
  20
  21 =head1 NAME
  22
  23 encoding - pragma to control the conversion of legacy data into Unicode
  24
  25 =head1 SYNOPSIS
  26
  27     use encoding "iso 8859-7";
  28
  29     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
  30
  31     $a = "\xDF";
  32     $b = "\x{100}";
  33
  34     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
  35
  36     $c = $a . $b;
  37
  38     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
  39
  40     # chr() is affected, and ...
  41
  42     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
  43
  44     # ... ord() is affected by the encoding pragma ...
  45
  46     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
  47
  48     # but pack/unpack are not affected, in case you still
  49     # want to get back to your native encoding
  50
  51     print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
  52
  53 =head1 DESCRIPTION
  54
  55 Normally when legacy 8-bit data is converted to Unicode the data is
  56 expected to be Latin-1 (or EBCDIC on EBCDIC platforms).  With the
  57 encoding pragma you can change this default.
  58
  59 The pragma applies per script; it is not lexically scoped per block.
  60 Only the last C<use encoding> matters, and it affects B<the whole script>.
  61
  62 Notice that only literals (string or regular expression) having only
  63 legacy code points are affected: if you mix data like this
  64
  65         \xDF\x{100}
  66
  67 the data is assumed to be in (Latin 1 and) Unicode, not in your native
  68 encoding.  In other words, this will match in "greek":
  69
  70         "\xDF" =~ /\x{3af}/
  71
  72 but this will not
  73
  74         "\xDF\x{100}" =~ /\x{3af}\x{100}/
  75
  76 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
  77 because of the C<\x{100}> in the same string.  You should not be mixing
  78 your legacy data and Unicode in the same string.
  79
  80 This pragma also affects encoding of the 0x80..0xFF code point range:
  81 normally characters in that range are left as eight-bit bytes (unless
  82 they are combined with characters with code points 0x100 or larger,
  83 in which case all characters need to become UTF-8 encoded), but if
  84 the C<encoding> pragma is present, even the 0x80..0xFF range always
  85 gets UTF-8 encoded.
  86
  87 If no encoding is specified, the environment variable C<PERL_ENCODING>
  88 is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.  If
  89 the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.
  90
  91 =head1 KNOWN PROBLEMS
  92
  93 For native multibyte encodings (either fixed or variable length)
  94 the current implementation of the regular expressions may introduce
  95 recoding errors for regular expression literals longer than 127 bytes.
  96
  97 =head1 SEE ALSO
  98
  99 L<perlunicode>, L<Encode>
 100
 101 =cut
 102
 103 1;