lib/encoding.pm

   1 package encoding;
   2
   3 our $VERSION = '1.00';
   4
   5 use Encode;
   6
   7 BEGIN {    # compile-time guard: refuse to load on EBCDIC platforms
   8     if (ord("A") == 193) {    # "A" is 193 in EBCDIC (65 in ASCII)
   9         require Carp;    # loaded at run time, hence the parenthesised call below
  10         Carp::croak("encoding pragma does not support EBCDIC platforms");
  11     }
  12 }
  13
  14 sub import {    # class method run by 'use encoding NAME'
  15     my ($class, $name) = @_;
  16     $name = $ENV{PERL_ENCODING} if @_ < 2;    # no NAME argument: ask the environment
  17     $name = "latin1" unless defined $name;    # last-resort default
  18     my $enc = find_encoding($name);           # provided by 'use Encode'
  19     unless (defined $enc) {                   # undefined result: name not recognised
  20         require Carp;                         # only loaded on the error path
  21         Carp::croak("Unknown encoding '$name'");    # parens: no reliance on Carp being predeclared
  22     }
  23     ${^ENCODING} = $enc;    # affects the whole script, not just the enclosing block
  24 }
  25
  26 =pod
  27
  28 =head1 NAME
  29
  30 encoding - pragma to control the conversion of legacy data into Unicode
  31
  32 =head1 SYNOPSIS
  33
  34     use encoding "iso 8859-7";
  35
  36     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
  37
  38     $a = "\xDF";
  39     $b = "\x{100}";
  40
  41     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
  42
  43     $c = $a . $b;
  44
  45     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
  46
  47     # chr() is affected, and ...
  48
  49     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
  50
  51     # ... ord() is affected by the encoding pragma ...
  52
  53     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
  54
  55     # ... as are eq and cmp ...
  56
  57     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
  58     print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;
  59
  60     # ... but pack/unpack C are not affected, in case you still
  61     # want to get back to your native encoding
  62
  63     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
  64
  65 =head1 DESCRIPTION
  66
  67 Normally when legacy 8-bit data is converted to Unicode the data is
  68 expected to be Latin-1 (or EBCDIC on EBCDIC platforms).  With the
  69 encoding pragma you can change this default.
  70
  71 The pragma is per-script, not per-block lexical.  Only the last
  72 C<use encoding> matters, and it affects B<the whole script>.
  73
  74 Notice that only literals (string or regular expression) having only
  75 legacy code points are affected: if you mix data like this
  76
  77         \xDF\x{100}
  78
  79 the data is assumed to be in (Latin 1 and) Unicode, not in your native
  80 encoding.  In other words, this will match in "greek":
  81
  82         "\xDF" =~ /\x{3af}/
  83
  84 but this will not
  85
  86         "\xDF\x{100}" =~ /\x{3af}\x{100}/
  87
  88 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
  89 because of the C<\x{100}> on the left.  You should not be mixing your
  90 legacy data and Unicode in the same string.
  91
  92 This pragma also affects encoding of the 0x80..0xFF code point range:
  93 normally characters in that range are left as eight-bit bytes (unless
  94 they are combined with characters with code points 0x100 or larger,
  95 in which case all characters need to become UTF-8 encoded), but if
  96 the C<encoding> pragma is present, even the 0x80..0xFF range always
  97 gets UTF-8 encoded.
  98
  99 If no encoding is specified, the environment variable C<PERL_ENCODING>
 100 is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.  If
 101 the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.
 102
 103 Note that if you want to get back to the original byte encoding, you
 104 need to use things like I/O with encoding disciplines (see L<open>) or
 105 the Encode module, since C<no encoding> (or re-C<encoding>) does not work.
 106
 107 =head1 KNOWN PROBLEMS
 108
 109 For native multibyte encodings (either fixed or variable length)
 110 the current implementation of the regular expressions may introduce
 111 recoding errors for regular expression literals longer than 127 bytes.
 112
 113 The encoding pragma is not supported on EBCDIC platforms.
 114 (Porters wanted.)
 115
 116 =head1 SEE ALSO
 117
 118 L<perlunicode>, L<Encode>
 119
 120 =cut
 121
 122 1;