Add a test for PerlIO ":encoding(...)" layer.

[perl5.git] / ext / Encode / Encode.pm
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm

index 3792324..b84623a 100644 (file)
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -42,6 +42,9 @@ use Carp;
  our %encoding;
  my @alias;  # ordered matching list
  my %alias;  # cached known aliases
+                     # 0  1  2  3  4  5   6   7   8   9  10
+our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
+
  
  sub encodings
  {
@@ -60,7 +63,6 @@ sub findAlias
       my $alias = $alias[$i];
       my $val   = $alias[$i+1];
       my $new;
-
       if (ref($alias) eq 'Regexp' && $_ =~ $alias)
        {
         $new = eval $val;
@@ -100,10 +102,16 @@ sub define_alias
  # Allow variants of iso-8859-1 etc.
  define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
  
+# At least HP-UX has these.
+define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
+
+# This is a font issue, not an encoding issue.
+# (The currency symbol of the Latin 1 upper half
+#  has been redefined as the euro symbol.)
+define_alias( qr/^(.+)\@euro$/i => '"$1"' );
+
  # Allow latin-1 style names as well
-                    # 0  1  2  3  4  5   6   7   8   9  10
-my @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
-define_alias( qr/^latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
+define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
  
  # Common names for non-latin prefered MIME names
  define_alias( 'ascii'    => 'US-ascii',
@@ -112,7 +120,17 @@ define_alias( 'ascii'    => 'US-ascii',
                'greek'    => 'iso-8859-7',
                'hebrew'   => 'iso-8859-8');
  
-define_alias( 'ibm-1047' => 'cp1047');
+# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
+define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
+
+# Standardize on the dashed versions.
+define_alias( qr/^utf8$/i  => 'utf-8' );
+define_alias( qr/^koi8r$/i => 'koi8-r' );
+
+# TODO: the HP-UX '8' encodings:  arabic8 greek8 hebrew8 roman8 turkish8
+# TODO: the Thai Encoding tis620
+# TODO: the Chinese Encoding gb18030
+# TODO: what is the Japanese 'ujis' encoding seen in some Linuxes?
  
  # Map white space and _ to '-'
  define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
@@ -136,6 +154,10 @@ sub getEncoding
  {
   my ($class,$name) = @_;
   my $enc;
+ if (ref($name) && $name->can('new_sequence'))
+  {
+   return $name;
+  }
   if (exists $encoding{$name})
    {
     return $encoding{$name};
@@ -526,7 +548,7 @@ L</"Handling Malformed Data">.
  
  Convert B<in-place> the data between two encodings.  How did the data
  in $string originally get to be in FROM_ENCODING?  Either using
-encode() or through PerlIO: See L</"Encode and PerlIO">.  For CHECK
+encode() or through PerlIO: See L</"Encoding and IO">.  For CHECK
  see L</"Handling Malformed Data">.
  
  For example to convert ISO 8859-1 data to UTF-8:
@@ -537,6 +559,9 @@ and to convert it back:
  
         from_to($data, "utf-8", "iso-8859-1");
  
+Note that because the conversion happens in place, the data to be
+converted cannot be a string constant; it must be a scalar variable.
+
  =back
  
  =head2 Handling Malformed Data
@@ -698,8 +723,8 @@ names for the iso-8859-* family.
  
  =head2 Defining Encodings
  
-  use Encode qw(define_alias);
-  define_encoding( $object, 'canonicalName' [,alias...]);
+    use Encode qw(define_encoding);
+    define_encoding( $object, 'canonicalName' [,alias...]);
  
  Causes I<canonicalName> to be associated with I<$object>.  The object
  should provide the interface described in L</"IMPLEMENTATION CLASSES">
@@ -714,14 +739,21 @@ If Perl is configured to use the new 'perlio' IO system then
  C<Encode> provides a "layer" (See L<perliol>) which can transform
  data as it is read or written.
  
-     open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
-     print $ilyad @epic;
+Here is how the blind poet would modernise the encoding:
+
+    use Encode;
+    open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
+    open(my $utf8,'>:utf8','iliad.utf8');
+    my @epic = <$iliad>;
+    print $utf8 @epic;
+    close($utf8);
+    close($iliad);
  
  In addition the new IO system can also be configured to read/write
  UTF-8 encoded characters (as noted above this is efficient):
  
-     open(my $fh,'>:utf8','anything');
-     print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
+    open(my $fh,'>:utf8','anything');
+    print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
  
  Either of the above forms of "layer" specifications can be made the default
  for a lexical scope with the C<use open ...> pragma. See L<open>.
@@ -743,29 +775,22 @@ characters into bytes using the API above before doing writes, and to
  transform the bytes read from a handle into characters before doing
  "character operations" (e.g. C<lc>, C</\W+/>, ...).
  
-=head1 Encode and PerlIO
-
-The PerlIO layer (new since Perl 5.7) can be used to automatically
-convert the data being read in or written out to be converted from
-some encoding into Perl's internal encoding or from Perl's internal
-encoding into some other encoding.
-
-Examples:
-
-       open(my $f, "<:encoding(cp1252)")
-
-       open(my $g, ">:encoding(iso-8859-1)")
-
  You can also use PerlIO to convert larger amounts of data you don't
  want to bring into memory.  For example to convert between ISO 8859-1
  (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
  
-       open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
-       open(G, ">:utf8",                 "data.utf") or die $!;
-       while (<F>) { print G }
+    open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
+    open(G, ">:utf8",                 "data.utf") or die $!;
+    while (<F>) { print G }
+
+    # Could also do "print G <F>" but that would pull
+    # the whole file into memory just to write it out again.
+
+More examples:
  
-       # Could also do "print G <F>" but that would pull
-       # the whole file into memory just to write it out again.
+    open(my $f, "<:encoding(cp1252)")
+    open(my $g, ">:encoding(iso-8859-2)")
+    open(my $h, ">:encoding(latin9)")       # iso-8859-15
  
  See L<PerlIO> for more information.