#
-# $Id: Encode.pm,v 2.46 2012/08/12 05:49:30 dankogai Exp dankogai $
+# $Id: Encode.pm,v 3.01 2019/03/13 00:25:25 dankogai Exp $
#
package Encode;
use strict;
use warnings;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.46 $ =~ /(\d+)/g;
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
-use XSLoader ();
-XSLoader::load( __PACKAGE__, $VERSION );
+our $VERSION;
+BEGIN {
+ $VERSION = sprintf "%d.%02d", q$Revision: 3.01 $ =~ /(\d+)/g;
+ require XSLoader;
+ XSLoader::load( __PACKAGE__, $VERSION );
+}
+
+use Exporter 5.57 'import';
-require Exporter;
-use base qw/Exporter/;
+use Carp ();
+our @CARP_NOT = qw(Encode::Encoder);
# Public, encouraged API is exported by default
our @EXPORT = qw(
decode decode_utf8 encode encode_utf8 str2bytes bytes2str
- encodings find_encoding clone_encoding
+ encodings find_encoding find_mime_encoding clone_encoding
);
our @FB_FLAGS = qw(
DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
our $ON_EBCDIC = ( ord("A") == 193 );
-use Encode::Alias;
+use Encode::Alias ();
+use Encode::MIME::Name;
+
+use Storable;
# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
require Encode::Config;
# See
# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
-# to find why sig handers inside eval{} are disabled.
+# to find why sig handlers inside eval{} are disabled.
eval {
local $SIG{__DIE__};
local $SIG{__WARN__};
+ local @INC = @INC;
+ pop @INC if $INC[-1] eq '.';
require Encode::ConfigLocal;
};
sub encodings {
my %enc;
- if ( @_ and $_[1] eq ":all" ) {
+ my $arg = $_[1] || '';
+ if ( $arg eq ":all" ) {
%enc = ( %Encoding, %ExtModule );
}
else {
my $alias = shift;
define_alias( $alias, $obj );
}
+ my $class = ref($obj);
+ push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;
+ push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
return $obj;
}
sub getEncoding {
my ( $class, $name, $skip_external ) = @_;
+ defined($name) or return;
+
$name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
ref($name) && $name->can('renew') and return $name;
return;
}
+# HACK: These two functions must be defined in Encode and because of
+# cyclic dependency between Encode and Encode::Alias, Exporter does not work
+sub find_alias {
+ goto &Encode::Alias::find_alias;
+}
+sub define_alias {
+ goto &Encode::Alias::define_alias;
+}
+
sub find_encoding($;$) {
my ( $name, $skip_external ) = @_;
return __PACKAGE__->getEncoding( $name, $skip_external );
}
+sub find_mime_encoding($;$) {
+ my ( $mime_name, $skip_external ) = @_;
+ my $name = Encode::MIME::Name::get_encode_name( $mime_name );
+ return find_encoding( $name, $skip_external );
+}
+
sub resolve_alias($) {
my $obj = find_encoding(shift);
defined $obj and return $obj->name;
sub clone_encoding($) {
my $obj = find_encoding(shift);
ref $obj or return;
- eval { require Storable };
- $@ and return;
return Storable::dclone($obj);
}
-sub encode($$;$) {
- my ( $name, $string, $check ) = @_;
- return undef unless defined $string;
- $string .= '' if ref $string; # stringify;
- $check ||= 0;
- unless ( defined $name ) {
- require Carp;
- Carp::croak("Encoding name should not be undef");
- }
- my $enc = find_encoding($name);
- unless ( defined $enc ) {
- require Carp;
- Carp::croak("Unknown encoding '$name'");
- }
- my $octets = $enc->encode( $string, $check );
- $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() );
- return $octets;
-}
-*str2bytes = \&encode;
-
-sub decode($$;$) {
- my ( $name, $octets, $check ) = @_;
- return undef unless defined $octets;
- $octets .= '' if ref $octets;
- $check ||= 0;
- my $enc = find_encoding($name);
- unless ( defined $enc ) {
- require Carp;
- Carp::croak("Unknown encoding '$name'");
+onBOOT;
+
+if ($ON_EBCDIC) {
+ package Encode::UTF_EBCDIC;
+ use parent 'Encode::Encoding';
+ my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
+ Encode::define_encoding($obj, 'Unicode');
+ sub decode {
+ my ( undef, $str, $chk ) = @_;
+ my $res = '';
+ for ( my $i = 0 ; $i < length($str) ; $i++ ) {
+ $res .=
+ chr(
+ utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
+ );
+ }
+ $_[1] = '' if $chk;
+ return $res;
}
- my $string = $enc->decode( $octets, $check );
- $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
- return $string;
-}
-*bytes2str = \&decode;
-
-sub from_to($$$;$) {
- my ( $string, $from, $to, $check ) = @_;
- return undef unless defined $string;
- $check ||= 0;
- my $f = find_encoding($from);
- unless ( defined $f ) {
- require Carp;
- Carp::croak("Unknown encoding '$from'");
+ sub encode {
+ my ( undef, $str, $chk ) = @_;
+ my $res = '';
+ for ( my $i = 0 ; $i < length($str) ; $i++ ) {
+ $res .=
+ chr(
+ utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
+ );
+ }
+ $_[1] = '' if $chk;
+ return $res;
}
- my $t = find_encoding($to);
- unless ( defined $t ) {
- require Carp;
- Carp::croak("Unknown encoding '$to'");
+} else {
+ package Encode::Internal;
+ use parent 'Encode::Encoding';
+ my $obj = bless { Name => "Internal" } => "Encode::Internal";
+ Encode::define_encoding($obj, 'Unicode');
+ sub decode {
+ my ( undef, $str, $chk ) = @_;
+ utf8::upgrade($str);
+ $_[1] = '' if $chk;
+ return $str;
}
- my $uni = $f->decode($string);
- $_[0] = $string = $t->encode( $uni, $check );
- return undef if ( $check && length($uni) );
- return defined( $_[0] ) ? length($string) : undef;
+ *encode = \&decode;
}
-sub encode_utf8($) {
- my ($str) = @_;
- utf8::encode($str);
- return $str;
+{
+ # https://rt.cpan.org/Public/Bug/Display.html?id=103253
+ package Encode::XS;
+ use parent 'Encode::Encoding';
}
-my $utf8enc;
-
-sub decode_utf8($;$) {
- my ( $octets, $check ) = @_;
- return $octets if is_utf8($octets);
- return undef unless defined $octets;
- $octets .= '' if ref $octets;
- $check ||= 0;
- $utf8enc ||= find_encoding('utf8');
- my $string = $utf8enc->decode( $octets, $check );
- $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
- return $string;
-}
-
-# sub decode_utf8($;$) {
-# my ( $str, $check ) = @_;
-# return $str if is_utf8($str);
-# if ($check) {
-# return decode( "utf8", $str, $check );
-# }
-# else {
-# return decode( "utf8", $str );
-# return $str;
-# }
-# }
-
-predefine_encodings(1);
-
-#
-# This is to restore %Encoding if really needed;
-#
-
-sub predefine_encodings {
- require Encode::Encoding;
- no warnings 'redefine';
- my $use_xs = shift;
- if ($ON_EBCDIC) {
-
- # was in Encode::UTF_EBCDIC
- package Encode::UTF_EBCDIC;
- push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
- *decode = sub {
- my ( undef, $str, $chk ) = @_;
- my $res = '';
- for ( my $i = 0 ; $i < length($str) ; $i++ ) {
- $res .=
- chr(
- utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
- );
- }
- $_[1] = '' if $chk;
- return $res;
- };
- *encode = sub {
- my ( undef, $str, $chk ) = @_;
- my $res = '';
- for ( my $i = 0 ; $i < length($str) ; $i++ ) {
- $res .=
- chr(
- utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
- );
- }
- $_[1] = '' if $chk;
- return $res;
- };
- $Encode::Encoding{Unicode} =
- bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
- }
- else {
-
- package Encode::Internal;
- push @Encode::Internal::ISA, 'Encode::Encoding';
- *decode = sub {
- my ( undef, $str, $chk ) = @_;
- utf8::upgrade($str);
- $_[1] = '' if $chk;
- return $str;
- };
- *encode = \&decode;
- $Encode::Encoding{Unicode} =
- bless { Name => "Internal" } => "Encode::Internal";
+{
+ package Encode::utf8;
+ use parent 'Encode::Encoding';
+ my %obj = (
+ 'utf8' => { Name => 'utf8' },
+ 'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
+ );
+ for ( keys %obj ) {
+ bless $obj{$_} => __PACKAGE__;
+ Encode::define_encoding( $obj{$_} => $_ );
}
-
- {
-
- # was in Encode::utf8
- package Encode::utf8;
- push @Encode::utf8::ISA, 'Encode::Encoding';
-
- #
- if ($use_xs) {
- Encode::DEBUG and warn __PACKAGE__, " XS on";
- *decode = \&decode_xs;
- *encode = \&encode_xs;
+ sub cat_decode {
+ # ($obj, $dst, $src, $pos, $trm, $chk)
+ # currently ignores $chk
+ my ( undef, undef, undef, $pos, $trm ) = @_;
+ my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
+ use bytes;
+ if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
+ $$rdst .=
+ substr( $$rsrc, $pos, $npos - $pos + length($trm) );
+ $$rpos = $npos + length($trm);
+ return 1;
}
- else {
- Encode::DEBUG and warn __PACKAGE__, " XS off";
- *decode = sub {
- my ( undef, $octets, $chk ) = @_;
- my $str = Encode::decode_utf8($octets);
- if ( defined $str ) {
- $_[1] = '' if $chk;
- return $str;
- }
- return undef;
- };
- *encode = sub {
- my ( undef, $string, $chk ) = @_;
- my $octets = Encode::encode_utf8($string);
- $_[1] = '' if $chk;
- return $octets;
- };
- }
- *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk)
- # currently ignores $chk
- my ( undef, undef, undef, $pos, $trm ) = @_;
- my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
- use bytes;
- if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
- $$rdst .=
- substr( $$rsrc, $pos, $npos - $pos + length($trm) );
- $$rpos = $npos + length($trm);
- return 1;
- }
- $$rdst .= substr( $$rsrc, $pos );
- $$rpos = length($$rsrc);
- return '';
- };
- $Encode::Encoding{utf8} =
- bless { Name => "utf8" } => "Encode::utf8";
- $Encode::Encoding{"utf-8-strict"} =
- bless { Name => "utf-8-strict", strict_utf8 => 1 }
- => "Encode::utf8";
+ $$rdst .= substr( $$rsrc, $pos );
+ $$rpos = length($$rsrc);
+ return '';
}
}
canonical name or an alias. For encoding names and aliases, see
L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
+B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
+on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
+left unchanged.
+
For example, to convert a string from Perl's internal format into
ISO-8859-1, also known as Latin1:
$octets = encode("iso-8859-1", $string);
-B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
+B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
$octets I<might not be equal to> $string. Though both contain the
same data, the UTF8 flag for $octets is I<always> off. When you
encode anything, the UTF8 flag on the result is always off, even when it
-contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
+contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
If the $string is C<undef>, then C<undef> is returned.
+C<str2bytes> may be used as an alias for C<encode>.
+
=head3 decode
$string = decode(ENCODING, OCTETS[, CHECK])
This function returns the string that results from decoding the scalar
value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
-Perl's internal form. The returns the resulting string. As with encode(),
+Perl's internal form. As with encode(),
I<ENCODING> can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
Malformed Data">.
+B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
+on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
+left unchanged.
+
For example, to convert ISO-8859-1 data into a string in Perl's
internal format:
$string = decode("iso-8859-1", $octets);
-B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
+B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
I<might not be equal to> $octets. Though both contain the same data, the
-UTF8 flag for $string is on unless $octets consists entirely of ASCII data
-on ASCII machines or EBCDIC on EBCDIC machines. See L</"The UTF8 flag">
+UTF8 flag for $string is on. See L</"The UTF8 flag">
below.
If the $string is C<undef>, then C<undef> is returned.
+C<bytes2str> may be used as an alias for C<decode>.
+
=head3 find_encoding
[$obj =] find_encoding(ENCODING)
C<undef> if no matching I<ENCODING> is find. The returned object is
what does the actual encoding or decoding.
- $utf8 = decode($name, $bytes);
+ $string = decode($name, $bytes);
is in fact
- $utf8 = do {
+ $string = do {
$obj = find_encoding($name);
croak qq(encoding "$name" not found) unless ref $obj;
$obj->decode($bytes);
my $enc = find_encoding("iso-8859-1");
while(<>) {
- my $utf8 = $enc->decode($_);
- ... # now do something with $utf8;
+ my $string = $enc->decode($_);
+ ... # now do something with $string;
}
Besides L</decode> and L</encode>, other methods are
See L<Encode::Encoding> for details.
+=head3 find_mime_encoding
+
+ [$obj =] find_mime_encoding(MIME_ENCODING)
+
+Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts
+same as C<find_encoding()> but C<mime_name()> of returned object must
+match to I<MIME_ENCODING>. So as opposite of C<find_encoding()>
+canonical names and aliases are not used when searching for object.
+
+ find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
+ find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
+ find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
+ find_mime_encoding("utf-8-strict"); returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
+
=head3 from_to
[$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
B<CAVEAT>: The following operations may look the same, but are not:
- from_to($data, "iso-8859-1", "utf8"); #1
+ from_to($data, "iso-8859-1", "UTF-8"); #1
$data = decode("iso-8859-1", $data); #2
Both #1 and #2 make $data consist of a completely valid UTF-8 string,
but only #2 turns the UTF8 flag on. #1 is equivalent to:
- $data = encode("utf8", decode("iso-8859-1", $data));
+ $data = encode("UTF-8", decode("iso-8859-1", $data));
See L</"The UTF8 flag"> below.
from_to($octets, $from, $to, $check);
-is equivalent t:o
+is equivalent to:
$octets = encode($to, decode($from, $octets), $check);
Equivalent to C<$octets = encode("utf8", $string)>. The characters in
$string are encoded in Perl's internal format, and the result is returned
as a sequence of octets. Because all possible characters in Perl have a
-(loose, not strict) UTF-8 representation, this function cannot fail.
+(loose, not strict) utf8 representation, this function cannot fail.
+
+B<WARNING>: do not use this function for data exchange as it can produce
+not strict utf8 $octets! For strictly valid UTF-8 output use
+C<$octets = encode("UTF-8", $string)>.
=head3 decode_utf8
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by $octets is decoded
-from UTF-8 into a sequence of logical characters.
-Because not all sequences of octets are valid UTF-8,
+from (loose, not strict) utf8 into a sequence of logical characters.
+Because not all sequences of octets are valid not strict utf8,
it is quite possible for this function to fail.
For CHECK, see L</"Handling Malformed Data">.
+B<WARNING>: do not use this function for data exchange as it can produce
+$string with not strict utf8 representation! For strictly valid UTF-8
+$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
+
+B<CAVEAT>: the input I<$octets> might be modified in-place depending on
+what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
+left unchanged.
+
=head2 Listing available encodings
use Encode;
handle the conversion. In the second, you explicitly translate
from one encoding to the other.
-Unfortunately, it may be that encodings are C<PerlIO>-savvy. You can check
+Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
to see whether your encoding is supported by C<PerlIO> by invoking the
C<perlio_ok> method on it:
This is the same as C<FB_QUIET> above, except that instead of being silent
on errors, it issues a warning. This is handy for when you are debugging.
+B<CAVEAT>: All warnings from Encode module are reported, independently of
+L<pragma warnings|warnings> settings. If you want to follow settings of
+lexical warnings configured by L<pragma warnings|warnings> then append
+also check value C<ENCODE::ONLY_PRAGMA_WARNINGS>. This value is available
+since Encode version 2.99.
+
=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
=over 2
Encode::LEAVE_SRC
If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
-second argument to encode() or decode() will be overwritten in place.
+source string to encode() or decode() will be overwritten in place.
If you're not interested in this, then bitwise-OR it with the bitmask.
=head2 coderef for CHECK
As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
-ordinal value of the unmapped character as an argument and returns a string
-that represents the fallback character. For instance:
+ordinal value of the unmapped character as an argument and returns
+octets that represent the fallback character. For instance:
$ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
+Fallback for C<decode> must return decoded string (sequence of characters)
+and takes a list of ordinal values as its arguments. So for
+example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
+a fallback for bytes that are not valid UTF-8, you could write
+
+ $str = decode 'UTF-8', $octets, sub {
+ my $tmp = join '', map chr, @_;
+ return decode 'ISO-8859-15', $tmp;
+ };
+
=head1 Defining Encodings
To define a new encoding, use:
byte-oriented mode for when the internal UTF8 flag is off, and the other a
character-oriented mode for when the internal UTF8 flag is on.
-Here is how C<Encode> handles the UTF8 flag.
-
-=over 2
-
-=item *
-
-When you I<encode>, the resulting UTF8 flag is always B<off>.
-
-=item *
-
-When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
-unambiguously represent data. Here is what we mean by "unambiguously".
-After C<$utf8 = decode("foo", $octet)>,
-
- When $octet is... The UTF8 flag in $utf8 is
- ---------------------------------------------
- In ASCII only (or EBCDIC only) OFF
- In ISO-8859-1 ON
- In any other Encoding ON
- ---------------------------------------------
-
-As you see, there is one exception: in ASCII. That way you can assume
-Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be
-careful in the cases mentioned in the B<CAVEAT> paragraphs above.
-
This UTF8 flag is not visible in Perl scripts, exactly for the same reason
you cannot (or rather, you I<don't have to>) see whether a scalar contains
a string, an integer, or a floating-point number. But you can still peek
and poke these if you will. See the next section.
-=back
-
=head2 Messing with Perl's Internals
The following API uses parts of Perl's internals in the current
If I<CHECK> is true, also checks whether I<STRING> contains well-formed
UTF-8. Returns true if successful, false otherwise.
+Typically only necessary for debugging and testing. Don't use this flag as
+a marker to distinguish character and binary data, that should be decided
+for each variable when you write your code.
+
+B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
+I<STRING> is UTF-8 encoded and vice-versa.
+
As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
=head3 _utf8_on
=head1 MAINTAINER
This project was originated by the late Nick Ing-Simmons and later
-maintained by Dan Kogai I<< <dankogai@dan.co.jp> >>. See AUTHORS
+maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
for a full list of people involved. For any questions, send mail to
I<< <perl-unicode@perl.org> >> so that we can all share.
=head1 COPYRIGHT
-Copyright 2002-2011 Dan Kogai I<< <dankogai@dan.co.jp> >>.
+Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.