This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Encode from version 2.75 to 2.76
[perl5.git] / cpan / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
0dd5b0dc 2# $Id: Encode.pm,v 2.76 2015/07/31 02:17:53 dankogai Exp $
10c5ecbb 3#
2c674647 4package Encode;
51ef4e11 5use strict;
656ebd29 6use warnings;
0dd5b0dc 7our $VERSION = sprintf "%d.%02d", q$Revision: 2.76 $ =~ /(\d+)/g;
e46d9735 8use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
6d1c0808 9use XSLoader ();
d1256cb1 10XSLoader::load( __PACKAGE__, $VERSION );
2c674647 11
369b9ffe 12use Exporter 5.57 'import';
2c674647 13
# Public, encouraged API is exported by default
our @EXPORT = qw(
    decode decode_utf8 encode encode_utf8 str2bytes bytes2str
    encodings find_encoding clone_encoding
);

# Names of the individual CHECK bits.
our @FB_FLAGS = qw(
    DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
    PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);

# Names of the ready-made FB_* CHECK values.
our @FB_CONSTS = qw(
    FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
    FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);

# Everything below is exported on request only.
our @EXPORT_OK = (
    qw(
        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
    ),
    @FB_FLAGS,
    @FB_CONSTS,
);

# Import tags: :all, :default, :fallbacks and :fallback_all.
our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    default      => [@EXPORT],
    fallbacks    => [@FB_CONSTS],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

# True when the native code point of "A" is 193, i.e. on an EBCDIC host.
our $ON_EBCDIC = ( ord("A") == 193 );
f2a2953c 46
5d030b67
JH
47use Encode::Alias;
48
# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;
our %ExtModule;
require Encode::Config;

# Encode::ConfigLocal is optional, so a failed require is deliberately
# ignored.  For why the signal handlers are disabled inside the eval{}, see
# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
eval {
    local @SIG{qw(__DIE__ __WARN__)};
    require Encode::ConfigLocal;
};
5129552c 61
# Encode->encodings([":all" | MODULE, ...])
#
# Returns the case-insensitively sorted names of the known encodings,
# always hiding the Internal, Unicode and Guess entries.
#   * ":all" as first argument: every key of %Encoding and %ExtModule.
#   * otherwise: every key of %Encoding, plus those %ExtModule entries
#     whose module is one of the arguments ("Encode::" is prepended to
#     arguments that contain no "::").
sub encodings {
    my $selector = $_[1] || '';
    my %found;
    if ( $selector eq ':all' ) {
        %found = ( %Encoding, %ExtModule );
    }
    else {
        %found = %Encoding;
        my @wanted = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $module (@wanted) {
            DEBUG and warn $module;
            for my $enc_name ( keys %ExtModule ) {
                next unless $ExtModule{$enc_name} eq $module;
                $found{$enc_name} = $module;
            }
        }
    }
    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %found;
    return sort { lc $a cmp lc $b } @visible;
}
80
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object (any reference is used as-is) or an
# encoding name, which is resolved with find_encoding().  Returns whatever
# the object's own perlio_ok() method returns, or 0 when the encoding is
# unknown or the object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; answer "no" rather
    # than dying on a method call on an undefined value.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
86
# define_encoding($obj, $name [, @aliases])
#
# Stores $obj in %Encoding under $name.  When $name is not already all
# lower case, its lower-cased form is defined as an alias; every additional
# argument is defined as an alias too.  Returns $obj.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;
    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ( $lower ne $name ) {
        define_alias( $lower => $obj );
    }
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }
    return $obj;
}
99
# CLASS->getEncoding($name [, $skip_external])
#
# Resolves $name to an encoding object and returns it, or returns nothing
# (undef in scalar context) when it cannot be resolved.  Lookup order:
#   1. $name is itself an object that can 'renew': returned unchanged;
#   2. %Encoding under $name, then under lc($name);
#   3. CLASS->find_alias() for $name, then for lc($name);
#   4. unless $skip_external is true, the module that %ExtModule lists for
#      $name (or lc($name)) is loaded and %Encoding is consulted again.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # An undefined name cannot match anything; bail out before the
    # substitution and lc() below emit "uninitialized" warnings.
    defined($name) or return;

    $name =~ s/\s+//g;    # https://rt.cpan.org/Ticket/Display.html?id=65796

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves
            # %Encoding without the entry and we fall through.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been located through the lower-cased
            # key, so look the result up the same way as in step 2.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
125
# find_encoding($name [, $skip_external])
#
# Functional front end to getEncoding(), invoked on this package.
# Returns the encoding object, or nothing when there is no match.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
130
# resolve_alias($name)
#
# Returns the name() of the encoding object that $name resolves to, or
# nothing when find_encoding() does not know $name.
sub resolve_alias($) {
    my ($alias) = @_;
    my $enc = find_encoding($alias);
    return unless defined $enc;
    return $enc->name;
}
136
# clone_encoding($name)
#
# Returns a deep copy (Storable::dclone) of the encoding object found for
# $name.  Returns nothing when no object is found or Storable cannot be
# loaded.
sub clone_encoding($) {
    my ($name) = @_;
    my $enc = find_encoding($name);
    return unless ref $enc;

    eval { require Storable };
    return if $@;

    return Storable::dclone($enc);
}
144
# encode($name, $string [, $check])
#
# Encodes $string with the encoding found for $name and returns the octets.
# Returns undef when $string is undef.  Croaks when $name is undef or does
# not resolve to an encoding.  When $check is a true non-reference value
# without the LEAVE_SRC bit, the caller's string argument is overwritten
# with what the encoder left in its working copy.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    defined $string or return undef;
    $string .= '';    # stringify;
    $check ||= 0;
    if ( !defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets;
    if ( ref($enc) ne 'Encode::Unicode' ) {
        $octets = $enc->encode( $string, $check );
    }
    else {
        # For Unicode, warnings need to be caught and re-issued at this
        # level so that callers can disable utf8 warnings lexically.
        my $captured = '';
        {
            local $SIG{__WARN__} = sub { $captured = shift };
            $octets = $enc->encode( $string, $check );
        }
        warnings::warnif( 'utf8', $captured ) if length $captured;
    }

    my $write_back = $check && !ref($check) && !( $check & LEAVE_SRC() );
    $_[1] = $string if $write_back;
    return $octets;
}
*str2bytes = \&encode;
4411f3b6 177
# decode($name, $octets [, $check])
#
# Decodes $octets with the encoding found for $name and returns the
# resulting string.  Returns undef when $octets is undef.  Croaks when
# $name is undef or does not resolve to an encoding.  When $check is a true
# non-reference value without the LEAVE_SRC bit, the caller's octets
# argument is overwritten with what the decoder left in its working copy.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '';
    $check ||= 0;

    # Same guard as encode(): without it an undef name produces
    # "uninitialized" warnings and the misleading "Unknown encoding ''".
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    # For Unicode, warnings need to be caught and re-issued at this level
    # so that callers can disable utf8 warnings lexically.
    my $string;
    if ( ref($enc) eq 'Encode::Unicode' ) {
        my $warn = '';
        {
            local $SIG{__WARN__} = sub { $warn = shift };
            $string = $enc->decode( $octets, $check );
        }
        warnings::warnif( 'utf8', $warn ) if length $warn;
    }
    else {
        $string = $enc->decode( $octets, $check );
    }
    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}
*bytes2str = \&decode;
4411f3b6 206
# from_to($octets, $from, $to [, $check])
#
# Converts the first argument in place: it is decoded with the $from
# encoding (without $check) and re-encoded with the $to encoding (with
# $check).  Returns the length of the converted data.  Returns undef when
# the input is undef, when $check is true and the encoder left part of the
# decoded string unconsumed, or when the result is undef.  Croaks on an
# unknown encoding name.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    defined $string or return undef;
    $check ||= 0;

    my $source_enc = find_encoding($from);
    if ( !defined $source_enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $target_enc = find_encoding($to);
    if ( !defined $target_enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $chars = $source_enc->decode($string);
    $string = $target_enc->encode( $chars, $check );
    $_[0]   = $string;

    return undef if $check && length($chars);
    return undef unless defined $_[0];
    return length($string);
}
226
# encode_utf8($string)
#
# Returns a copy of $string after utf8::encode() has been applied to it;
# the caller's variable is left untouched.
sub encode_utf8($) {
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
232
# Encoding object for 'utf8', looked up on first use by decode_utf8().
my $utf8enc;

# decode_utf8($octets [, $check])
#
# Decodes $octets with the 'utf8' encoding object and returns the string.
# Returns undef when $octets is undef.  When $check is a true non-reference
# value without the LEAVE_SRC bit, the caller's argument is overwritten
# with what the decoder left in its working copy.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;
    defined $octets or return undef;
    $octets .= '';
    $check ||= 0;
    $utf8enc = find_encoding('utf8') unless $utf8enc;

    my $string = $utf8enc->decode( $octets, $check );

    my $write_back = $check && !ref($check) && !( $check & LEAVE_SRC() );
    $_[0] = $octets if $write_back;
    return $string;
}
245
45a6a02c
SH
246# sub decode_utf8($;$) {
247# my ( $str, $check ) = @_;
248# return $str if is_utf8($str);
249# if ($check) {
250# return decode( "utf8", $str, $check );
251# }
252# else {
253# return decode( "utf8", $str );
254# return $str;
255# }
256# }
257
predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#

# predefine_encodings($use_xs)
#
# Registers the built-in entries of %Encode::Encoding:
#   Unicode      - Encode::Internal, or Encode::UTF_EBCDIC when $ON_EBCDIC
#   utf8         - Encode::utf8
#   utf-8-strict - Encode::utf8 with strict_utf8 set
# and adds Encode::Encoding to @ISA of those classes and of Encode::XS.
# With a true $use_xs, Encode::utf8's decode/encode are pointed at its
# decode_xs/encode_xs; otherwise pure-Perl closures are installed.
sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';
    my ($use_xs) = @_;

    if ( !$ON_EBCDIC ) {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # decode and encode are the same operation here: upgrade a copy of
        # the argument and, when a check value is given, empty the source.
        *decode = sub {
            my ( undef, $text, $check ) = @_;
            utf8::upgrade($text);
            $_[1] = '' if $check;
            return $text;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }
    else {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';

        # Maps every character through utf8::unicode_to_native().
        *decode = sub {
            my ( undef, $text, $check ) = @_;
            my $converted = '';
            for ( my $at = 0 ; $at < length($text) ; $at++ ) {
                my $code = ord( substr( $text, $at, 1 ) );
                $converted .= chr( utf8::unicode_to_native($code) );
            }
            $_[1] = '' if $check;
            return $converted;
        };

        # Maps every character through utf8::native_to_unicode().
        *encode = sub {
            my ( undef, $text, $check ) = @_;
            my $converted = '';
            for ( my $at = 0 ; $at < length($text) ; $at++ ) {
                my $code = ord( substr( $text, $at, 1 ) );
                $converted .= chr( utf8::native_to_unicode($code) );
            }
            $_[1] = '' if $check;
            return $converted;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }

    {
        # https://rt.cpan.org/Public/Bug/Display.html?id=103253
        package Encode::XS;
        push @Encode::XS::ISA, 'Encode::Encoding';
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        if ( !$use_xs ) {
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub {
                my ( undef, $octets, $check ) = @_;
                my $decoded = Encode::decode_utf8($octets);
                return undef unless defined $decoded;
                $_[1] = '' if $check;
                return $decoded;
            };
            *encode = sub {
                my ( undef, $string, $check ) = @_;
                my $encoded = Encode::encode_utf8($string);
                $_[1] = '' if $check;
                return $encoded;
            };
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }

        # Appends to $dst the part of $src from $pos up to and including
        # the next $trm and moves $pos just past it (returns 1); when $trm
        # is not found, appends the rest of $src, sets $pos to the length
        # of $src and returns ''.  $dst and $pos are updated in the caller
        # through references into @_.
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                               # currently ignores $chk
            my ( undef, undef, undef, $pos, $trm ) = @_;
            my ( $dst_ref, $src_ref, $pos_ref ) = \@_[ 1, 2, 3 ];
            use bytes;
            my $found_at = index( $$src_ref, $trm, $pos );
            if ( $found_at < 0 ) {
                $$dst_ref .= substr( $$src_ref, $pos );
                $$pos_ref = length($$src_ref);
                return '';
            }
            my $end = $found_at + length($trm);
            $$dst_ref .= substr( $$src_ref, $pos, $end - $pos );
            $$pos_ref = $end;
            return 1;
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 } =>
          "Encode::utf8";
    }
}
371
656753f8
NIS
3721;
373
2a936312
NIS
374__END__
375
4411f3b6
NIS
376=head1 NAME
377
b9370cdb 378Encode - character encodings in Perl
4411f3b6
NIS
379
380=head1 SYNOPSIS
381
84678a67
SH
382 use Encode qw(decode encode);
383 $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
384 $octets = encode('UTF-8', $characters, Encode::FB_CROAK);
4411f3b6 385
67d7b5ef
JH
386=head2 Table of Contents
387
b9370cdb
CBW
388Encode consists of a collection of modules whose details are too extensive
389to fit in one document. This one itself explains the top-level APIs
6d1c0808 390and general topics at a glance. For other topics and more details,
b9370cdb 391see the documentation for these modules:
67d7b5ef 392
84678a67
SH
393=over 2
394
395=item L<Encode::Alias> - Alias definitions to encodings
396
397=item L<Encode::Encoding> - Encode Implementation Base Class
398
399=item L<Encode::Supported> - List of Supported Encodings
400
401=item L<Encode::CN> - Simplified Chinese Encodings
402
403=item L<Encode::JP> - Japanese Encodings
404
405=item L<Encode::KR> - Korean Encodings
406
407=item L<Encode::TW> - Traditional Chinese Encodings
408
409=back
67d7b5ef 410
4411f3b6
NIS
411=head1 DESCRIPTION
412
b9370cdb 413The C<Encode> module provides the interface between Perl strings
67d7b5ef 414and the rest of the system. Perl strings are sequences of
b9370cdb 415I<characters>.
67d7b5ef 416
b9370cdb 417The repertoire of characters that Perl can represent is a superset of those
67d7b5ef 418defined by the Unicode Consortium. On most platforms the ordinal
b9370cdb
CBW
419values of a character as returned by C<ord(I<S>)> is the I<Unicode
420codepoint> for that character. The exceptions are platforms where
421the legacy encoding is some variant of EBCDIC rather than a superset
422of ASCII; see L<perlebcdic>.
423
424During recent history, data is moved around a computer in 8-bit chunks,
425often called "bytes" but also known as "octets" in standards documents.
426Perl is widely used to manipulate data of many types: not only strings of
427characters representing human or computer languages, but also "binary"
428data, being the machine's representation of numbers, pixels in an image, or
429just about anything.
67d7b5ef 430
0ab8f81e 431When Perl is processing "binary data", the programmer wants Perl to
b9370cdb 432process "sequences of bytes". This is not a problem for Perl: because a
0ab8f81e 433byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
434"logical character".
435
84678a67
SH
436This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
437explain the I<why>.
4411f3b6 438
84678a67 439=head2 TERMINOLOGY
21938dfa 440
84678a67 441=head3 character
67d7b5ef 442
84678a67 443A character in the range 0 .. 2**32-1 (or more);
b9370cdb 444what Perl's strings are made of.
67d7b5ef 445
84678a67 446=head3 byte
67d7b5ef 447
84678a67
SH
448A character in the range 0..255;
449a special case of a Perl character.
67d7b5ef 450
84678a67 451=head3 octet
67d7b5ef 452
84678a67
SH
4538 bits of data, with ordinal values 0..255;
454term for bytes passed to or from a non-Perl context, such as a disk file,
455standard I/O stream, database, command-line argument, environment variable,
456socket etc.
4411f3b6 457
b9370cdb 458=head1 THE PERL ENCODING API
4411f3b6 459
84678a67
SH
460=head2 Basic methods
461
462=head3 encode
4411f3b6 463
84678a67 464 $octets = encode(ENCODING, STRING[, CHECK])
4411f3b6 465
b9370cdb
CBW
466Encodes the scalar value I<STRING> from Perl's internal form into
467I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
468canonical name or an alias. For encoding names and aliases, see
469L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
4411f3b6 470
b9370cdb
CBW
471For example, to convert a string from Perl's internal format into
472ISO-8859-1, also known as Latin1:
681a7c68 473
b7a5c9de 474 $octets = encode("iso-8859-1", $string);
7e19fb92 475
44b3b9c7 476B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
b9370cdb
CBW
477$octets I<might not be equal to> $string. Though both contain the
478same data, the UTF8 flag for $octets is I<always> off. When you
479encode anything, the UTF8 flag on the result is always off, even when it
480contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
681a7c68 481
b9370cdb 482If the $string is C<undef>, then C<undef> is returned.
4089adc4 483
84678a67
SH
484=head3 decode
485
486 $string = decode(ENCODING, OCTETS[, CHECK])
4411f3b6 487
b9370cdb
CBW
488This function returns the string that results from decoding the scalar
489value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
127a7155 490Perl's internal form. As with encode(),
b9370cdb
CBW
491I<ENCODING> can be either a canonical name or an alias. For encoding names
492and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
493Malformed Data">.
47bfe92f 494
b9370cdb
CBW
495For example, to convert ISO-8859-1 data into a string in Perl's
496internal format:
681a7c68 497
b7a5c9de 498 $string = decode("iso-8859-1", $octets);
681a7c68 499
b7a5c9de 500B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
b9370cdb 501I<might not be equal to> $octets. Though both contain the same data, the
0a225b3c 502UTF8 flag for $string is on. See L</"The UTF8 flag">
7e19fb92 503below.
47bfe92f 504
If $octets is C<undef>, then C<undef> is returned.
4089adc4 506
84678a67
SH
507=head3 find_encoding
508
509 [$obj =] find_encoding(ENCODING)
44b3b9c7 510
Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
what does the actual encoding or decoding.
44b3b9c7
SP
514
515 $utf8 = decode($name, $bytes);
516
517is in fact
518
b9370cdb
CBW
519 $utf8 = do {
520 $obj = find_encoding($name);
521 croak qq(encoding "$name" not found) unless ref $obj;
522 $obj->decode($bytes);
523 };
44b3b9c7
SP
524
525with more error checking.
526
b9370cdb 527You can therefore save time by reusing this object as follows;
44b3b9c7 528
b9370cdb
CBW
529 my $enc = find_encoding("iso-8859-1");
530 while(<>) {
531 my $utf8 = $enc->decode($_);
532 ... # now do something with $utf8;
533 }
44b3b9c7 534
84678a67
SH
535Besides L</decode> and L</encode>, other methods are
536available as well. For instance, C<name()> returns the canonical
44b3b9c7
SP
537name of the encoding object.
538
539 find_encoding("latin1")->name; # iso-8859-1
540
541See L<Encode::Encoding> for details.
542
84678a67
SH
543=head3 from_to
544
545 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 546
b9370cdb
CBW
547Converts I<in-place> data between two encodings. The data in $octets
548must be encoded as octets and I<not> as characters in Perl's internal
549format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
f9d05ba3 550encoding:
2b106fbe 551
b7a5c9de 552 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe
JH
553
554and to convert it back:
555
b7a5c9de 556 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 557
b9370cdb
CBW
558Because the conversion happens in place, the data to be
559converted cannot be a string constant: it must be a scalar variable.
ab97ca19 560
84678a67 561C<from_to()> returns the length of the converted string in octets on success,
b9370cdb 562and C<undef> on error.
3ef515df 563
b9370cdb 564B<CAVEAT>: The following operations may look the same, but are not:
7e19fb92 565
b7a5c9de 566 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 567 $data = decode("iso-8859-1", $data); #2
4411f3b6 568
b9370cdb
CBW
569Both #1 and #2 make $data consist of a completely valid UTF-8 string,
570but only #2 turns the UTF8 flag on. #1 is equivalent to:
f2a2953c 571
7e19fb92 572 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 573
2575c402 574See L</"The UTF8 flag"> below.
f2a2953c 575
b9370cdb 576Also note that:
7828f908
RGS
577
578 from_to($octets, $from, $to, $check);
579
127a7155 580is equivalent to:
7828f908
RGS
581
582 $octets = encode($to, decode($from, $octets), $check);
583
b9370cdb
CBW
584Yes, it does I<not> respect the $check during decoding. It is
585deliberately done that way. If you need minute control, use C<decode>
586followed by C<encode> as follows:
7828f908
RGS
587
588 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
589
84678a67
SH
590=head3 encode_utf8
591
592 $octets = encode_utf8($string);
f2a2953c 593
b9370cdb
CBW
594Equivalent to C<$octets = encode("utf8", $string)>. The characters in
595$string are encoded in Perl's internal format, and the result is returned
596as a sequence of octets. Because all possible characters in Perl have a
597(loose, not strict) UTF-8 representation, this function cannot fail.
f2a2953c 598
84678a67
SH
599=head3 decode_utf8
600
601 $string = decode_utf8($octets [, CHECK]);
f2a2953c 602
b9370cdb
CBW
603Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
604The sequence of octets represented by $octets is decoded
605from UTF-8 into a sequence of logical characters.
606Because not all sequences of octets are valid UTF-8,
607it is quite possible for this function to fail.
608For CHECK, see L</"Handling Malformed Data">.
f2a2953c 609
51ef4e11
NIS
610=head2 Listing available encodings
611
5129552c
JH
612 use Encode;
613 @list = Encode->encodings();
614
b9370cdb
CBW
615Returns a list of canonical names of available encodings that have already
616been loaded. To get a list of all available encodings including those that
617have not yet been loaded, say:
5129552c
JH
618
619 @all_encodings = Encode->encodings(":all");
620
b9370cdb 621Or you can give the name of a specific module:
5129552c 622
c731e18e
JH
623 @with_jp = Encode->encodings("Encode::JP");
624
b9370cdb 625When "C<::>" is not in the name, "C<Encode::>" is assumed.
51ef4e11 626
c731e18e 627 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 628
0ab8f81e 629To find out in detail which encodings are supported by this package,
5d030b67 630see L<Encode::Supported>.
51ef4e11
NIS
631
632=head2 Defining Aliases
633
0ab8f81e 634To add a new alias to a given encoding, use:
67d7b5ef 635
5129552c
JH
636 use Encode;
637 use Encode::Alias;
b9370cdb 638 define_alias(NEWNAME => ENCODING);
51ef4e11 639
b9370cdb 640After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
84678a67 641I<ENCODING> may be either the name of an encoding or an
b9370cdb 642I<encoding object>.
51ef4e11 643
b9370cdb 644Before you do that, first make sure the alias is nonexistent using
fcb875d4 645C<resolve_alias()>, which returns the canonical name thereof.
b9370cdb 646For example:
fcb875d4
JH
647
648 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
649 Encode::resolve_alias("iso-8859-12") # false; nonexistent
650 Encode::resolve_alias($name) eq $name # true if $name is canonical
651
84678a67 652C<resolve_alias()> does not need C<use Encode::Alias>; it can be
b9370cdb 653imported via C<use Encode qw(resolve_alias)>.
fcb875d4 654
0ab8f81e 655See L<Encode::Alias> for details.
51ef4e11 656
742555bd
SP
657=head2 Finding IANA Character Set Registry names
658
659The canonical name of a given encoding does not necessarily agree with
b9370cdb
CBW
660IANA Character Set Registry, commonly seen as C<< Content-Type:
661text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
662works, but sometimes it does not, most notably with "utf-8-strict".
742555bd 663
84678a67 664As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
742555bd
SP
665
666 use Encode;
b9370cdb 667 my $enc = find_encoding("UTF-8");
742555bd
SP
668 warn $enc->name; # utf-8-strict
669 warn $enc->mime_name; # UTF-8
670
671See also: L<Encode::Encoding>
672
85982a32 673=head1 Encoding via PerlIO
4411f3b6 674
b9370cdb
CBW
675If your perl supports C<PerlIO> (which is the default), you can use a
676C<PerlIO> layer to decode and encode directly via a filehandle. The
677following two examples are fully identical in functionality:
678
    ### Version 1 via PerlIO
    open(INPUT,  "< :encoding(shiftjis)", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
    while (<INPUT>) {   # auto decodes $_
        print OUTPUT;   # auto encodes $_
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";

    ### Version 2 via from_to()
    open(INPUT,  "< :raw", $infile)
        || die "Can't open < $infile for reading: $!";
    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";

    while (<INPUT>) {
        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
        print OUTPUT;   # emit raw (but properly encoded) data
    }
    close(INPUT)   || die "can't close $infile: $!";
    close(OUTPUT)  || die "can't close $outfile: $!";
8e86646e 702
b9370cdb
CBW
703In the first version above, you let the appropriate encoding layer
704handle the conversion. In the second, you explicitly translate
705from one encoding to the other.
4411f3b6 706
127a7155 707Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
b9370cdb
CBW
708to see whether your encoding is supported by C<PerlIO> by invoking the
709C<perlio_ok> method on it:
0ab8f81e 710
b9370cdb
CBW
711 Encode::perlio_ok("hz"); # false
712 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available
0ab8f81e 713
b9370cdb 714 use Encode qw(perlio_ok); # imported upon request
0ab8f81e 715 perlio_ok("euc-jp")
4411f3b6 716
b9370cdb 717Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
84678a67 718except for C<hz> and C<ISO-2022-kr>. For the gory details, see
f9d05ba3 719L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 720
85982a32 721=head1 Handling Malformed Data
4411f3b6 722
b9370cdb
CBW
723The optional I<CHECK> argument tells C<Encode> what to do when
724encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
725(== 0) is assumed.
8e180e82 726
b9370cdb
CBW
727As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
728see below.
f9d05ba3 729
84678a67
SH
730B<NOTE:> Not all encodings support this feature.
731Some encodings ignore the I<CHECK> argument. For example,
f9d05ba3
RGS
732L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
733
84678a67 734=head2 List of I<CHECK> values
47bfe92f 735
84678a67 736=head3 FB_DEFAULT
151b5d36 737
84678a67 738 I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 739
b9370cdb
CBW
740If I<CHECK> is 0, encoding and decoding replace any malformed character
741with a I<substitution character>. When you encode, I<SUBCHAR> is used.
742When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
743used. If the data is supposed to be UTF-8, an optional lexical warning of
744warning category C<"utf8"> is given.
e9692b5b 745
84678a67
SH
746=head3 FB_CROAK
747
748 I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 749
b9370cdb
CBW
750If I<CHECK> is 1, methods immediately die with an error
751message. Therefore, when I<CHECK> is 1, you should trap
752exceptions with C<eval{}>, unless you really want to let it C<die>.
47bfe92f 753
84678a67
SH
754=head3 FB_QUIET
755
756 I<CHECK> = Encode::FB_QUIET
47bfe92f 757
b9370cdb 758If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
f9d05ba3 759return the portion of the data that has been processed so far when an
b9370cdb
CBW
760error occurs. The data argument is overwritten with everything
761after that point; that is, the unprocessed portion of the data. This is
762handy when you have to call C<decode> repeatedly in the case where your
f9d05ba3 763source data may contain partial multi-byte character sequences,
b9370cdb
CBW
764(that is, you are reading with a fixed-width buffer). Here's some sample
765code to do exactly that:
4411f3b6 766
b9370cdb
CBW
767 my($buffer, $string) = ("", "");
768 while (read($fh, $buffer, 256, length($buffer))) {
769 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
770 # $buffer now contains the unprocessed partial character
771 }
1768d7eb 772
84678a67
SH
773=head3 FB_WARN
774
775 I<CHECK> = Encode::FB_WARN
67d7b5ef 776
b9370cdb
CBW
777This is the same as C<FB_QUIET> above, except that instead of being silent
778on errors, it issues a warning. This is handy for when you are debugging.
85982a32 779
84678a67
SH
780=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
781
782=over 2
783
85982a32
JH
784=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
785
af1f55d9
JH
786=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
787
788=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
789
84678a67
SH
790=back
791
b9370cdb
CBW
792For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
793C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
85982a32 794
b9370cdb
CBW
795When you decode, C<\xI<HH>> is inserted for a malformed character, where
796I<HH> is the hex representation of the octet that could not be decoded to
797utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
798the Unicode code point (in any number of hex digits) of the character that
799cannot be found in the character repertoire of the encoding.
85982a32 800
b9370cdb
CBW
801The HTML/XML character reference modes are about the same. In place of
802C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
78589665 803XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 804
b9370cdb 805In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
7f0d54d7 806
84678a67 807=head3 The bitmask
85982a32 808
b9370cdb
CBW
809These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
810constants are laid out. You can import the C<FB_I<XXX>> constants via
811C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
0ab8f81e 812constants via C<use Encode qw(:fallback_all)>.
85982a32 813
                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
  DIE_ON_ERR    0x0001             X
  WARN_ON_ERR   0x0002                               X
  RETURN_ON_ERR 0x0004                      X        X
  LEAVE_SRC     0x0008                                        X
  PERLQQ        0x0100                                        X
  HTMLCREF      0x0200
  XMLCREF       0x0400
67d7b5ef 822
84678a67 823=head3 LEAVE_SRC
44b3b9c7 824
84678a67 825 Encode::LEAVE_SRC
51e4e64d 826
b9370cdb 827If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
64a9a3c0 828source string to encode() or decode() will be overwritten in place.
b9370cdb 829If you're not interested in this, then bitwise-OR it with the bitmask.
51e4e64d 830
0dbed2e5 831=head2 coderef for CHECK
8e180e82 832
b9370cdb 833As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
c7981a06
SH
834ordinal value of the unmapped character as an argument and returns
835octets that represent the fallback character. For instance:
67d7b5ef 836
8e180e82 837 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 838
b9370cdb 839Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
982a4085 840
c7981a06
SH
841Even the fallback for C<decode> must return octets, which are
842then decoded with the character encoding that C<decode> accepts. So for
127a7155 843example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
c7981a06
SH
844a fallback for bytes that are not valid UTF-8, you could write
845
846 $str = decode 'UTF-8', $octets, sub {
847 my $tmp = chr shift;
848 from_to $tmp, 'ISO-8859-15', 'UTF-8';
849 return $tmp;
850 };
851
67d7b5ef
JH
852=head1 Defining Encodings
853
854To define a new encoding, use:
855
b7a5c9de 856 use Encode qw(define_encoding);
b9370cdb 857 define_encoding($object, CANONICAL_NAME [, alias...]);
67d7b5ef 858
b9370cdb 859I<CANONICAL_NAME> will be associated with I<$object>. The object
0ab8f81e 860should provide the interface described in L<Encode::Encoding>.
b9370cdb
CBW
861If more than two arguments are provided, additional
862arguments are considered aliases for I<$object>.
67d7b5ef 863
b9370cdb 864See L<Encode::Encoding> for details.
f2a2953c 865
2575c402 866=head1 The UTF8 flag
7e19fb92 867
Before the introduction of Unicode support in Perl, the C<eq> operator
b7a5c9de 869just compared the strings represented by two scalars. Beginning with
b9370cdb
CBW
870Perl 5.8, C<eq> compares two strings with simultaneous consideration of
871I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
872I<Programming Perl, 3rd ed.>
7e19fb92
JH
873
874=over 2
875
876=item Goal #1:
877
878Old byte-oriented programs should not spontaneously break on the old
879byte-oriented data they used to work on.
880
881=item Goal #2:
882
883Old byte-oriented programs should magically start working on the new
884character-oriented data when appropriate.
885
886=item Goal #3:
887
888Programs should run just as fast in the new character-oriented mode
889as in the old byte-oriented mode.
890
891=item Goal #4:
892
893Perl should remain one language, rather than forking into a
894byte-oriented Perl and a character-oriented Perl.
895
896=back
897
b9370cdb
CBW
898When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
899born yet; many features documented in the book remained unimplemented for a
900long time. Perl 5.8 corrected much of this, and the introduction of the
901UTF8 flag is one of them. You can think of there being two fundamentally
902different kinds of strings and string-operations in Perl: one a
903byte-oriented mode for when the internal UTF8 flag is off, and the other a
904character-oriented mode for when the internal UTF8 flag is on.
7e19fb92 905
b9370cdb 906Here is how C<Encode> handles the UTF8 flag.
7e19fb92 907
4bdf5738 908=over 2
7e19fb92
JH
909
910=item *
911
b9370cdb 912When you I<encode>, the resulting UTF8 flag is always B<off>.
7e19fb92 913
151b5d36 914=item *
7e19fb92 915
b9370cdb
CBW
916When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
917unambiguously represent data. Here is what we mean by "unambiguously".
918After C<$utf8 = decode("foo", $octet)>,
7e19fb92 919
2575c402 920 When $octet is... The UTF8 flag in $utf8 is
7e19fb92
JH
921 ---------------------------------------------
922 In ASCII only (or EBCDIC only) OFF
923 In ISO-8859-1 ON
924 In any other Encoding ON
925 ---------------------------------------------
926
b9370cdb
CBW
927As you see, there is one exception: in ASCII. That way you can assume
928Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be
929careful in the cases mentioned in the B<CAVEAT> paragraphs above.
7e19fb92 930
b9370cdb
CBW
931This UTF8 flag is not visible in Perl scripts, exactly for the same reason
932you cannot (or rather, you I<don't have to>) see whether a scalar contains
933a string, an integer, or a floating-point number. But you can still peek
934and poke these if you will. See the next section.
7e19fb92
JH
935
936=back
937
938=head2 Messing with Perl's Internals
4411f3b6 939
47bfe92f 940The following APIs use parts of Perl's internals in the current
b9370cdb
CBW
941implementation. As such, they are efficient but may change in a future
942release.
4411f3b6 943
84678a67 944=head3 is_utf8
4411f3b6 945
84678a67 946 is_utf8(STRING [, CHECK])
4411f3b6 947
b9370cdb
CBW
948[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
949If I<CHECK> is true, also checks whether I<STRING> contains well-formed
47bfe92f 950UTF-8. Returns true if successful, false otherwise.
4411f3b6 951
b9370cdb 952As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
b5ab1f6f 953
84678a67
SH
954=head3 _utf8_on
955
956 _utf8_on(STRING)
4411f3b6 957
b9370cdb
CBW
958[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
959is I<not> checked for containing only well-formed UTF-8. Do not use this
960unless you I<know with absolute certainty> that the STRING holds only
961well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
962don't treat the return value as indicating success or failure), or C<undef>
963if I<STRING> is not a string.
4411f3b6 964
b9370cdb 965B<NOTE>: For security reasons, this function does not work on tainted values.
64bc6d54 966
84678a67
SH
967=head3 _utf8_off
968
969 _utf8_off(STRING)
4411f3b6 970
b9370cdb
CBW
971[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
972frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
973I<STRING> is not a string. Do not treat the return value as indicative of
974success or failure, because that isn't what it means: it is only the
975previous setting.
4411f3b6 976
b9370cdb 977B<NOTE>: For security reasons, this function does not work on tainted values.
64bc6d54 978
2575c402 979=head1 UTF-8 vs. utf8 vs. UTF8
7f0d54d7
RGS
980
981 ....We now view strings not as sequences of bytes, but as sequences
982 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
983 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
984
b9370cdb
CBW
985That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
986first conceived by Ken Thompson when he invented it. However, thanks to
987later revisions to the applicable standards, official UTF-8 is now rather
988stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
989to cover only 21 bits instead of 32 or 64 bits) and some sequences
990are not allowed, like those used in surrogate pairs, the 32 non-character
991code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
992(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
7f0d54d7 993
b9370cdb
CBW
994The former default in which Perl would always use a loose interpretation of
995UTF-8 has now been overruled:
7f0d54d7
RGS
996
997 From: Larry Wall <larry@wall.org>
998 Date: December 04, 2004 11:51:58 JST
999 To: perl-unicode@perl.org
1000 Subject: Re: Make Encode.pm support the real UTF-8
1001 Message-Id: <20041204025158.GA28754@wall.org>
b9370cdb 1002
7f0d54d7
RGS
1003 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
1004 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
1005 : but "UTF-8" is the name of the standard and should give the
1006 : corresponding behaviour.
b9370cdb 1007
7f0d54d7
RGS
1008 For what it's worth, that's how I've always kept them straight in my
1009 head.
b9370cdb 1010
7f0d54d7
RGS
1011 Also for what it's worth, Perl 6 will mostly default to strict but
1012 make it easy to switch back to lax.
b9370cdb 1013
7f0d54d7
RGS
1014 Larry
1015
b9370cdb
CBW
1016Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
1017sense, which is conservative and strict and security-conscious, whereas
1018B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
1019lax. C<Encode> version 2.10 or later thus groks this subtle but critically
1020important distinction between C<"UTF-8"> and C<"utf8">.
7f0d54d7
RGS
1021
1022 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
1023 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
1024
b9370cdb
CBW
1025In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
1026name C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
1027critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
7f0d54d7
RGS
1028
1029 find_encoding("UTF-8")->name # is 'utf-8-strict'
1030 find_encoding("utf-8")->name # ditto. names are case insensitive
b9370cdb 1031 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7
RGS
1032 find_encoding("UTF8")->name # is 'utf8'.
1033
b9370cdb
CBW
1034Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
1035whether a string is internally encoded as "utf8", also without a hyphen.
7f0d54d7 1036
4411f3b6
NIS
1037=head1 SEE ALSO
1038
5d030b67
JH
1039L<Encode::Encoding>,
1040L<Encode::Supported>,
6d1c0808 1041L<Encode::PerlIO>,
5d030b67 1042L<encoding>,
6d1c0808
JH
1043L<perlebcdic>,
1044L<perlfunc/open>,
370462a2 1045L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
6d1c0808 1046L<utf8>,
84678a67 1047the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
4411f3b6 1048
85982a32 1049=head1 MAINTAINER
aae85ceb 1050
b9370cdb 1051This project was originated by the late Nick Ing-Simmons and later
fd172611 1052maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
b9370cdb
CBW
1053for a full list of people involved. For any questions, send mail to
1054I<< <perl-unicode@perl.org> >> so that we can all share.
aae85ceb 1055
b9370cdb
CBW
1056While Dan Kogai retains the copyright as a maintainer, credit
1057should go to all those involved. See AUTHORS for a list of those
1058who submitted code to the project.
d1256cb1
RGS
1059
1060=head1 COPYRIGHT
1061
28e02325 1062Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
d1256cb1
RGS
1063
1064This library is free software; you can redistribute it and/or modify
1065it under the same terms as Perl itself.
1066
4411f3b6 1067=cut