This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
check --whole-archive is supported before using it
[perl5.git] / cpan / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
b85802c5 2# $Id: Encode.pm,v 2.42 2010/12/31 22:48:10 dankogai Exp $
10c5ecbb 3#
package Encode;

use strict;
use warnings;

# "major.minor" version string built from the RCS Revision keyword.
our $VERSION = sprintf "%d.%02d", q$Revision: 2.42 $ =~ /(\d+)/g;

# Debug switch; an empty-prototype sub returning a literal, so
# "DEBUG and ..." statements cost nothing while it is 0.
sub DEBUG () { 0 }

# Pull in the XS part of this package.
use XSLoader ();
XSLoader::load( __PACKAGE__, $VERSION );

# Exporter supplies import() for the lists declared below.
require Exporter;
use base qw/Exporter/;
2c674647 14
# Public, encouraged API is exported by default
our @EXPORT = qw(
    decode
    decode_utf8
    encode
    encode_utf8
    str2bytes
    bytes2str
    encodings
    find_encoding
    clone_encoding
);

# Names of the individual CHECK flags.
our @FB_FLAGS = qw(
    DIE_ON_ERR
    WARN_ON_ERR
    RETURN_ON_ERR
    LEAVE_SRC
    PERLQQ
    HTMLCREF
    XMLCREF
    STOP_AT_PARTIAL
);

# Names of the FB_* fallback constants.
our @FB_CONSTS = qw(
    FB_DEFAULT
    FB_CROAK
    FB_QUIET
    FB_WARN
    FB_PERLQQ
    FB_HTMLCREF
    FB_XMLCREF
);

# Everything below is exported only on request.
our @EXPORT_OK = (
    qw(
      _utf8_off
      _utf8_on
      define_encoding
      from_to
      is_16bit
      is_8bit
      is_utf8
      perlio_ok
      resolve_alias
      utf8_downgrade
      utf8_upgrade
      ),
    @FB_FLAGS,
    @FB_CONSTS,
);

# Import tags: ":all", ":default", ":fallbacks" and ":fallback_all".
our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    default      => [@EXPORT],
    fallbacks    => [@FB_CONSTS],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
85982a32 43
4411f3b6 44# Documentation moved after __END__ for speed - NI-S
2c674647 45
# True when ord("A") is 193, i.e. on an EBCDIC platform.
our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating.
# %Encoding  : encoding name => encoding object (filled by define_encoding)
# %ExtModule : encoding name => name of the module to load on demand
our %Encoding;
our %ExtModule;
require Encode::Config;

# Encode::ConfigLocal is optional, so a failing require is ignored.
# Signal handlers are disabled inside the eval{}; for the reason see
# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    require Encode::ConfigLocal;
};
5129552c 62
# Encode->encodings( [ ":all" | MODULE ... ] )
#
# Returns the encoding names, sorted case-insensitively.
#   - no argument : the names currently in %Encoding
#   - ":all"      : those plus every name listed in %ExtModule
#   - MODULE ...  : those plus the %ExtModule names provided by the given
#                   modules ("Encode::" is prepended when "::" is absent)
# The names Internal, Unicode and Guess are never listed.
sub encodings {
    my ( $class, @wanted ) = @_;
    my %listed = %Encoding;

    if ( @wanted and $wanted[0] eq ":all" ) {
        %listed = ( %listed, %ExtModule );
    }
    else {
        my @modules = map { m/::/ ? $_ : "Encode::$_" } @wanted;
        for my $module (@modules) {
            DEBUG and warn $module;
            for my $name ( keys %ExtModule ) {
                next unless $ExtModule{$name} eq $module;
                $listed{$name} = $module;
            }
        }
    }

    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/ } keys %listed;
    return sort { lc $a cmp lc $b } @visible;
}
81
# perlio_ok( ENCODING )
#
# ENCODING is either an encoding object or an encoding name, which is
# resolved with find_encoding().  Returns whatever the object's own
# perlio_ok() method returns, or 0 when the encoding is unknown or the
# object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # An unknown name yields undef; answer "no" instead of dying on
    # a method call on an undefined value.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
87
# define_encoding( $object, $name [, alias ...] )
#
# Registers $object in %Encoding under $name.  When $name is not all
# lowercase its lowercased form becomes an alias, and so does every
# additional argument.  Returns $object.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    my $lowered = lc($name);
    if ( $lowered ne $name ) {
        define_alias( $lowered => $obj );
    }
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }
    return $obj;
}
100
# CLASS->getEncoding( $name [, $skip_external] )
#
# Resolves $name to an encoding object.  Tried in this order:
#   1. $name is already an object that can('renew')
#   2. %Encoding, under $name and then under lc($name)
#   3. $class->find_alias, with $name and then with lc($name)
#   4. unless $skip_external is true, the module listed in %ExtModule
#      is loaded and %Encoding is consulted again
# Returns nothing (undef / empty list) when the encoding is not found.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # A failing require is deliberately ignored; the lookup
            # below then simply finds nothing.
            eval { require $mod; };

            # The module may have been located through the lowercased
            # key, so look the result up under both spellings.
            for my $key ( $name, $lc ) {
                exists $Encoding{$key} and return $Encoding{$key};
            }
        }
    }
    return;
}
124
# find_encoding( $name [, $skip_external] )
#
# Function-style front end to getEncoding(), called on this package.
sub find_encoding($;$) {
    my $name          = $_[0];
    my $skip_external = $_[1];
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
129
# resolve_alias( $name )
#
# Returns the name() of the encoding object found for $name, or
# nothing when find_encoding() does not find one.
sub resolve_alias($) {
    my ($alias) = @_;
    my $encoding = find_encoding($alias);
    return unless defined $encoding;
    return $encoding->name;
}
135
# clone_encoding( $name )
#
# Returns a Storable::dclone() copy of the encoding object found for
# $name.  Returns nothing when no object is found or when Storable
# cannot be loaded.
sub clone_encoding($) {
    my ($name) = @_;
    my $original = find_encoding($name);
    return unless ref $original;

    eval { require Storable };
    return if $@;

    return Storable::dclone($original);
}
143
# encode( ENCODING, $string [, CHECK] )
#
# Hands $string to the encode() method of the encoding object found for
# ENCODING and returns the result.  Returns undef when $string is
# undef.  Croaks when ENCODING is undef or unknown.  When CHECK is a
# true non-reference value without the LEAVE_SRC bit, the caller's
# second argument is overwritten with what the encoder left in $string.
# Also available under the name str2bytes.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    defined $string or return undef;
    if ( ref $string ) {
        $string .= '';    # stringify;
    }
    $check ||= 0;

    if ( !defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $encoder = find_encoding($name);
    if ( !defined $encoder ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets = $encoder->encode( $string, $check );
    if ( $check && !ref $check && !( $check & LEAVE_SRC() ) ) {
        $_[1] = $string;
    }
    return $octets;
}
*str2bytes = \&encode;
4411f3b6 163
# decode( ENCODING, $octets [, CHECK] )
#
# Hands $octets to the decode() method of the encoding object found for
# ENCODING and returns the result.  Returns undef when $octets is
# undef.  Croaks when ENCODING is undef or unknown.  When CHECK is a
# true non-reference value without the LEAVE_SRC bit, the caller's
# second argument is overwritten with what the decoder left in $octets.
# Also available under the name bytes2str.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;    # stringify
    $check ||= 0;

    # Same guard as in encode(): report an undef name as such rather
    # than as "Unknown encoding ''" plus uninitialized-value warnings.
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode( $octets, $check );
    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() );
    return $string;
}
*bytes2str = \&decode;
4411f3b6 179
# from_to( $octets, FROM_ENC, TO_ENC [, CHECK] )
#
# Converts the caller's first argument in place: it is decoded with
# FROM_ENC (without CHECK) and the result is encoded with TO_ENC (with
# CHECK).  Croaks when either encoding is unknown.  Returns the length
# of the converted string, or undef when the input is undef, when the
# result is undef, or when CHECK is true and the encoder left part of
# the decoded text unconsumed.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    defined $string or return undef;
    $check ||= 0;

    my $source = find_encoding($from);
    if ( !defined $source ) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $target = find_encoding($to);
    if ( !defined $target ) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $chars = $source->decode($string);
    $string = $target->encode( $chars, $check );
    $_[0]   = $string;

    if ( $check && length($chars) ) {
        return undef;
    }
    return undef unless defined( $_[0] );
    return length($string);
}
199
# encode_utf8( $string )
#
# Returns a copy of $string converted to UTF-8 octets with
# utf8::encode(); the caller's variable is left untouched.
# Returns undef when $string is undef, like encode() does.
sub encode_utf8($) {
    my ($str) = @_;
    return undef unless defined $str;
    utf8::encode($str);
    return $str;
}
205
# Encoding object for 'utf8', looked up on first use by decode_utf8().
my $utf8enc;

# decode_utf8( $octets [, CHECK] )
#
# Returns $octets unchanged when is_utf8() is already true for it, and
# undef when it is undef.  Otherwise decodes it with the 'utf8'
# encoding object.  When CHECK is a true non-reference value without
# the LEAVE_SRC bit, the caller's first argument is overwritten with
# what the decoder left in $octets.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;
    if ( is_utf8($octets) ) {
        return $octets;
    }
    defined $octets or return undef;
    if ( ref $octets ) {
        $octets .= '';
    }
    $check   ||= 0;
    $utf8enc ||= find_encoding('utf8');

    my $string = $utf8enc->decode( $octets, $check );
    if ( $check && !ref $check && !( $check & LEAVE_SRC() ) ) {
        $_[0] = $octets;
    }
    return $string;
}
219
45a6a02c
SH
220# sub decode_utf8($;$) {
221# my ( $str, $check ) = @_;
222# return $str if is_utf8($str);
223# if ($check) {
224# return decode( "utf8", $str, $check );
225# }
226# else {
227# return decode( "utf8", $str );
228# return $str;
229# }
230# }
231
# Install the built-in encodings at load time, using the XS codecs.
predefine_encodings(1);

#
# predefine_encodings( $use_xs )
#
# Installs the encodings implemented in this file and stores their
# objects in %Encode::Encoding:
#   Unicode      - Encode::UTF_EBCDIC when $ON_EBCDIC, else Encode::Internal
#   utf8         - Encode::utf8
#   utf-8-strict - Encode::utf8 with strict_utf8 set
# With a true $use_xs, Encode::utf8 uses decode_xs/encode_xs; otherwise
# it wraps Encode::decode_utf8 and Encode::encode_utf8.
# This is to restore %Encoding if really needed;
#
sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';
    my ($use_xs) = @_;

    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';

        # Passes every character through utf8::unicode_to_native.
        *decode = sub {
            my ( $self, $input, $check ) = @_;
            my $output = '';
            my $at     = 0;
            while ( $at < length($input) ) {
                my $ordinal = ord( substr( $input, $at, 1 ) );
                $output .= chr( utf8::unicode_to_native($ordinal) );
                $at++;
            }
            $_[1] = '' if $check;    # source counts as fully consumed
            return $output;
        };

        # Passes every character through utf8::native_to_unicode.
        *encode = sub {
            my ( $self, $input, $check ) = @_;
            my $output = '';
            my $at     = 0;
            while ( $at < length($input) ) {
                my $ordinal = ord( substr( $input, $at, 1 ) );
                $output .= chr( utf8::native_to_unicode($ordinal) );
                $at++;
            }
            $_[1] = '' if $check;    # source counts as fully consumed
            return $output;
        };

        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # One routine serves both directions: it returns an upgraded
        # copy of the string.
        my $upgraded_copy = sub {
            my ( $self, $text, $check ) = @_;
            utf8::upgrade($text);
            $_[1] = '' if $check;
            return $text;
        };
        *decode = $upgraded_copy;
        *encode = $upgraded_copy;

        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";

            # Pure-Perl fallback; the check value only decides whether
            # the caller's source is emptied.
            *decode = sub {
                my ( $self, $octets, $check ) = @_;
                my $decoded = Encode::decode_utf8($octets);
                return undef unless defined $decoded;
                $_[1] = '' if $check;
                return $decoded;
            };
            *encode = sub {
                my ( $self, $text, $check ) = @_;
                my $encoded = Encode::encode_utf8($text);
                $_[1] = '' if $check;
                return $encoded;
            };
        }

        # ($obj, $dst, $src, $pos, $trm, $chk)
        # Appends to $dst the part of $src that starts at $pos and ends
        # with the terminator $trm, and moves $pos past it; returns 1.
        # Without a terminator the rest of $src is appended, $pos is
        # set to the length of $src and '' is returned.
        # currently ignores $chk
        *cat_decode = sub {
            my ( $self, undef, undef, $pos, $trm ) = @_;

            # $dst, $src and $pos are updated in the caller's variables.
            my ( $dst_ref, $src_ref, $pos_ref ) =
              ( \$_[1], \$_[2], \$_[3] );
            use bytes;

            my $found_at = index( $$src_ref, $trm, $pos );
            if ( $found_at < 0 ) {
                $$dst_ref .= substr( $$src_ref, $pos );
                $$pos_ref = length($$src_ref);
                return '';
            }

            my $end = $found_at + length($trm);
            $$dst_ref .= substr( $$src_ref, $pos, $end - $pos );
            $$pos_ref = $end;
            return 1;
        };

        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 } =>
          "Encode::utf8";
    }
}
341
656753f8
NIS
3421;
343
2a936312
NIS
344__END__
345
4411f3b6
NIS
346=head1 NAME
347
348Encode - character encodings
349
350=head1 SYNOPSIS
351
352 use Encode;
353
67d7b5ef
JH
354=head2 Table of Contents
355
0ab8f81e 356Encode consists of a collection of modules whose details are too big
67d7b5ef 357to fit in one document. This POD itself explains the top-level APIs
6d1c0808 358and general topics at a glance. For other topics and more details,
0ab8f81e 359see the PODs below:
67d7b5ef
JH
360
361 Name Description
362 --------------------------------------------------------
6d1c0808 363 Encode::Alias Alias definitions to encodings
67d7b5ef
JH
364 Encode::Encoding Encode Implementation Base Class
365 Encode::Supported List of Supported Encodings
366 Encode::CN Simplified Chinese Encodings
367 Encode::JP Japanese Encodings
368 Encode::KR Korean Encodings
369 Encode::TW Traditional Chinese Encodings
370 --------------------------------------------------------
371
4411f3b6
NIS
372=head1 DESCRIPTION
373
47bfe92f 374The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
375and the rest of the system. Perl strings are sequences of
376B<characters>.
377
378The repertoire of characters that Perl can represent is at least that
379defined by the Unicode Consortium. On most platforms the ordinal
380values of the characters (as returned by C<ord(ch)>) is the "Unicode
381codepoint" for the character (the exceptions are those platforms where
382the legacy encoding is some variant of EBCDIC rather than a super-set
383of ASCII - see L<perlebcdic>).
384
0ab8f81e 385Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef
JH
386often called "bytes". These chunks are also known as "octets" in
387networking standards. Perl is widely used to manipulate data of many
388types - not only strings of characters representing human or computer
0ab8f81e 389languages but also "binary" data being the machine's representation of
67d7b5ef
JH
390numbers, pixels in an image - or just about anything.
391
0ab8f81e 392When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 393process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 394byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
395"logical character".
396
397=head2 TERMINOLOGY
4411f3b6 398
7e19fb92 399=over 2
21938dfa 400
67d7b5ef
JH
401=item *
402
403I<character>: a character in the range 0..(2**32-1) (or more).
404(What Perl's strings are made of.)
405
406=item *
407
408I<byte>: a character in the range 0..255
409(A special case of a Perl character.)
410
411=item *
412
413I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 414(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef
JH
415
416=back
4411f3b6 417
67d7b5ef 418=head1 PERL ENCODING API
4411f3b6 419
7e19fb92 420=over 2
4411f3b6 421
b7a5c9de 422=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 423
0ab8f81e 424Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 425a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e
JH
426an alias. For encoding names and aliases, see L</"Defining Aliases">.
427For CHECK, see L</"Handling Malformed Data">.
4411f3b6 428
b7a5c9de 429For example, to convert a string from Perl's internal format to
6d1c0808 430iso-8859-1 (also known as Latin1),
681a7c68 431
b7a5c9de 432 $octets = encode("iso-8859-1", $string);
7e19fb92 433
44b3b9c7
SP
434B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
435$octets B<may not be equal to> $string. Though they both contain the
436same data, the UTF8 flag for $octets is B<always> off. When you
437encode anything, UTF8 flag of the result is always off, even when it
438contains completely valid utf8 string. See L</"The UTF8 flag"> below.
681a7c68 439
7f0d54d7 440If the $string is C<undef> then C<undef> is returned.
4089adc4 441
b7a5c9de 442=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 443
0ab8f81e
JH
444Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
445internal form and returns the resulting string. As in encode(),
446ENCODING can be either a canonical name or an alias. For encoding names
447and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f
JH
448L</"Handling Malformed Data">.
449
b7a5c9de 450For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 451
b7a5c9de 452 $string = decode("iso-8859-1", $octets);
681a7c68 453
b7a5c9de
JH
454B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
455B<may not be equal to> $octets. Though they both contain the same data,
2575c402
JW
456the UTF8 flag for $string is on unless $octets entirely consists of
457ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag">
7e19fb92 458below.
47bfe92f 459
If $octets is C<undef> then C<undef> is returned.
4089adc4 461
44b3b9c7
SP
462=item [$obj =] find_encoding(ENCODING)
463
Returns the I<encoding object> corresponding to ENCODING. Returns
undef if no matching ENCODING is found.
466
467This object is what actually does the actual (en|de)coding.
468
469 $utf8 = decode($name, $bytes);
470
471is in fact
472
473 $utf8 = do{
474 $obj = find_encoding($name);
475 croak qq(encoding "$name" not found) unless ref $obj;
476 $obj->decode($bytes)
477 };
478
479with more error checking.
480
481Therefore you can save time by reusing this object as follows;
482
  my $enc = find_encoding("iso-8859-1");
  while(<>){
     my $utf8 = $enc->decode($_);
     # and do something with $utf8;
  }
488
Besides C<< ->decode >> and C<< ->encode >>, other methods are
available as well. For instance, C<< ->name >> returns the canonical
name of the encoding object.
492
493 find_encoding("latin1")->name; # iso-8859-1
494
495See L<Encode::Encoding> for details.
496
b7a5c9de 497=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 498
b7a5c9de
JH
499Converts B<in-place> data between two encodings. The data in $octets
500must be encoded as octets and not as characters in Perl's internal
f9d05ba3
RGS
501format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
502encoding:
2b106fbe 503
b7a5c9de 504 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe
JH
505
506and to convert it back:
507
b7a5c9de 508 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 509
ab97ca19 510Note that because the conversion happens in place, the data to be
0ab8f81e 511converted cannot be a string constant; it must be a scalar variable.
ab97ca19 512
f9d05ba3
RGS
513from_to() returns the length of the converted string in octets on
514success, I<undef> on error.
3ef515df 515
b7a5c9de 516B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 517
b7a5c9de 518 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 519 $data = decode("iso-8859-1", $data); #2
4411f3b6 520
b7a5c9de 521Both #1 and #2 make $data consist of a completely valid UTF-8 string
2575c402 522but only #2 turns UTF8 flag on. #1 is equivalent to
f2a2953c 523
7e19fb92 524 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 525
2575c402 526See L</"The UTF8 flag"> below.
f2a2953c 527
7828f908
RGS
528Also note that
529
530 from_to($octets, $from, $to, $check);
531
532is equivalent to
533
534 $octets = encode($to, decode($from, $octets), $check);
535
536Yes, it does not respect the $check during decoding. It is
537deliberately done that way. If you need minute control, C<decode>
538then C<encode> as follows;
539
540 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
541
f2a2953c
JH
542=item $octets = encode_utf8($string);
543
7e19fb92 544Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de
JH
545that comprise $string are encoded in Perl's internal format and the
546result is returned as a sequence of octets. All possible
7e19fb92
JH
547characters have a UTF-8 representation so this function cannot fail.
548
f2a2953c
JH
549
550=item $string = decode_utf8($octets [, CHECK]);
551
7e19fb92 552equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 553The sequence of octets represented by
7e19fb92
JH
554$octets is decoded from UTF-8 into a sequence of logical
555characters. Not all sequences of octets form valid UTF-8 encodings, so
556it is possible for this call to fail. For CHECK, see
557L</"Handling Malformed Data">.
f2a2953c
JH
558
559=back
560
51ef4e11
NIS
561=head2 Listing available encodings
562
5129552c
JH
563 use Encode;
564 @list = Encode->encodings();
565
566Returns a list of the canonical names of the available encodings that
567are loaded. To get a list of all available encodings including the
568ones that are not loaded yet, say
569
570 @all_encodings = Encode->encodings(":all");
571
0ab8f81e 572Or you can give the name of a specific module.
5129552c 573
c731e18e
JH
574 @with_jp = Encode->encodings("Encode::JP");
575
576When "::" is not in the name, "Encode::" is assumed.
51ef4e11 577
c731e18e 578 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 579
0ab8f81e 580To find out in detail which encodings are supported by this package,
5d030b67 581see L<Encode::Supported>.
51ef4e11
NIS
582
583=head2 Defining Aliases
584
0ab8f81e 585To add a new alias to a given encoding, use:
67d7b5ef 586
5129552c
JH
587 use Encode;
588 use Encode::Alias;
a63c962f 589 define_alias(newName => ENCODING);
51ef4e11 590
3ef515df 591After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
592ENCODING may be either the name of an encoding or an
593I<encoding object>
51ef4e11 594
fcb875d4
JH
595But before you do so, make sure the alias is nonexistent with
596C<resolve_alias()>, which returns the canonical name thereof.
597i.e.
598
599 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
600 Encode::resolve_alias("iso-8859-12") # false; nonexistent
601 Encode::resolve_alias($name) eq $name # true if $name is canonical
602
0ab8f81e
JH
603resolve_alias() does not need C<use Encode::Alias>; it can be
604exported via C<use Encode qw(resolve_alias)>.
fcb875d4 605
0ab8f81e 606See L<Encode::Alias> for details.
51ef4e11 607
742555bd
SP
608=head2 Finding IANA Character Set Registry names
609
The canonical name of a given encoding does not necessarily agree with
the name in the IANA Character Set Registry, commonly seen as
C<< Content-Type: text/plain; charset=I<whatever> >>. In most cases the
canonical name works, but sometimes it does not (notably 'utf-8-strict').
614
615Therefore as of Encode version 2.21, a new method C<mime_name()> is added.
616
617 use Encode;
618 my $enc = find_encoding('UTF-8');
619 warn $enc->name; # utf-8-strict
620 warn $enc->mime_name; # UTF-8
621
622See also: L<Encode::Encoding>
623
85982a32 624=head1 Encoding via PerlIO
4411f3b6 625
44b3b9c7
SP
626If your perl supports I<PerlIO> (which is the default), you can use a
627PerlIO layer to decode and encode directly via a filehandle. The
628following two examples are totally identical in their functionality.
4411f3b6 629
85982a32
JH
630 # via PerlIO
631 open my $in, "<:encoding(shiftjis)", $infile or die;
632 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 633 while(<$in>){ print $out $_; }
8e86646e 634
85982a32 635 # via from_to
0ab8f81e
JH
636 open my $in, "<", $infile or die;
637 open my $out, ">", $outfile or die;
b7a5c9de 638 while(<$in>){
0ab8f81e 639 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 640 print $out $_;
85982a32 641 }
4411f3b6 642
Unfortunately, not all encodings are PerlIO-savvy. You can check
if your encoding is supported by PerlIO by calling the C<perlio_ok>
method.
646
647 Encode::perlio_ok("hz"); # False
648 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
649
650 use Encode qw(perlio_ok); # exported upon request
651 perlio_ok("euc-jp")
4411f3b6 652
0ab8f81e 653Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3
RGS
654except for hz and ISO-2022-kr. For gory details, see
655L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 656
85982a32 657=head1 Handling Malformed Data
4411f3b6 658
8e180e82
SP
659The optional I<CHECK> argument tells Encode what to do when it
660encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
661is assumed.
662
663As of version 2.12 Encode supports coderef values for CHECK. See below.
f9d05ba3
RGS
664
665=over 2
666
=item B<NOTE:> Not all encodings support this feature
f9d05ba3
RGS
668
669Some encodings ignore I<CHECK> argument. For example,
670L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
671
672=back
673
674Now here is the list of I<CHECK> values available
47bfe92f 675
151b5d36
JH
676=over 2
677
85982a32 678=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 679
f9d05ba3 680If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
78589665
RGS
681place of a malformed character. When you encode, E<lt>subcharE<gt>
682will be used. When you decode the code point C<0xFFFD> is used. If
683the data is supposed to be UTF-8, an optional lexical warning
684(category utf8) is given.
e9692b5b 685
7e19fb92 686=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 687
b7a5c9de 688If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 689message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 690error with eval{} unless you really want to let it die.
47bfe92f 691
85982a32 692=item I<CHECK> = Encode::FB_QUIET
47bfe92f 693
85982a32 694If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3
RGS
695return the portion of the data that has been processed so far when an
696error occurs. The data argument will be overwritten with everything
697after that point (that is, the unprocessed part of data). This is
698handy when you have to call decode repeatedly in the case where your
699source data may contain partial multi-byte character sequences,
700(i.e. you are reading with a fixed-width buffer). Here is a sample
701code that does exactly this:
4411f3b6 702
78589665
RGS
703 my $buffer = ''; my $string = '';
704 while(read $fh, $buffer, 256, length($buffer)){
705 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
706 # $buffer now contains the unprocessed partial character
85982a32 707 }
1768d7eb 708
85982a32 709=item I<CHECK> = Encode::FB_WARN
67d7b5ef 710
0ab8f81e
JH
711This is the same as above, except that it warns on error. Handy when
712you are debugging the mode above.
85982a32
JH
713
714=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
715
af1f55d9
JH
716=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
717
718=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
719
85982a32
JH
720For encodings that are implemented by Encode::XS, CHECK ==
721Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
722
b7a5c9de
JH
723When you decode, C<\xI<HH>> will be inserted for a malformed character,
724where I<HH> is the hex representation of the octet that could not be
725decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
726where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 727in the character repertoire of the encoding.
85982a32 728
af1f55d9 729HTML/XML character reference modes are about the same, in place of
78589665
RGS
730C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
731XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 732
7f0d54d7
RGS
733In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
734
85982a32
JH
735=item The bitmask
736
0ab8f81e
JH
737These modes are actually set via a bitmask. Here is how the FB_XX
738constants are laid out. You can import the FB_XX constants via
739C<use Encode qw(:fallbacks)>; you can import the generic bitmask
740constants via C<use Encode qw(:fallback_all)>.
85982a32 741
b0b300a3
JH
742 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
743 DIE_ON_ERR 0x0001 X
4089adc4 744 WARN_ON_ERR 0x0002 X
b0b300a3 745 RETURN_ON_ERR 0x0004 X X
7f0d54d7 746 LEAVE_SRC 0x0008 X
b0b300a3 747 PERLQQ 0x0100 X
b7a5c9de
JH
748 HTMLCREF 0x0200
749 XMLCREF 0x0400
67d7b5ef 750
151b5d36
JH
751=back
752
44b3b9c7
SP
753=over 2
754
51e4e64d
NC
755=item Encode::LEAVE_SRC
756
757If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second
758argument to C<encode()> or C<decode()> may be assigned to by the functions. If
759you're not interested in this, then bitwise-or the bitmask with it.
760
44b3b9c7
SP
761=back
762
0dbed2e5 763=head2 coderef for CHECK
8e180e82
SP
764
As of Encode 2.12 CHECK can also be a code reference which takes the
ord value of the unmapped character as an argument and returns a string
that represents the fallback character. For instance,
67d7b5ef 768
8e180e82 769 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 770
8e180e82
SP
771Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
772\x{I<XXXX>}.
982a4085 773
67d7b5ef
JH
774=head1 Defining Encodings
775
776To define a new encoding, use:
777
b7a5c9de 778 use Encode qw(define_encoding);
67d7b5ef
JH
779 define_encoding($object, 'canonicalName' [, alias...]);
780
781I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 782should provide the interface described in L<Encode::Encoding>.
67d7b5ef 783If more than two arguments are provided then additional
b7a5c9de 784arguments are taken as aliases for I<$object>.
67d7b5ef 785
f2a2953c
JH
786See L<Encode::Encoding> for more details.
787
2575c402 788=head1 The UTF8 flag
7e19fb92 789
Before the introduction of Unicode support in perl, the C<eq> operator
just compared the strings represented by two scalars. Beginning with
perl 5.8, C<eq> compares two strings with simultaneous consideration of
I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of
C<Programming Perl, 3rd ed.>
7e19fb92
JH
795
796=over 2
797
798=item Goal #1:
799
800Old byte-oriented programs should not spontaneously break on the old
801byte-oriented data they used to work on.
802
803=item Goal #2:
804
805Old byte-oriented programs should magically start working on the new
806character-oriented data when appropriate.
807
808=item Goal #3:
809
810Programs should run just as fast in the new character-oriented mode
811as in the old byte-oriented mode.
812
813=item Goal #4:
814
815Perl should remain one language, rather than forking into a
816byte-oriented Perl and a character-oriented Perl.
817
818=back
819
820Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
821was born and many features documented in the book remained
b7a5c9de 822unimplemented for a long time. Perl 5.8 corrected this and the introduction
2575c402
JW
823of the UTF8 flag is one of them. You can think of this perl notion as of a
824byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8
7e19fb92
JH
825flag on).
826
2575c402 827Here is how Encode takes care of the UTF8 flag.
7e19fb92 828
4bdf5738 829=over 2
7e19fb92
JH
830
831=item *
832
2575c402 833When you encode, the resulting UTF8 flag is always off.
7e19fb92 834
151b5d36 835=item *
7e19fb92 836
2575c402 837When you decode, the resulting UTF8 flag is on unless you can
7e19fb92
JH
838unambiguously represent data. Here is the definition of
839dis-ambiguity.
840
b7a5c9de 841After C<$utf8 = decode('foo', $octet);>,
7e19fb92 842
2575c402 843 When $octet is... The UTF8 flag in $utf8 is
7e19fb92
JH
844 ---------------------------------------------
845 In ASCII only (or EBCDIC only) OFF
846 In ISO-8859-1 ON
847 In any other Encoding ON
848 ---------------------------------------------
849
3c4b39be 850As you see, there is one exception, In ASCII. That way you can assume
7e19fb92
JH
851Goal #1. And with Encode Goal #2 is assumed but you still have to be
852careful in such cases mentioned in B<CAVEAT> paragraphs.
853
2575c402 854This UTF8 flag is not visible in perl scripts, exactly for the same
7e19fb92
JH
855reason you cannot (or you I<don't have to>) see if a scalar contains a
856string, integer, or floating point number. But you can still peek
857and poke these if you will. See the section below.
858
859=back
860
861=head2 Messing with Perl's Internals
4411f3b6 862
47bfe92f 863The following API uses parts of Perl's internals in the current
0ab8f81e 864implementation. As such, they are efficient but may change.
4411f3b6 865
7e19fb92 866=over 2
4411f3b6 867
a63c962f 868=item is_utf8(STRING [, CHECK])
4411f3b6 869
2575c402 870[INTERNAL] Tests whether the UTF8 flag is turned on in the STRING.
47bfe92f
JH
871If CHECK is true, also checks the data in STRING for being well-formed
872UTF-8. Returns true if successful, false otherwise.
4411f3b6 873
2c246b25 874As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 875
a63c962f 876=item _utf8_on(STRING)
4411f3b6 877
2575c402 878[INTERNAL] Turns on the UTF8 flag in STRING. The data in STRING is
4411f3b6
NIS
879B<not> checked for being well-formed UTF-8. Do not use unless you
880B<know> that the STRING is well-formed UTF-8. Returns the previous
2575c402 881state of the UTF8 flag (so please don't treat the return value as
0ab8f81e 882indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 883
64bc6d54
SH
884This function does not work on tainted values.
885
a63c962f 886=item _utf8_off(STRING)
4411f3b6 887
2575c402
JW
888[INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously.
889Returns the previous state of the UTF8 flag (so please don't treat the
0ab8f81e 890return value as indicating success or failure), or C<undef> if STRING is
4411f3b6
NIS
891not a string.
892
64bc6d54
SH
893This function does not work on tainted values.
894
4411f3b6
NIS
895=back
896
2575c402 897=head1 UTF-8 vs. utf8 vs. UTF8
7f0d54d7
RGS
898
899 ....We now view strings not as sequences of bytes, but as sequences
900 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
901 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
902
That has been perl's notion of UTF-8, but official UTF-8 is stricter:
its range is much narrower (0 .. 10FFFF) and some sequences are
not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
906
907Now that is overruled by Larry Wall himself.
908
909 From: Larry Wall <larry@wall.org>
910 Date: December 04, 2004 11:51:58 JST
911 To: perl-unicode@perl.org
912 Subject: Re: Make Encode.pm support the real UTF-8
913 Message-Id: <20041204025158.GA28754@wall.org>
914
915 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
916 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
917 : but "UTF-8" is the name of the standard and should give the
918 : corresponding behaviour.
919
920 For what it's worth, that's how I've always kept them straight in my
921 head.
8e180e82 922
7f0d54d7
RGS
923 Also for what it's worth, Perl 6 will mostly default to strict but
924 make it easy to switch back to lax.
925
926 Larry
927
Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
while B<utf8> means liberal, lax, version thereof. And Encode version
2.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
931
932 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
933 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
934
935C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
936Yes, the hyphen between "UTF" and "8" is important. Without it Encode
937goes "liberal"
938
939 find_encoding("UTF-8")->name # is 'utf-8-strict'
940 find_encoding("utf-8")->name # ditto. names are case insensitive
50c1ac04 941 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7
RGS
942 find_encoding("UTF8")->name # is 'utf8'.
943
2575c402
JW
The UTF8 flag is internally called UTF8, without a hyphen. It indicates
whether a string is internally encoded as utf8, also without a hyphen.
7f0d54d7 946
4411f3b6
NIS
947=head1 SEE ALSO
948
5d030b67
JH
949L<Encode::Encoding>,
950L<Encode::Supported>,
6d1c0808 951L<Encode::PerlIO>,
5d030b67 952L<encoding>,
6d1c0808
JH
953L<perlebcdic>,
954L<perlfunc/open>,
L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
6d1c0808 956L<utf8>,
5d030b67 957the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 958
85982a32 959=head1 MAINTAINER
aae85ceb
DK
960
961This project was originated by Nick Ing-Simmons and later maintained
7e19fb92
JH
962by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
963list of people involved. For any questions, use
b7a5c9de 964E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 965
d1256cb1
RGS
While Dan Kogai retains the copyright as a maintainer, the credit
should go to all those involved. See AUTHORS for those who submitted
code.
969
970=head1 COPYRIGHT
971
972Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt>
973
974This library is free software; you can redistribute it and/or modify
975it under the same terms as Perl itself.
976
4411f3b6 977=cut