This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Encode-2.29.
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
a37eaad4 2# $Id: Encode.pm,v 2.29 2009/02/01 13:10:07 dankogai Exp $
10c5ecbb 3#
2c674647 4package Encode;
51ef4e11 5use strict;
656ebd29 6use warnings;
a37eaad4 7our $VERSION = sprintf "%d.%02d", q$Revision: 2.29 $ =~ /(\d+)/g;
8f139f4c 8sub DEBUG () { 0 }
6d1c0808 9use XSLoader ();
d1256cb1 10XSLoader::load( __PACKAGE__, $VERSION );
2c674647 11
2c674647 12require Exporter;
7e19fb92 13use base qw/Exporter/;
2c674647 14
4411f3b6 15# Public, encouraged API is exported by default
85982a32
JH
16
17our @EXPORT = qw(
0a8c69ed 18 decode decode_utf8 encode encode_utf8 str2bytes bytes2str
a0d8a30e 19 encodings find_encoding clone_encoding
4411f3b6 20);
d1256cb1
RGS
21our @FB_FLAGS = qw(
22 DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
23 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
24);
25our @FB_CONSTS = qw(
26 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
27 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
28);
29our @EXPORT_OK = (
30 qw(
31 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
32 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
85982a32 33 ),
d1256cb1
RGS
34 @FB_FLAGS, @FB_CONSTS,
35);
85982a32 36
d1256cb1
RGS
37our %EXPORT_TAGS = (
38 all => [ @EXPORT, @EXPORT_OK ],
0263186c
NC
39 default => [ @EXPORT ],
40 fallbacks => [ @FB_CONSTS ],
d1256cb1
RGS
41 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
42);
85982a32 43
4411f3b6 44# Documentation moved after __END__ for speed - NI-S
2c674647 45
d1256cb1 46our $ON_EBCDIC = ( ord("A") == 193 );
f2a2953c 47
5d030b67
JH
48use Encode::Alias;
49
5129552c
JH
50# Make a %Encoding package variable to allow a certain amount of cheating
51our %Encoding;
aae85ceb
DK
52our %ExtModule;
53require Encode::Config;
2fd0906e
SH
54# See
55# https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
56# to find why sig handlers inside eval{} are disabled.
57eval {
58 local $SIG{__DIE__};
59 local $SIG{__WARN__};
60 require Encode::ConfigLocal;
61};
5129552c 62
# Return the names of known encodings, sorted case-insensitively.
#
#   Encode->encodings()          names registered in %Encoding
#   Encode->encodings(":all")    the above plus every name in %ExtModule
#   Encode->encodings("JP", ...) the above plus the %ExtModule names that
#                                map to the given modules; "Encode::" is
#                                prepended to a module name without "::"
#
# The entries named Internal, Unicode and Guess are never listed.
sub encodings {
    my ( $class, @wanted ) = @_;
    my %known = %Encoding;
    if ( @wanted and $wanted[0] eq ":all" ) {
        %known = ( %Encoding, %ExtModule );
    }
    else {
        for my $arg (@wanted) {
            my $module = $arg =~ m/::/ ? $arg : "Encode::$arg";
            DEBUG and warn $module;
            for my $name ( keys %ExtModule ) {
                $known{$name} = $module if $ExtModule{$name} eq $module;
            }
        }
    }
    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %known;
}
81
d1256cb1
RGS
# Report whether an encoding can be used as a PerlIO layer.
#
# Accepts either an encoding object (any reference) or an encoding name,
# which is resolved through find_encoding().  Returns whatever the object's
# own perlio_ok() method returns, or 0 when the encoding is unknown or the
# object has no such method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; without this guard
    # the method call below would die instead of reaching the safety net.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
87
# Register an encoding object under its canonical name.
#
#   define_encoding($obj, $name, @aliases)
#
# Stores $obj in %Encoding under $name, defines the lower-cased name as an
# alias when it differs from $name, and defines every additional argument
# as an alias too.  Returns $obj.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;
    $Encoding{$name} = $obj;

    my $lc = lc $name;
    if ( $lc ne $name ) {
        define_alias( $lc => $obj );
    }
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }
    return $obj;
}
100
d1256cb1
RGS
# Resolve an encoding name (or encoding object) to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. a reference that can('renew') is returned unchanged;
#   2. %Encoding by exact name, then by lower-cased name;
#   3. $class->find_alias() by exact name, then by lower-cased name;
#   4. unless $skip_external is true, the module listed in %ExtModule for
#      the name (exact or lower-cased) is loaded and %Encoding is checked
#      again.
# Returns nothing (undef / empty list) when no encoding is found or when
# $name is undefined.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    # An undefined name can never match; bail out before it is used as a
    # hash key and triggers "uninitialized" warnings.
    defined($name) or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Loading is best effort: a failed require simply means the
            # lookups below find nothing.
            eval { require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been located through the lower-cased
            # key, so mirror the pre-load lookup and try that key as well.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
124
d1256cb1
RGS
# Functional front end to Encode->getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns the encoding object for $name, or nothing when it is unknown.
sub find_encoding($;$) {
    return __PACKAGE__->getEncoding( @_[ 0, 1 ] );
}
129
# Map an encoding name or alias to the name reported by its encoding
# object.  Returns nothing when find_encoding() does not know the name.
sub resolve_alias($) {
    my $enc = find_encoding( shift @_ );
    return unless defined $enc;
    return $enc->name;
}
135
# Return a deep copy (Storable::dclone) of the encoding object for the
# given name.  Returns nothing when the encoding is unknown or Storable
# cannot be loaded.
sub clone_encoding($) {
    my $enc = find_encoding( shift @_ );
    return unless ref $enc;

    # Storable is loaded lazily; its absence is not an error here.
    eval { require Storable };
    return if $@;

    return Storable::dclone($enc);
}
143
d1256cb1
RGS
# Encode a string into octets.
#
#   $octets = encode($name, $string [, $check])
#
# Returns undef when $string is undefined and croaks when $name is not a
# known encoding.  When $check is a true non-reference value without the
# LEAVE_SRC bit, the caller's second argument is overwritten with the
# working copy of the string as left by the encoding object.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    if ( !defined $string ) {
        return undef;
    }
    if ( ref $string ) {
        $string .= '';    # stringify
    }
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets = $enc->encode( $string, $check );
    if ( $check and !ref $check and !( $check & LEAVE_SRC() ) ) {
        $_[1] = $string;
    }
    return $octets;
}
0a8c69ed 158*str2bytes = \&encode;
4411f3b6 159
d1256cb1
RGS
# Decode octets into a string.
#
#   $string = decode($name, $octets [, $check])
#
# Returns undef when $octets is undefined and croaks when $name is not a
# known encoding.  When $check is a true non-reference value without the
# LEAVE_SRC bit, the caller's second argument is overwritten with the
# working copy of the octets as left by the encoding object.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    if ( !defined $octets ) {
        return undef;
    }
    if ( ref $octets ) {
        $octets .= '';    # stringify
    }
    $check = 0 unless $check;

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $string = $enc->decode( $octets, $check );
    if ( $check and !ref $check and !( $check & LEAVE_SRC() ) ) {
        $_[1] = $octets;
    }
    return $string;
}
0a8c69ed 174*bytes2str = \&decode;
4411f3b6 175
d1256cb1
RGS
# Convert data between two encodings in place.
#
#   $length = from_to($octets, $from, $to [, $check])
#
# The first argument is overwritten with the converted data.  $check is
# passed to the encoding step only; decoding is done without it.  Croaks
# when either encoding is unknown.  Returns undef when the input is
# undefined, when $check is true and undecoded text remains, or when the
# result is undefined; otherwise returns the length of the result.
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    if ( !defined $string ) {
        return undef;
    }
    $check = 0 unless $check;

    # Resolve the source encoding first so that it is the one reported
    # when both names are unknown.
    my $f = find_encoding($from);
    defined $f or do {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    };
    my $t = find_encoding($to);
    defined $t or do {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    };

    my $uni = $f->decode($string);
    $_[0] = $string = $t->encode( $uni, $check );
    if ( $check && length($uni) ) {
        return undef;
    }
    return defined( $_[0] ) ? length($string) : undef;
}
195
# Return a UTF-8 encoded copy of the argument; the caller's variable is
# not modified.
sub encode_utf8($) {
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
201
d1256cb1
RGS
# Decode UTF-8 octets into a string.
#
#   $string = decode_utf8($octets [, $check])
#
# A value for which is_utf8() is already true is returned unchanged.
# Anything else is handed to decode("utf8", ...).  A false or missing
# $check is passed straight through, because decode() itself normalises
# it to 0.
sub decode_utf8($;$) {
    my ( $str, $check ) = @_;
    return $str if is_utf8($str);
    return decode( "utf8", $str, $check );
}
213
b536bf57 214predefine_encodings(1);
f2a2953c
JH
215
216#
217# This is to restore %Encoding if really needed;
218#
10c5ecbb 219
# (Re)install the built-in encodings in %Encode::Encoding.
#
#   predefine_encodings($use_xs)
#
# Registers "Unicode" (backed by Encode::UTF_EBCDIC when $ON_EBCDIC is
# true, by Encode::Internal otherwise), "utf8" and "utf-8-strict".  When
# $use_xs is true the Encode::utf8 encode/decode methods are bound to
# encode_xs/decode_xs; otherwise pure-Perl fallbacks are installed.
sub predefine_encodings {
    require Encode::Encoding;
    no warnings 'redefine';    # methods are re-bound on every call
    my ($use_xs) = @_;

    if ($ON_EBCDIC) {

        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';

        # Map every character through utf8::unicode_to_native.
        *decode = sub {
            my ( $obj, $str, $chk ) = @_;
            my $res = '';
            for my $i ( 0 .. length($str) - 1 ) {
                my $ord = ord( substr( $str, $i, 1 ) );
                $res .= chr( utf8::unicode_to_native($ord) );
            }
            $_[1] = '' if $chk;    # source counts as fully consumed
            return $res;
        };

        # Map every character through utf8::native_to_unicode.
        *encode = sub {
            my ( $obj, $str, $chk ) = @_;
            my $res = '';
            for my $i ( 0 .. length($str) - 1 ) {
                my $ord = ord( substr( $str, $i, 1 ) );
                $res .= chr( utf8::native_to_unicode($ord) );
            }
            $_[1] = '' if $chk;    # source counts as fully consumed
            return $res;
        };
        $Encode::Encoding{Unicode} =
          bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {

        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # Both directions share one routine that upgrades a copy of the
        # string.
        *decode = sub {
            my ( $obj, $str, $chk ) = @_;
            utf8::upgrade($str);
            if ($chk) {
                $_[1] = '';
            }
            return $str;
        };
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
          bless { Name => "Internal" } => "Encode::Internal";
    }

    {

        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        if ($use_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub {
                my ( $obj, $octets, $chk ) = @_;
                my $str = Encode::decode_utf8($octets);
                return undef unless defined $str;
                $_[1] = '' if $chk;
                return $str;
            };
            *encode = sub {
                my ( $obj, $string, $chk ) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }

        # cat_decode($obj, $dst, $src, $pos, $trm, $chk): append to $dst the
        # part of $src from $pos up to and including the first $trm, and
        # move $pos past it (true result); if $trm is not found, append the
        # rest of $src and set $pos to its length (false result).
        # Currently ignores $chk.
        *cat_decode = sub {
            my ( $obj, undef, undef, $pos, $trm ) = @_;

            # References to the caller's variables so they can be updated.
            my ( $dst_ref, $src_ref, $pos_ref ) = \@_[ 1, 2, 3 ];
            use bytes;
            my $npos = index( $$src_ref, $trm, $pos );
            if ( $npos < 0 ) {
                $$dst_ref .= substr( $$src_ref, $pos );
                $$pos_ref = length($$src_ref);
                return '';
            }
            $$dst_ref .=
              substr( $$src_ref, $pos, $npos - $pos + length($trm) );
            $$pos_ref = $npos + length($trm);
            return 1;
        };
        $Encode::Encoding{utf8} =
          bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
          bless { Name => "utf-8-strict", strict_utf8 => 1 } =>
          "Encode::utf8";
    }
}
323
656753f8
NIS
3241;
325
2a936312
NIS
326__END__
327
4411f3b6
NIS
328=head1 NAME
329
330Encode - character encodings
331
332=head1 SYNOPSIS
333
334 use Encode;
335
67d7b5ef
JH
336=head2 Table of Contents
337
0ab8f81e 338Encode consists of a collection of modules whose details are too big
67d7b5ef 339to fit in one document. This POD itself explains the top-level APIs
6d1c0808 340and general topics at a glance. For other topics and more details,
0ab8f81e 341see the PODs below:
67d7b5ef
JH
342
343 Name Description
344 --------------------------------------------------------
6d1c0808 345 Encode::Alias Alias definitions to encodings
67d7b5ef
JH
346 Encode::Encoding Encode Implementation Base Class
347 Encode::Supported List of Supported Encodings
348 Encode::CN Simplified Chinese Encodings
349 Encode::JP Japanese Encodings
350 Encode::KR Korean Encodings
351 Encode::TW Traditional Chinese Encodings
352 --------------------------------------------------------
353
4411f3b6
NIS
354=head1 DESCRIPTION
355
47bfe92f 356The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
357and the rest of the system. Perl strings are sequences of
358B<characters>.
359
360The repertoire of characters that Perl can represent is at least that
361defined by the Unicode Consortium. On most platforms the ordinal
362value of a character (as returned by C<ord(ch)>) is the "Unicode
363codepoint" for the character (the exceptions are those platforms where
364the legacy encoding is some variant of EBCDIC rather than a super-set
365of ASCII - see L<perlebcdic>).
366
0ab8f81e 367Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef
JH
368often called "bytes". These chunks are also known as "octets" in
369networking standards. Perl is widely used to manipulate data of many
370types - not only strings of characters representing human or computer
0ab8f81e 371languages but also "binary" data being the machine's representation of
67d7b5ef
JH
372numbers, pixels in an image - or just about anything.
373
0ab8f81e 374When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 375process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 376byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
377"logical character".
378
379=head2 TERMINOLOGY
4411f3b6 380
7e19fb92 381=over 2
21938dfa 382
67d7b5ef
JH
383=item *
384
385I<character>: a character in the range 0..(2**32-1) (or more).
386(What Perl's strings are made of.)
387
388=item *
389
390I<byte>: a character in the range 0..255
391(A special case of a Perl character.)
392
393=item *
394
395I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 396(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef
JH
397
398=back
4411f3b6 399
67d7b5ef 400=head1 PERL ENCODING API
4411f3b6 401
7e19fb92 402=over 2
4411f3b6 403
b7a5c9de 404=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 405
0ab8f81e 406Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 407a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e
JH
408an alias. For encoding names and aliases, see L</"Defining Aliases">.
409For CHECK, see L</"Handling Malformed Data">.
4411f3b6 410
b7a5c9de 411For example, to convert a string from Perl's internal format to
6d1c0808 412iso-8859-1 (also known as Latin1),
681a7c68 413
b7a5c9de 414 $octets = encode("iso-8859-1", $string);
7e19fb92 415
44b3b9c7
SP
416B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
417$octets B<may not be equal to> $string. Though they both contain the
418same data, the UTF8 flag for $octets is B<always> off. When you
419encode anything, the UTF8 flag of the result is always off, even when it
420contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
681a7c68 421
7f0d54d7 422If the $string is C<undef> then C<undef> is returned.
4089adc4 423
b7a5c9de 424=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 425
0ab8f81e
JH
426Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
427internal form and returns the resulting string. As in encode(),
428ENCODING can be either a canonical name or an alias. For encoding names
429and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f
JH
430L</"Handling Malformed Data">.
431
b7a5c9de 432For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 433
b7a5c9de 434 $string = decode("iso-8859-1", $octets);
681a7c68 435
b7a5c9de
JH
436B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
437B<may not be equal to> $octets. Though they both contain the same data,
2575c402
JW
438the UTF8 flag for $string is on unless $octets entirely consists of
439ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag">
7e19fb92 440below.
47bfe92f 441
7f0d54d7 442If $octets is C<undef> then C<undef> is returned.
4089adc4 443
44b3b9c7
SP
444=item [$obj =] find_encoding(ENCODING)
445
446Returns the I<encoding object> corresponding to ENCODING. Returns
447undef if no matching ENCODING is found.
448
449This object is what does the actual (en|de)coding.
450
451 $utf8 = decode($name, $bytes);
452
453is in fact
454
455 $utf8 = do{
456 $obj = find_encoding($name);
457 croak qq(encoding "$name" not found) unless ref $obj;
458 $obj->decode($bytes)
459 };
460
461with more error checking.
462
463Therefore you can save time by reusing this object as follows;
464
465 my $enc = find_encoding("iso-8859-1");
466 while(<>){
467 my $utf8 = $enc->decode($_);
468 # and do something with $utf8;
469 }
470
471Besides C<< ->decode >> and C<< ->encode >>, other methods are
472available as well. For instance, C<< -> name >> returns the canonical
473name of the encoding object.
474
475 find_encoding("latin1")->name; # iso-8859-1
476
477See L<Encode::Encoding> for details.
478
b7a5c9de 479=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 480
b7a5c9de
JH
481Converts B<in-place> data between two encodings. The data in $octets
482must be encoded as octets and not as characters in Perl's internal
f9d05ba3
RGS
483format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
484encoding:
2b106fbe 485
b7a5c9de 486 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe
JH
487
488and to convert it back:
489
b7a5c9de 490 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 491
ab97ca19 492Note that because the conversion happens in place, the data to be
0ab8f81e 493converted cannot be a string constant; it must be a scalar variable.
ab97ca19 494
f9d05ba3
RGS
495from_to() returns the length of the converted string in octets on
496success, I<undef> on error.
3ef515df 497
b7a5c9de 498B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 499
b7a5c9de 500 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 501 $data = decode("iso-8859-1", $data); #2
4411f3b6 502
b7a5c9de 503Both #1 and #2 make $data consist of a completely valid UTF-8 string
2575c402 504but only #2 turns UTF8 flag on. #1 is equivalent to
f2a2953c 505
7e19fb92 506 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 507
2575c402 508See L</"The UTF8 flag"> below.
f2a2953c 509
7828f908
RGS
510Also note that
511
512 from_to($octets, $from, $to, $check);
513
514is equivalent to
515
516 $octets = encode($to, decode($from, $octets), $check);
517
518Yes, it does not respect the $check during decoding. It is
519deliberately done that way. If you need minute control, C<decode>
520then C<encode> as follows;
521
522 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
523
f2a2953c
JH
524=item $octets = encode_utf8($string);
525
7e19fb92 526Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de
JH
527that comprise $string are encoded in Perl's internal format and the
528result is returned as a sequence of octets. All possible
7e19fb92
JH
529characters have a UTF-8 representation so this function cannot fail.
530
f2a2953c
JH
531
532=item $string = decode_utf8($octets [, CHECK]);
533
7e19fb92 534Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 535The sequence of octets represented by
7e19fb92
JH
536$octets is decoded from UTF-8 into a sequence of logical
537characters. Not all sequences of octets form valid UTF-8 encodings, so
538it is possible for this call to fail. For CHECK, see
539L</"Handling Malformed Data">.
f2a2953c
JH
540
541=back
542
51ef4e11
NIS
543=head2 Listing available encodings
544
5129552c
JH
545 use Encode;
546 @list = Encode->encodings();
547
548Returns a list of the canonical names of the available encodings that
549are loaded. To get a list of all available encodings including the
550ones that are not loaded yet, say
551
552 @all_encodings = Encode->encodings(":all");
553
0ab8f81e 554Or you can give the name of a specific module.
5129552c 555
c731e18e
JH
556 @with_jp = Encode->encodings("Encode::JP");
557
558When "::" is not in the name, "Encode::" is assumed.
51ef4e11 559
c731e18e 560 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 561
0ab8f81e 562To find out in detail which encodings are supported by this package,
5d030b67 563see L<Encode::Supported>.
51ef4e11
NIS
564
565=head2 Defining Aliases
566
0ab8f81e 567To add a new alias to a given encoding, use:
67d7b5ef 568
5129552c
JH
569 use Encode;
570 use Encode::Alias;
a63c962f 571 define_alias(newName => ENCODING);
51ef4e11 572
3ef515df 573After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
574ENCODING may be either the name of an encoding or an
575I<encoding object>
51ef4e11 576
fcb875d4
JH
577But before you do so, make sure the alias is nonexistent with
578C<resolve_alias()>, which returns the canonical name thereof.
579i.e.
580
581 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
582 Encode::resolve_alias("iso-8859-12") # false; nonexistent
583 Encode::resolve_alias($name) eq $name # true if $name is canonical
584
0ab8f81e
JH
585resolve_alias() does not need C<use Encode::Alias>; it can be
586exported via C<use Encode qw(resolve_alias)>.
fcb875d4 587
0ab8f81e 588See L<Encode::Alias> for details.
51ef4e11 589
742555bd
SP
590=head2 Finding IANA Character Set Registry names
591
592The canonical name of a given encoding does not necessarily agree with
593the IANA Character Set Registry, commonly seen as C<< Content-Type:
594text/plain; charset=I<whatever> >>. For most cases canonical names
595work but sometimes they do not (notably 'utf-8-strict').
596
597Therefore as of Encode version 2.21, a new method C<mime_name()> is added.
598
599 use Encode;
600 my $enc = find_encoding('UTF-8');
601 warn $enc->name; # utf-8-strict
602 warn $enc->mime_name; # UTF-8
603
604See also: L<Encode::Encoding>
605
85982a32 606=head1 Encoding via PerlIO
4411f3b6 607
44b3b9c7
SP
608If your perl supports I<PerlIO> (which is the default), you can use a
609PerlIO layer to decode and encode directly via a filehandle. The
610following two examples are totally identical in their functionality.
4411f3b6 611
85982a32
JH
612 # via PerlIO
613 open my $in, "<:encoding(shiftjis)", $infile or die;
614 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 615 while(<$in>){ print $out $_; }
8e86646e 616
85982a32 617 # via from_to
0ab8f81e
JH
618 open my $in, "<", $infile or die;
619 open my $out, ">", $outfile or die;
b7a5c9de 620 while(<$in>){
0ab8f81e 621 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 622 print $out $_;
85982a32 623 }
4411f3b6 624
b7a5c9de 625Unfortunately, not all encodings are PerlIO-savvy. You can check
0ab8f81e
JH
626if your encoding is supported by PerlIO by calling the C<perlio_ok>
627method.
628
629 Encode::perlio_ok("hz"); # False
630 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
631
632 use Encode qw(perlio_ok); # exported upon request
633 perlio_ok("euc-jp")
4411f3b6 634
0ab8f81e 635Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3
RGS
636except for hz and ISO-2022-kr. For gory details, see
637L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 638
85982a32 639=head1 Handling Malformed Data
4411f3b6 640
8e180e82
SP
641The optional I<CHECK> argument tells Encode what to do when it
642encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
643is assumed.
644
645As of version 2.12 Encode supports coderef values for CHECK. See below.
f9d05ba3
RGS
646
647=over 2
648
3c4b39be 649=item B<NOTE:> Not all encodings support this feature
f9d05ba3
RGS
650
651Some encodings ignore the I<CHECK> argument. For example,
652L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
653
654=back
655
656Now here is the list of I<CHECK> values available
47bfe92f 657
151b5d36
JH
658=over 2
659
85982a32 660=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 661
f9d05ba3 662If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
78589665
RGS
663place of a malformed character. When you encode, E<lt>subcharE<gt>
664will be used. When you decode the code point C<0xFFFD> is used. If
665the data is supposed to be UTF-8, an optional lexical warning
666(category utf8) is given.
e9692b5b 667
7e19fb92 668=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 669
b7a5c9de 670If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 671message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 672error with eval{} unless you really want to let it die.
47bfe92f 673
85982a32 674=item I<CHECK> = Encode::FB_QUIET
47bfe92f 675
85982a32 676If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3
RGS
677return the portion of the data that has been processed so far when an
678error occurs. The data argument will be overwritten with everything
679after that point (that is, the unprocessed part of data). This is
680handy when you have to call decode repeatedly in the case where your
681source data may contain partial multi-byte character sequences,
682(i.e. you are reading with a fixed-width buffer). Here is a sample
683code that does exactly this:
4411f3b6 684
78589665
RGS
685 my $buffer = ''; my $string = '';
686 while(read $fh, $buffer, 256, length($buffer)){
687 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
688 # $buffer now contains the unprocessed partial character
85982a32 689 }
1768d7eb 690
85982a32 691=item I<CHECK> = Encode::FB_WARN
67d7b5ef 692
0ab8f81e
JH
693This is the same as above, except that it warns on error. Handy when
694you are debugging the mode above.
85982a32
JH
695
696=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
697
af1f55d9
JH
698=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
699
700=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
701
85982a32
JH
702For encodings that are implemented by Encode::XS, CHECK ==
703Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
704
b7a5c9de
JH
705When you decode, C<\xI<HH>> will be inserted for a malformed character,
706where I<HH> is the hex representation of the octet that could not be
707decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
708where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 709in the character repertoire of the encoding.
85982a32 710
af1f55d9 711HTML/XML character reference modes are about the same, in place of
78589665
RGS
712C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
713XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 714
7f0d54d7
RGS
715In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
716
85982a32
JH
717=item The bitmask
718
0ab8f81e
JH
719These modes are actually set via a bitmask. Here is how the FB_XX
720constants are laid out. You can import the FB_XX constants via
721C<use Encode qw(:fallbacks)>; you can import the generic bitmask
722constants via C<use Encode qw(:fallback_all)>.
85982a32 723
b0b300a3
JH
724 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
725 DIE_ON_ERR 0x0001 X
4089adc4 726 WARN_ON_ERR 0x0002 X
b0b300a3 727 RETURN_ON_ERR 0x0004 X X
7f0d54d7 728 LEAVE_SRC 0x0008 X
b0b300a3 729 PERLQQ 0x0100 X
b7a5c9de
JH
730 HTMLCREF 0x0200
731 XMLCREF 0x0400
67d7b5ef 732
151b5d36
JH
733=back
734
44b3b9c7
SP
735=over 2
736
51e4e64d
NC
737=item Encode::LEAVE_SRC
738
739If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second
740argument to C<encode()> or C<decode()> may be assigned to by the functions. If
741you're not interested in this, then bitwise-or the bitmask with it.
742
44b3b9c7
SP
743=back
744
0dbed2e5 745=head2 coderef for CHECK
8e180e82
SP
746
747As of Encode 2.12 CHECK can also be a code reference which takes the
748ord value of the unmapped character as an argument and returns a string
749that represents the fallback character. For instance,
67d7b5ef 750
8e180e82 751 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 752
8e180e82
SP
753Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
754\x{I<XXXX>}.
982a4085 755
67d7b5ef
JH
756=head1 Defining Encodings
757
758To define a new encoding, use:
759
b7a5c9de 760 use Encode qw(define_encoding);
67d7b5ef
JH
761 define_encoding($object, 'canonicalName' [, alias...]);
762
763I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 764should provide the interface described in L<Encode::Encoding>.
67d7b5ef 765If more than two arguments are provided then additional
b7a5c9de 766arguments are taken as aliases for I<$object>.
67d7b5ef 767
f2a2953c
JH
768See L<Encode::Encoding> for more details.
769
2575c402 770=head1 The UTF8 flag
7e19fb92 771
2575c402 772Before the introduction of Unicode support in perl, the C<eq> operator
b7a5c9de 773just compared the strings represented by two scalars. Beginning with
2575c402
JW
774perl 5.8, C<eq> compares two strings with simultaneous consideration of
775I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of
776C<Programming Perl, 3rd ed.>
7e19fb92
JH
777
778=over 2
779
780=item Goal #1:
781
782Old byte-oriented programs should not spontaneously break on the old
783byte-oriented data they used to work on.
784
785=item Goal #2:
786
787Old byte-oriented programs should magically start working on the new
788character-oriented data when appropriate.
789
790=item Goal #3:
791
792Programs should run just as fast in the new character-oriented mode
793as in the old byte-oriented mode.
794
795=item Goal #4:
796
797Perl should remain one language, rather than forking into a
798byte-oriented Perl and a character-oriented Perl.
799
800=back
801
802Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
803was born and many features documented in the book remained
b7a5c9de 804unimplemented for a long time. Perl 5.8 corrected this and the introduction
2575c402
JW
805of the UTF8 flag is one of them. You can think of this perl notion as of a
806byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8
7e19fb92
JH
807flag on).
808
2575c402 809Here is how Encode takes care of the UTF8 flag.
7e19fb92 810
4bdf5738 811=over 2
7e19fb92
JH
812
813=item *
814
2575c402 815When you encode, the resulting UTF8 flag is always off.
7e19fb92 816
151b5d36 817=item *
7e19fb92 818
2575c402 819When you decode, the resulting UTF8 flag is on unless you can
7e19fb92
JH
820unambiguously represent data. Here is the definition of
821dis-ambiguity.
822
b7a5c9de 823After C<$utf8 = decode('foo', $octet);>,
7e19fb92 824
2575c402 825 When $octet is... The UTF8 flag in $utf8 is
7e19fb92
JH
826 ---------------------------------------------
827 In ASCII only (or EBCDIC only) OFF
828 In ISO-8859-1 ON
829 In any other Encoding ON
830 ---------------------------------------------
831
3c4b39be 832As you see, there is one exception, In ASCII. That way you can assume
7e19fb92
JH
833Goal #1. And with Encode Goal #2 is assumed but you still have to be
834careful in such cases mentioned in B<CAVEAT> paragraphs.
835
2575c402 836This UTF8 flag is not visible in perl scripts, exactly for the same
7e19fb92
JH
837reason you cannot (or you I<don't have to>) see if a scalar contains a
838string, integer, or floating point number. But you can still peek
839and poke these if you will. See the section below.
840
841=back
842
843=head2 Messing with Perl's Internals
4411f3b6 844
47bfe92f 845The following API uses parts of Perl's internals in the current
0ab8f81e 846implementation. As such, they are efficient but may change.
4411f3b6 847
7e19fb92 848=over 2
4411f3b6 849
a63c962f 850=item is_utf8(STRING [, CHECK])
4411f3b6 851
2575c402 852[INTERNAL] Tests whether the UTF8 flag is turned on in the STRING.
47bfe92f
JH
853If CHECK is true, also checks the data in STRING for being well-formed
854UTF-8. Returns true if successful, false otherwise.
4411f3b6 855
2c246b25 856As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 857
a63c962f 858=item _utf8_on(STRING)
4411f3b6 859
2575c402 860[INTERNAL] Turns on the UTF8 flag in STRING. The data in STRING is
4411f3b6
NIS
861B<not> checked for being well-formed UTF-8. Do not use unless you
862B<know> that the STRING is well-formed UTF-8. Returns the previous
2575c402 863state of the UTF8 flag (so please don't treat the return value as
0ab8f81e 864indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 865
64bc6d54
SH
866This function does not work on tainted values.
867
a63c962f 868=item _utf8_off(STRING)
4411f3b6 869
2575c402
JW
870[INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously.
871Returns the previous state of the UTF8 flag (so please don't treat the
0ab8f81e 872return value as indicating success or failure), or C<undef> if STRING is
4411f3b6
NIS
873not a string.
874
64bc6d54
SH
875This function does not work on tainted values.
876
4411f3b6
NIS
877=back
878
2575c402 879=head1 UTF-8 vs. utf8 vs. UTF8
7f0d54d7
RGS
880
881 ....We now view strings not as sequences of bytes, but as sequences
882 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
883 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
884
885That has been perl's notion of UTF-8 but official UTF-8 is more
886strict; its range is much narrower (0 .. 10FFFF), and some sequences are
887not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
888
889Now that is overruled by Larry Wall himself.
890
891 From: Larry Wall <larry@wall.org>
892 Date: December 04, 2004 11:51:58 JST
893 To: perl-unicode@perl.org
894 Subject: Re: Make Encode.pm support the real UTF-8
895 Message-Id: <20041204025158.GA28754@wall.org>
896
897 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
898 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
899 : but "UTF-8" is the name of the standard and should give the
900 : corresponding behaviour.
901
902 For what it's worth, that's how I've always kept them straight in my
903 head.
8e180e82 904
7f0d54d7
RGS
905 Also for what it's worth, Perl 6 will mostly default to strict but
906 make it easy to switch back to lax.
907
908 Larry
909
910Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
911while B<utf8> means liberal, lax, version thereof. And Encode version
9122.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
913
914 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
915 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
916
917C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
918Yes, the hyphen between "UTF" and "8" is important. Without it Encode
919goes "liberal"
920
921 find_encoding("UTF-8")->name # is 'utf-8-strict'
922 find_encoding("utf-8")->name # ditto. names are case insensitive
50c1ac04 923 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7
RGS
924 find_encoding("UTF8")->name # is 'utf8'.
925
2575c402
JW
926The UTF8 flag is internally called UTF8, without a hyphen. It indicates
927whether a string is internally encoded as utf8, also without a hyphen.
7f0d54d7 928
4411f3b6
NIS
929=head1 SEE ALSO
930
5d030b67
JH
931L<Encode::Encoding>,
932L<Encode::Supported>,
6d1c0808 933L<Encode::PerlIO>,
5d030b67 934L<encoding>,
6d1c0808
JH
935L<perlebcdic>,
936L<perlfunc/open>,
370462a2 937L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
6d1c0808 938L<utf8>,
5d030b67 939the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 940
85982a32 941=head1 MAINTAINER
aae85ceb
DK
942
943This project was originated by Nick Ing-Simmons and later maintained
7e19fb92
JH
944by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
945list of people involved. For any questions, use
b7a5c9de 946E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 947
d1256cb1
RGS
948While Dan Kogai retains the copyright as a maintainer, the credit
949should go to all those involved. See AUTHORS for those who submitted
950code.
951
952=head1 COPYRIGHT
953
954Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt>
955
956This library is free software; you can redistribute it and/or modify
957it under the same terms as Perl itself.
958
4411f3b6 959=cut