This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Resubmit change #28095
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
#
# $Id: Encode.pm,v 2.15 2006/04/06 15:44:11 dankogai Exp dankogai $
#
package Encode;
use strict;

# The version number is derived from the RCS revision keyword.
our $VERSION = sprintf "%d.%02d", q$Revision: 2.15 $ =~ /(\d+)/g;

# Compile-time constant; the debugging statements guarded by it are
# folded away while it is false.
sub DEBUG () { 0 }

use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default
our @EXPORT = qw(
    decode
    decode_utf8
    encode
    encode_utf8
    str2bytes
    bytes2str
    encodings
    find_encoding
    clone_encoding
);

# Individual fallback bits, followed by the ready-made FB_* combinations.
our @FB_FLAGS = qw(
    DIE_ON_ERR
    WARN_ON_ERR
    RETURN_ON_ERR
    LEAVE_SRC
    PERLQQ
    HTMLCREF
    XMLCREF
    STOP_AT_PARTIAL
);
our @FB_CONSTS = qw(
    FB_DEFAULT
    FB_CROAK
    FB_QUIET
    FB_WARN
    FB_PERLQQ
    FB_HTMLCREF
    FB_XMLCREF
);

# Everything below is exported only on request.
our @EXPORT_OK = (
    qw(
        _utf8_off
        _utf8_on
        define_encoding
        from_to
        is_16bit
        is_8bit
        is_utf8
        perlio_ok
        resolve_alias
        utf8_downgrade
        utf8_upgrade
    ),
    @FB_FLAGS,
    @FB_CONSTS,
);

our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

# True when the native character set puts "A" at code 193.
our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;

# Maps an encoding name to the module to require for it (see getEncoding).
our %ExtModule;
require Encode::Config;

# An optional site-local configuration; failure to load it is ignored.
eval { require Encode::ConfigLocal };
5129552c 53
# Class method: list encoding names, sorted case-insensitively.
#
#   Encode->encodings()          names registered in %Encoding
#   Encode->encodings(":all")    those plus every name in %ExtModule
#   Encode->encodings(@modules)  those plus the %ExtModule names whose
#                                module is listed ("Encode::" is
#                                prepended to names without "::")
#
# The names Internal, Unicode and Guess are never reported.
sub encodings
{
    my $class = shift;
    my %known;
    if (@_ and $_[0] eq ":all") {
        %known = ( %Encoding, %ExtModule );
    }
    else {
        %known = %Encoding;
        my @wanted = map { m/::/o ? $_ : "Encode::$_" } @_;
        for my $module (@wanted) {
            DEBUG and warn $module;
            for my $name (keys %ExtModule) {
                next unless $ExtModule{$name} eq $module;
                $known{$name} = $module;
            }
        }
    }
    return
        sort { lc $a cmp lc $b }
        grep { !/^(?:Internal|Unicode|Guess)$/o } keys %known;
}
73
# Report whether an encoding can be used as a PerlIO layer.
#
# Takes either an encoding object or an encoding name (resolved with
# find_encoding).  Returns the result of the object's own perlio_ok
# method when it has one, and 0 otherwise -- including when the name
# does not resolve to any encoding.
sub perlio_ok {
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);

    # find_encoding() yields undef for an unknown name; without this
    # guard the method call below would die on an undefined value
    # instead of reaching the safety net.
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
79
# Register $obj in %Encoding under its canonical $name.
#
# The lower-cased name is added as an alias when it differs from the
# canonical one, and any further arguments are registered as aliases
# for the same object.  Returns $obj.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    if ($lower ne $name) {
        define_alias($lower => $obj);
    }

    for my $alias (@aliases) {
        define_alias($alias, $obj);
    }
    return $obj;
}
93
# Class method: resolve $name to an encoding object.
#
# Lookup order:
#   1. $name is already an object that can 'renew' -- returned as is
#   2. %Encoding, by exact name and then by lower-cased name
#   3. $class->find_alias, by exact name and then by lower-cased name
#   4. unless $skip_external is true, the module listed in %ExtModule
#      is required and %Encoding is consulted again by exact name
#
# Returns nothing (undef / empty list) when no encoding is found.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    return $name if ref($name) && $name->can('renew');

    return $Encoding{$name} if exists $Encoding{$name};
    my $lower = lc $name;
    return $Encoding{$lower} if exists $Encoding{$lower};

    my $found = $class->find_alias($name);
    return $found if defined $found;
    if ($lower ne $name) {
        $found = $class->find_alias($lower);
    }
    return $found if defined $found;

    return if $skip_external;

    my $module = $ExtModule{$name} || $ExtModule{$lower};
    if ($module) {
        # Turn Some::Module into Some/Module.pm for require.
        $module =~ s,::,/,g;
        $module .= '.pm';
        # A failed load is deliberately ignored; the lookup then
        # simply comes back empty.
        eval { require $module; };
        return $Encoding{$name} if exists $Encoding{$name};
    }
    return;
}
118
# Functional front end to Encode->getEncoding: look up an encoding
# object by name or alias.  A true second argument is passed through
# as getEncoding's $skip_external flag.
sub find_encoding($;$)
{
    my $name          = $_[0];
    my $skip_external = $_[1];
    my $found = __PACKAGE__->getEncoding($name, $skip_external);
    return $found;
}
124
# Return the ->name of the encoding that the given name or alias
# resolves to, or nothing when find_encoding does not know it.
sub resolve_alias($) {
    my $enc = find_encoding(shift);
    return unless defined $enc;
    return $enc->name;
}
130
# Return a deep copy (made with Storable::dclone) of the encoding
# object for the given name.  Returns nothing when the name does not
# resolve to an object or when Storable cannot be loaded.
sub clone_encoding($) {
    my $enc = find_encoding(shift);
    return unless ref $enc;

    eval { require Storable };
    return if $@;

    return Storable::dclone($enc);
}
138
# $octets = encode(ENCODING, $string [, CHECK])
#
# Encode $string with the named encoding and return the result of the
# encoding object's encode method.
#
#   - Returns undef when $string is undef.
#   - A reference passed as $string is stringified first.
#   - Croaks with "Unknown encoding '...'" when ENCODING cannot be
#     resolved by find_encoding.
#   - With a true, non-reference CHECK that lacks the LEAVE_SRC bit,
#     the caller's variable is overwritten with whatever the encoder
#     left in the working copy.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $string .= '' if ref $string;    # stringify;
    $check ||= 0;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string, $check);
    # CHECK may be a code reference.  It has no flag bits, so it must be
    # kept out of the bit test: a reference used as a number is its
    # address, which would make the write-back effectively random.
    $_[1] = $string
        if $check and !ref $check and !($check & LEAVE_SRC());
    return $octets;
}
*str2bytes = \&encode;
4411f3b6 155
# $string = decode(ENCODING, $octets [, CHECK])
#
# Decode $octets with the named encoding and return the result of the
# encoding object's decode method.
#
#   - Returns undef when $octets is undef.
#   - A reference passed as $octets is stringified first.
#   - Croaks with "Unknown encoding '...'" when ENCODING cannot be
#     resolved by find_encoding.
#   - With a true, non-reference CHECK that lacks the LEAVE_SRC bit,
#     the caller's variable is overwritten with whatever the decoder
#     left in the working copy.
sub decode($$;$)
{
    my ($name, $octets, $check) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;
    $check ||= 0;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets, $check);
    # CHECK may be a code reference.  It has no flag bits, so it must be
    # kept out of the bit test: a reference used as a number is its
    # address, which would make the write-back effectively random.
    $_[1] = $octets
        if $check and !ref $check and !($check & LEAVE_SRC());
    return $string;
}
*bytes2str = \&decode;
4411f3b6 172
# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# Convert the caller's variable in place: decode it with FROM_ENC,
# encode the result with TO_ENC (passing CHECK to the encoder) and
# store that back into the first argument.
#
# Returns the length of the converted data, or undef when the input is
# undef, when CHECK is true and the encoder left part of the
# intermediate string unconsumed, or when the encoder returned undef.
# Croaks with "Unknown encoding '...'" for an unresolvable encoding.
sub from_to($$$;$)
{
    my ($string, $from, $to, $check) = @_;
    return undef unless defined $string;
    $check ||= 0;

    my $source = find_encoding($from);
    if (!defined $source) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }

    my $target = find_encoding($to);
    if (!defined $target) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $uni = $source->decode($string);
    $string = $target->encode($uni, $check);
    $_[0]   = $string;    # write the result back to the caller

    return undef if $check && length($uni);
    return undef unless defined $_[0];
    return length($string);
}
193
# $octets = encode_utf8($string)
#
# Return a copy of $string converted with utf8::encode; the caller's
# variable is not touched because the conversion runs on the copy.
sub encode_utf8($)
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
200
# $string = decode_utf8($octets [, CHECK])
#
# Return the argument unchanged when it already carries the UTF-8
# flag; otherwise hand it to decode("utf8", ...) together with CHECK.
#
# CHECK is passed through unconditionally: decode() normalises a false
# or missing CHECK to 0 itself, so there is no need for separate
# with-CHECK and without-CHECK calls.  The statement that used to
# follow the second call could never be reached and has been removed.
sub decode_utf8($;$)
{
    my ($str, $check) = @_;
    return $str if is_utf8($str);
    return decode("utf8", $str, $check);
}
212
predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#

# (Re)install the encodings that are defined by this file itself:
#
#   Unicode       - Encode::UTF_EBCDIC when $ON_EBCDIC is true,
#                   Encode::Internal otherwise
#   utf8          - Encode::utf8
#   utf-8-strict  - Encode::utf8 with strict_utf8 set
#
# A true argument binds Encode::utf8's encode/decode to the XS
# routines encode_xs/decode_xs; a false one installs the pure-Perl
# versions defined below.
sub predefine_encodings{
    use Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        # Encoding and decoding are the same operation here.
        *encode = \&decode;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';
        #
        if ($use_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            *decode = sub{
                my ($obj,$octets,$chk) = @_;
                return undef unless defined $octets;
                my $str = $octets;
                # Decode with the core utf8:: primitive rather than
                # Encode::decode_utf8(): that function dispatches to
                # Encode::decode("utf8", ...), which calls this very
                # method again and would never return.  As in
                # decode_utf8(), a string that already carries the
                # UTF-8 flag is passed through untouched.
                unless (Encode::is_utf8($str)) {
                    utf8::decode($str) or return undef;
                }
                $_[1] = '' if $chk;
                return $str;
            };
            *encode = sub {
                my ($obj,$string,$chk) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
            # References to the caller's own variables, so that they
            # can be updated in place.
            my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
            use bytes;
            if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
                # Terminator found: append everything up to and
                # including it, and move the position past it.
                $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
                $$rpos = $npos + length($trm);
                return 1;
            }
            # No terminator: consume the rest of the source.
            $$rdst .= substr($$rsrc, $pos);
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
            bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
    }
}
309
656753f8
NIS
3101;
311
2a936312
NIS
312__END__
313
4411f3b6
NIS
314=head1 NAME
315
316Encode - character encodings
317
318=head1 SYNOPSIS
319
320 use Encode;
321
67d7b5ef
JH
322=head2 Table of Contents
323
0ab8f81e 324Encode consists of a collection of modules whose details are too big
67d7b5ef 325to fit in one document. This POD itself explains the top-level APIs
6d1c0808 326and general topics at a glance. For other topics and more details,
0ab8f81e 327see the PODs below:
67d7b5ef
JH
328
329 Name Description
330 --------------------------------------------------------
6d1c0808 331 Encode::Alias Alias definitions to encodings
67d7b5ef
JH
332 Encode::Encoding Encode Implementation Base Class
333 Encode::Supported List of Supported Encodings
334 Encode::CN Simplified Chinese Encodings
335 Encode::JP Japanese Encodings
336 Encode::KR Korean Encodings
337 Encode::TW Traditional Chinese Encodings
338 --------------------------------------------------------
339
4411f3b6
NIS
340=head1 DESCRIPTION
341
47bfe92f 342The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
343and the rest of the system. Perl strings are sequences of
344B<characters>.
345
346The repertoire of characters that Perl can represent is at least that
347defined by the Unicode Consortium. On most platforms the ordinal
348values of the characters (as returned by C<ord(ch)>) is the "Unicode
349codepoint" for the character (the exceptions are those platforms where
350the legacy encoding is some variant of EBCDIC rather than a super-set
351of ASCII - see L<perlebcdic>).
352
0ab8f81e 353Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef
JH
354often called "bytes". These chunks are also known as "octets" in
355networking standards. Perl is widely used to manipulate data of many
356types - not only strings of characters representing human or computer
0ab8f81e 357languages but also "binary" data being the machine's representation of
67d7b5ef
JH
358numbers, pixels in an image - or just about anything.
359
0ab8f81e 360When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 361process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 362byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
363"logical character".
364
365=head2 TERMINOLOGY
4411f3b6 366
7e19fb92 367=over 2
21938dfa 368
67d7b5ef
JH
369=item *
370
371I<character>: a character in the range 0..(2**32-1) (or more).
372(What Perl's strings are made of.)
373
374=item *
375
376I<byte>: a character in the range 0..255
377(A special case of a Perl character.)
378
379=item *
380
381I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 382(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef
JH
383
384=back
4411f3b6 385
67d7b5ef 386=head1 PERL ENCODING API
4411f3b6 387
7e19fb92 388=over 2
4411f3b6 389
b7a5c9de 390=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 391
0ab8f81e 392Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 393a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e
JH
394an alias. For encoding names and aliases, see L</"Defining Aliases">.
395For CHECK, see L</"Handling Malformed Data">.
4411f3b6 396
b7a5c9de 397For example, to convert a string from Perl's internal format to
6d1c0808 398iso-8859-1 (also known as Latin1),
681a7c68 399
b7a5c9de 400 $octets = encode("iso-8859-1", $string);
7e19fb92 401
b7a5c9de
JH
402B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
403B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92
JH
404for $octets is B<always> off. When you encode anything, utf8 flag of
405the result is always off, even when it contains completely valid utf8
406string. See L</"The UTF-8 flag"> below.
681a7c68 407
7f0d54d7 408If the $string is C<undef> then C<undef> is returned.
4089adc4 409
b7a5c9de 410=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 411
0ab8f81e
JH
412Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
413internal form and returns the resulting string. As in encode(),
414ENCODING can be either a canonical name or an alias. For encoding names
415and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f
JH
416L</"Handling Malformed Data">.
417
b7a5c9de 418For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 419
b7a5c9de 420 $string = decode("iso-8859-1", $octets);
681a7c68 421
b7a5c9de
JH
422B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
423B<may not be equal to> $octets. Though they both contain the same data,
424the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92
JH
425ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
426below.
47bfe92f 427
7f0d54d7 428If $octets is C<undef> then C<undef> is returned.
4089adc4 429
b7a5c9de 430=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 431
b7a5c9de
JH
432Converts B<in-place> data between two encodings. The data in $octets
433must be encoded as octets and not as characters in Perl's internal
f9d05ba3
RGS
434format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
435encoding:
2b106fbe 436
b7a5c9de 437 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe
JH
438
439and to convert it back:
440
b7a5c9de 441 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 442
ab97ca19 443Note that because the conversion happens in place, the data to be
0ab8f81e 444converted cannot be a string constant; it must be a scalar variable.
ab97ca19 445
f9d05ba3
RGS
446from_to() returns the length of the converted string in octets on
447success, I<undef> on error.
3ef515df 448
b7a5c9de 449B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 450
b7a5c9de 451 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 452 $data = decode("iso-8859-1", $data); #2
4411f3b6 453
b7a5c9de 454Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 455but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 456
7e19fb92 457 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 458
7e19fb92 459See L</"The UTF-8 flag"> below.
f2a2953c
JH
460
461=item $octets = encode_utf8($string);
462
7e19fb92 463Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de
JH
464that comprise $string are encoded in Perl's internal format and the
465result is returned as a sequence of octets. All possible
7e19fb92
JH
466characters have a UTF-8 representation so this function cannot fail.
467
f2a2953c
JH
468
469=item $string = decode_utf8($octets [, CHECK]);
470
7e19fb92 471Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 472The sequence of octets represented by
7e19fb92
JH
473$octets is decoded from UTF-8 into a sequence of logical
474characters. Not all sequences of octets form valid UTF-8 encodings, so
475it is possible for this call to fail. For CHECK, see
476L</"Handling Malformed Data">.
f2a2953c
JH
477
478=back
479
51ef4e11
NIS
480=head2 Listing available encodings
481
5129552c
JH
482 use Encode;
483 @list = Encode->encodings();
484
485Returns a list of the canonical names of the available encodings that
486are loaded. To get a list of all available encodings including the
487ones that are not loaded yet, say
488
489 @all_encodings = Encode->encodings(":all");
490
0ab8f81e 491Or you can give the name of a specific module.
5129552c 492
c731e18e
JH
493 @with_jp = Encode->encodings("Encode::JP");
494
495When "::" is not in the name, "Encode::" is assumed.
51ef4e11 496
c731e18e 497 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 498
0ab8f81e 499To find out in detail which encodings are supported by this package,
5d030b67 500see L<Encode::Supported>.
51ef4e11
NIS
501
502=head2 Defining Aliases
503
0ab8f81e 504To add a new alias to a given encoding, use:
67d7b5ef 505
5129552c
JH
506 use Encode;
507 use Encode::Alias;
a63c962f 508 define_alias(newName => ENCODING);
51ef4e11 509
3ef515df 510After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
511ENCODING may be either the name of an encoding or an
512I<encoding object>
51ef4e11 513
fcb875d4
JH
514But before you do so, make sure the alias is nonexistent with
515C<resolve_alias()>, which returns the canonical name thereof.
516i.e.
517
518 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
519 Encode::resolve_alias("iso-8859-12") # false; nonexistent
520 Encode::resolve_alias($name) eq $name # true if $name is canonical
521
0ab8f81e
JH
522resolve_alias() does not need C<use Encode::Alias>; it can be
523exported via C<use Encode qw(resolve_alias)>.
fcb875d4 524
0ab8f81e 525See L<Encode::Alias> for details.
51ef4e11 526
85982a32 527=head1 Encoding via PerlIO
4411f3b6 528
b7a5c9de 529If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e
JH
530and encode directly via a filehandle. The following two examples
531are totally identical in their functionality.
4411f3b6 532
85982a32
JH
533 # via PerlIO
534 open my $in, "<:encoding(shiftjis)", $infile or die;
535 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 536 while(<$in>){ print $out $_; }
8e86646e 537
85982a32 538 # via from_to
0ab8f81e
JH
539 open my $in, "<", $infile or die;
540 open my $out, ">", $outfile or die;
b7a5c9de 541 while(<$in>){
0ab8f81e 542 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 543 print $out $_;
85982a32 544 }
4411f3b6 545
b7a5c9de 546Unfortunately, not all encodings are PerlIO-savvy. You can check
0ab8f81e
JH
547if your encoding is supported by PerlIO by calling the C<perlio_ok>
548method.
549
550 Encode::perlio_ok("hz"); # False
551 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
552
553 use Encode qw(perlio_ok); # exported upon request
554 perlio_ok("euc-jp")
4411f3b6 555
0ab8f81e 556Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3
RGS
557except for hz and ISO-2022-kr. For gory details, see
558L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 559
85982a32 560=head1 Handling Malformed Data
4411f3b6 561
8e180e82
SP
562The optional I<CHECK> argument tells Encode what to do when it
563encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
564is assumed.
565
566As of version 2.12 Encode supports coderef values for CHECK. See below.
f9d05ba3
RGS
567
568=over 2
569
3c4b39be 570=item B<NOTE:> Not all encodings support this feature
f9d05ba3
RGS
571
572Some encodings ignore I<CHECK> argument. For example,
573L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
574
575=back
576
577Now here is the list of I<CHECK> values available
47bfe92f 578
151b5d36
JH
579=over 2
580
85982a32 581=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 582
f9d05ba3 583If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
78589665
RGS
584place of a malformed character. When you encode, E<lt>subcharE<gt>
585will be used. When you decode the code point C<0xFFFD> is used. If
586the data is supposed to be UTF-8, an optional lexical warning
587(category utf8) is given.
e9692b5b 588
7e19fb92 589=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 590
b7a5c9de 591If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 592message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 593error with eval{} unless you really want to let it die.
47bfe92f 594
85982a32 595=item I<CHECK> = Encode::FB_QUIET
47bfe92f 596
85982a32 597If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3
RGS
598return the portion of the data that has been processed so far when an
599error occurs. The data argument will be overwritten with everything
600after that point (that is, the unprocessed part of data). This is
601handy when you have to call decode repeatedly in the case where your
602source data may contain partial multi-byte character sequences,
603(i.e. you are reading with a fixed-width buffer). Here is a sample
604code that does exactly this:
4411f3b6 605
78589665
RGS
606 my $buffer = ''; my $string = '';
607 while(read $fh, $buffer, 256, length($buffer)){
608 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
609 # $buffer now contains the unprocessed partial character
85982a32 610 }
1768d7eb 611
85982a32 612=item I<CHECK> = Encode::FB_WARN
67d7b5ef 613
0ab8f81e
JH
614This is the same as above, except that it warns on error. Handy when
615you are debugging the mode above.
85982a32
JH
616
617=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
618
af1f55d9
JH
619=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
620
621=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
622
85982a32
JH
623For encodings that are implemented by Encode::XS, CHECK ==
624Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
625
b7a5c9de
JH
626When you decode, C<\xI<HH>> will be inserted for a malformed character,
627where I<HH> is the hex representation of the octet that could not be
628decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
629where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 630in the character repertoire of the encoding.
85982a32 631
af1f55d9 632HTML/XML character reference modes are about the same, in place of
78589665
RGS
633C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
634XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 635
7f0d54d7
RGS
636In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
637
85982a32
JH
638=item The bitmask
639
0ab8f81e
JH
640These modes are actually set via a bitmask. Here is how the FB_XX
641constants are laid out. You can import the FB_XX constants via
642C<use Encode qw(:fallbacks)>; you can import the generic bitmask
643constants via C<use Encode qw(:fallback_all)>.
85982a32 644
b0b300a3
JH
645 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
646 DIE_ON_ERR 0x0001 X
4089adc4 647 WARN_ON_ERR 0x0002 X
b0b300a3 648 RETURN_ON_ERR 0x0004 X X
7f0d54d7 649 LEAVE_SRC 0x0008 X
b0b300a3 650 PERLQQ 0x0100 X
b7a5c9de
JH
651 HTMLCREF 0x0200
652 XMLCREF 0x0400
67d7b5ef 653
151b5d36
JH
654=back
655
8e180e82
SP
656=head2 coderef for CHECK
657
658As of Encode 2.12 CHECK can also be a code reference which takes the
659ord value of the unmapped character as an argument and returns a string
660that represents the fallback character. For instance,
67d7b5ef 661
8e180e82 662 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 663
8e180e82
SP
664Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
665\x{I<XXXX>}.
982a4085 666
67d7b5ef
JH
667=head1 Defining Encodings
668
669To define a new encoding, use:
670
b7a5c9de 671 use Encode qw(define_encoding);
67d7b5ef
JH
672 define_encoding($object, 'canonicalName' [, alias...]);
673
674I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 675should provide the interface described in L<Encode::Encoding>.
67d7b5ef 676If more than two arguments are provided then additional
b7a5c9de 677arguments are taken as aliases for I<$object>.
67d7b5ef 678
f2a2953c
JH
679See L<Encode::Encoding> for more details.
680
7e19fb92
JH
681=head1 The UTF-8 flag
682
683Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de
JH
684just compared the strings represented by two scalars. Beginning with
685perl 5.8, C<eq> compares two strings with simultaneous consideration
686of I<the utf8 flag>. To explain why we made it so, I will quote page
687402 of C<Programming Perl, 3rd ed.>
7e19fb92
JH
688
689=over 2
690
691=item Goal #1:
692
693Old byte-oriented programs should not spontaneously break on the old
694byte-oriented data they used to work on.
695
696=item Goal #2:
697
698Old byte-oriented programs should magically start working on the new
699character-oriented data when appropriate.
700
701=item Goal #3:
702
703Programs should run just as fast in the new character-oriented mode
704as in the old byte-oriented mode.
705
706=item Goal #4:
707
708Perl should remain one language, rather than forking into a
709byte-oriented Perl and a character-oriented Perl.
710
711=back
712
713Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
714was born and many features documented in the book remained
b7a5c9de
JH
715unimplemented for a long time. Perl 5.8 corrected this and the introduction
716of the UTF-8 flag is one of them. You can think of this perl notion as of a
717byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
7e19fb92
JH
718flag on).
719
720Here is how Encode takes care of the utf8 flag.
721
4bdf5738 722=over 2
7e19fb92
JH
723
724=item *
725
726When you encode, the resulting utf8 flag is always off.
727
151b5d36 728=item *
7e19fb92 729
b7a5c9de 730When you decode, the resulting utf8 flag is on unless you can
7e19fb92
JH
731unambiguously represent data. Here is the definition of
732dis-ambiguity.
733
b7a5c9de 734After C<$utf8 = decode('foo', $octet);>,
7e19fb92
JH
735
736 When $octet is... The utf8 flag in $utf8 is
737 ---------------------------------------------
738 In ASCII only (or EBCDIC only) OFF
739 In ISO-8859-1 ON
740 In any other Encoding ON
741 ---------------------------------------------
742
3c4b39be 743As you see, there is one exception: ASCII. That way you can assume
7e19fb92
JH
744Goal #1. And with Encode Goal #2 is assumed but you still have to be
745careful in such cases mentioned in B<CAVEAT> paragraphs.
746
747This utf8 flag is not visible in perl scripts, exactly for the same
748reason you cannot (or you I<don't have to>) see if a scalar contains a
749string, integer, or floating point number. But you can still peek
750and poke these if you will. See the section below.
751
752=back
753
754=head2 Messing with Perl's Internals
4411f3b6 755
47bfe92f 756The following API uses parts of Perl's internals in the current
0ab8f81e 757implementation. As such, they are efficient but may change.
4411f3b6 758
7e19fb92 759=over 2
4411f3b6 760
a63c962f 761=item is_utf8(STRING [, CHECK])
4411f3b6 762
0ab8f81e 763[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f
JH
764If CHECK is true, also checks the data in STRING for being well-formed
765UTF-8. Returns true if successful, false otherwise.
4411f3b6 766
2c246b25 767As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 768
a63c962f 769=item _utf8_on(STRING)
4411f3b6 770
0ab8f81e 771[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6
NIS
772B<not> checked for being well-formed UTF-8. Do not use unless you
773B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e
JH
774state of the UTF-8 flag (so please don't treat the return value as
775indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 776
a63c962f 777=item _utf8_off(STRING)
4411f3b6 778
0ab8f81e
JH
779[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
780Returns the previous state of the UTF-8 flag (so please don't treat the
781return value as indicating success or failure), or C<undef> if STRING is
4411f3b6
NIS
782not a string.
783
784=back
785
7f0d54d7
RGS
786=head1 UTF-8 vs. utf8
787
788 ....We now view strings not as sequences of bytes, but as sequences
789 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
790 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
791
792That has been perl's notion of UTF-8, but official UTF-8 is
793stricter: its range is much narrower (0 .. 10FFFF), and some sequences
794are not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
795
796Now that is overruled by Larry Wall himself.
797
798 From: Larry Wall <larry@wall.org>
799 Date: December 04, 2004 11:51:58 JST
800 To: perl-unicode@perl.org
801 Subject: Re: Make Encode.pm support the real UTF-8
802 Message-Id: <20041204025158.GA28754@wall.org>
803
804 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
805 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
806 : but "UTF-8" is the name of the standard and should give the
807 : corresponding behaviour.
808
809 For what it's worth, that's how I've always kept them straight in my
810 head.
8e180e82 811
7f0d54d7
RGS
812 Also for what it's worth, Perl 6 will mostly default to strict but
813 make it easy to switch back to lax.
814
815 Larry
816
817Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
818while B<utf8> means liberal, lax, version thereof. And Encode version
8192.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
820
821 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
822 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
823
824C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
825Yes, the hyphen between "UTF" and "8" is important. Without it Encode
826goes "liberal"
827
828 find_encoding("UTF-8")->name # is 'utf-8-strict'
829 find_encoding("utf-8")->name # ditto. names are case insensitive
50c1ac04 830 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7
RGS
831 find_encoding("UTF8")->name # is 'utf8'.
832
833
4411f3b6
NIS
834=head1 SEE ALSO
835
5d030b67
JH
836L<Encode::Encoding>,
837L<Encode::Supported>,
6d1c0808 838L<Encode::PerlIO>,
5d030b67 839L<encoding>,
6d1c0808
JH
840L<perlebcdic>,
841L<perlfunc/open>,
842L<perlunicode>,
843L<utf8>,
5d030b67 844the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 845
85982a32 846=head1 MAINTAINER
aae85ceb
DK
847
848This project was originated by Nick Ing-Simmons and later maintained
7e19fb92
JH
849by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
850list of people involved. For any questions, use
b7a5c9de 851E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 852
4411f3b6 853=cut