This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Use both cache entries for Perl_sv_pos_b2u().
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
10c5ecbb 1#
41c240f5 2# $Id: Encode.pm,v 2.14 2006/01/15 15:43:36 dankogai Exp dankogai $
10c5ecbb 3#
2c674647 4package Encode;
51ef4e11 5use strict;
41c240f5 6our $VERSION = sprintf "%d.%02d", q$Revision: 2.14 $ =~ /(\d+)/g;
8f139f4c 7sub DEBUG () { 0 }
6d1c0808 8use XSLoader ();
10c5ecbb 9XSLoader::load(__PACKAGE__, $VERSION);
2c674647 10
2c674647 11require Exporter;
7e19fb92 12use base qw/Exporter/;
2c674647 13
4411f3b6 14# Public, encouraged API is exported by default
85982a32
JH
15
16our @EXPORT = qw(
17 decode decode_utf8 encode encode_utf8
a0d8a30e 18 encodings find_encoding clone_encoding
4411f3b6
NIS
19);
20
b7a5c9de 21our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
56ff7374 22 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL);
b7a5c9de 23our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
af1f55d9 24 FB_PERLQQ FB_HTMLCREF FB_XMLCREF);
85982a32 25
51ef4e11 26our @EXPORT_OK =
6d1c0808 27 (
85982a32
JH
28 qw(
29 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
30 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
31 ),
32 @FB_FLAGS, @FB_CONSTS,
33 );
34
6d1c0808 35our %EXPORT_TAGS =
85982a32
JH
36 (
37 all => [ @EXPORT, @EXPORT_OK ],
38 fallbacks => [ @FB_CONSTS ],
39 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
40 );
41
4411f3b6 42# Documentation moved after __END__ for speed - NI-S
2c674647 43
a63c962f 44our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 45
5d030b67
JH
46use Encode::Alias;
47
5129552c
JH
48# Make a %Encoding package variable to allow a certain amount of cheating
49our %Encoding;
aae85ceb
DK
50our %ExtModule;
51require Encode::Config;
52eval { require Encode::ConfigLocal };
5129552c 53
656753f8
NIS
# Return the canonical names of known encodings, sorted case-insensitively.
#
# Called as a class method: Encode->encodings(...).
#   - no arguments : only the encodings already registered in %Encoding
#   - ":all" first : every encoding listed in %ExtModule is included too
#   - module names : encodings that %ExtModule maps to those modules are
#                    added ("Encode::" is prepended when a name has no "::")
# The entries named Internal, Unicode and Guess are never reported.
sub encodings
{
    my ($class, @wanted) = @_;
    my %listed = %Encoding;
    if (@wanted and $wanted[0] eq ":all"){
        %listed = ( %Encoding, %ExtModule );
    }else{
        for my $arg (@wanted){
            my $module = $arg =~ m/::/o ? $arg : "Encode::$arg";
            DEBUG and warn $module;
            for my $candidate (keys %ExtModule){
                next unless $ExtModule{$candidate} eq $module;
                $listed{$candidate} = $module;
            }
        }
    }
    my @names = grep { !/^(?:Internal|Unicode|Guess)$/o } keys %listed;
    return sort { lc $a cmp lc $b } @names;
}
73
# Tell whether an encoding can be used as a PerlIO layer.
#
# The argument is either an encoding object or an encoding name that is
# looked up with find_encoding().  Returns whatever the object's own
# perlio_ok() method returns, or 0 when the encoding is unknown or the
# object has no such method.
sub perlio_ok{
    my ($target) = @_;
    my $obj = ref($target) ? $target : find_encoding($target);
    # An unknown name resolves to nothing; answer "no" instead of dying
    # on a method call against an undefined value.
    return 0 unless defined $obj;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
79
51ef4e11
NIS
# Register an encoding object under its canonical name.
#
#   define_encoding($object, $canonical_name [, @aliases])
#
# The object is stored in %Encoding under $canonical_name.  When the name
# is not already all lower case, its lower-cased form is defined as an
# alias, and so is every additional argument.  Returns the object.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;
    my $lower = lc($name);
    if ($lower ne $name){
        define_alias($lower => $obj);
    }
    for my $alias (@aliases){
        define_alias($alias, $obj);
    }
    return $obj;
}
93
656753f8
NIS
# Resolve an encoding name (or object) to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. $name is already an object that can('renew')  -> returned as is
#   2. exact, then lower-cased, key in %Encoding
#   3. $class->find_alias() on the exact, then lower-cased, name
#   4. unless $skip_external is true, the module listed for the name in
#      %ExtModule is loaded and %Encoding is consulted again
# Returns the object, or nothing (empty list / undef) when unresolved.
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    # Nothing can be looked up for an undefined name; bail out before it
    # is used as a hash key or handed to lc() and find_alias().
    defined($name) or return;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # Best effort: a module that fails to load simply leaves the
            # encoding unresolved.
            eval{ require $mod; };
            exists $Encoding{$name} and return $Encoding{$name};
            # The module may have been selected through the lower-cased
            # name, so check that key as well.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
118
# Functional front end to getEncoding(): look up an encoding by name.
# A true second argument suppresses loading of external modules.
# Returns the encoding object, or nothing when the name is unknown.
sub find_encoding($;$)
{
    my $wanted      = $_[0];
    my $no_external = $_[1];
    return __PACKAGE__->getEncoding($wanted, $no_external);
}
124
# Map an encoding name or alias to the name reported by the resolved
# encoding object.  Returns nothing when the name cannot be resolved.
sub resolve_alias($){
    my ($alias) = @_;
    my $found = find_encoding($alias);
    return unless defined $found;
    return $found->name;
}
130
a0d8a30e
DK
# Return a deep copy (made with Storable::dclone) of the encoding object
# found for the given name.  Returns nothing when the lookup does not
# yield a reference or when Storable cannot be loaded.
sub clone_encoding($){
    my ($name) = @_;
    my $found = find_encoding($name);
    return unless ref $found;
    eval { require Storable };
    return if $@;
    return Storable::dclone($found);
}
138
# Encode a Perl string into octets.
#
#   $octets = encode(ENCODING, $string [, CHECK])
#
# ENCODING is a name or alias resolved with find_encoding(); an unknown
# encoding croaks.  An undefined $string yields undef.  A reference is
# stringified first.  CHECK defaults to 0 and may be a bitmask or a code
# reference.  When CHECK is a true bitmask without LEAVE_SRC, the caller's
# $string is overwritten with whatever the encoder left of the source.
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $string .= '' if ref $string; # stringify;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string,$check);
    # A coderef CHECK is not a bitmask: never AND it with LEAVE_SRC, and
    # leave the caller's source untouched in that case.
    $_[1] = $string if $check and !ref $check and !($check & LEAVE_SRC());
    return $octets;
}
154
# Decode octets into a Perl string.
#
#   $string = decode(ENCODING, $octets [, CHECK])
#
# ENCODING is a name or alias resolved with find_encoding(); an unknown
# encoding croaks.  Undefined $octets yield undef.  A reference is
# stringified first.  CHECK defaults to 0 and may be a bitmask or a code
# reference.  When CHECK is a true bitmask without LEAVE_SRC, the caller's
# $octets is overwritten with whatever the decoder left of the source.
sub decode($$;$)
{
    my ($name,$octets,$check) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets,$check);
    # A coderef CHECK is not a bitmask: never AND it with LEAVE_SRC, and
    # leave the caller's source untouched in that case.
    $_[1] = $octets if $check and !ref $check and !($check & LEAVE_SRC());
    return $string;
}
170
# Convert octets from one encoding to another, in place.
#
#   [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# The first argument is overwritten with the converted octets.  Either
# encoding being unknown croaks.  Returns the length of the converted
# data, or undef when the input is undefined, when CHECK is true and the
# intermediate string still has content after encoding, or when the
# conversion produced an undefined result.
sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    defined $string or return undef;
    $check = 0 unless $check;

    my $f = find_encoding($from);
    if (!defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    if (!defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    my $uni = $f->decode($string);
    my $converted = $t->encode($uni,$check);
    $_[0] = $converted;            # write back through the caller's alias
    if ($check && length($uni)){
        return undef;
    }
    return defined($_[0]) ? length($converted) : undef ;
}
191
# Return the UTF-8 octets for a Perl string.  Works on a private copy, so
# the caller's string is left untouched.
sub encode_utf8($)
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
198
# Decode UTF-8 octets into a Perl string.
#
#   $string = decode_utf8($octets [, CHECK])
#
# A value that is_utf8() already reports as flagged is returned unchanged.
# Otherwise the work is delegated to decode("utf8", ...); CHECK is only
# forwarded when it is true.  The octets are decoded from a private copy,
# so the caller's argument is never modified here.
sub decode_utf8($;$)
{
    my ($str, $check) = @_;
    return $str if is_utf8($str);
    return decode("utf8", $str, $check) if $check;
    return decode("utf8", $str);
}
210
b536bf57 211predefine_encodings(1);
f2a2953c
JH
212
213#
214# This is to restore %Encoding if really needed;
215#
10c5ecbb 216
# predefine_encodings($use_xs)
#
# (Re)installs the built-in entries of %Encode::Encoding:
#   Unicode       - Encode::UTF_EBCDIC when $ON_EBCDIC is true,
#                   Encode::Internal otherwise
#   utf8          - Encode::utf8
#   utf-8-strict  - Encode::utf8 with strict_utf8 set
# With a true $use_xs, Encode::utf8's decode/encode are aliased to
# decode_xs/encode_xs (not defined in this file; presumably supplied by
# the XS part of Encode -- confirm).  Otherwise pure-Perl versions built
# on Encode::decode_utf8/Encode::encode_utf8 are installed.
f2a2953c 217sub predefine_encodings{
10c5ecbb 218 use Encode::Encoding;
b536bf57
DK
# The glob assignments below may replace subs that already exist.
219 no warnings 'redefine';
220 my $use_xs = shift;
6d1c0808 221 if ($ON_EBCDIC) {
f2a2953c
JH
222 # was in Encode::UTF_EBCDIC
223 package Encode::UTF_EBCDIC;
10c5ecbb 224 push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
f2a2953c
JH
# decode: map every character through utf8::unicode_to_native.
225 *decode = sub{
226 my ($obj,$str,$chk) = @_;
227 my $res = '';
228 for (my $i = 0; $i < length($str); $i++) {
6d1c0808 229 $res .=
f2a2953c
JH
230 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
231 }
# A true check value empties the caller's source string.
232 $_[1] = '' if $chk;
233 return $res;
234 };
# encode: map every character through utf8::native_to_unicode.
235 *encode = sub{
236 my ($obj,$str,$chk) = @_;
237 my $res = '';
238 for (my $i = 0; $i < length($str); $i++) {
6d1c0808 239 $res .=
f2a2953c
JH
240 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
241 }
242 $_[1] = '' if $chk;
243 return $res;
244 };
6d1c0808 245 $Encode::Encoding{Unicode} =
c731e18e 246 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
6d1c0808 247 } else {
f2a2953c 248 package Encode::Internal;
10c5ecbb 249 push @Encode::Internal::ISA, 'Encode::Encoding';
f2a2953c
JH
# decode: return an upgraded copy of the string; a true check value
# empties the caller's source string.
250 *decode = sub{
251 my ($obj,$str,$chk) = @_;
252 utf8::upgrade($str);
253 $_[1] = '' if $chk;
254 return $str;
255 };
# Encoding is the very same operation as decoding for this class.
256 *encode = \&decode;
6d1c0808 257 $Encode::Encoding{Unicode} =
c731e18e 258 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c
JH
259 }
260
261 {
262 # was in Encode::utf8
263 package Encode::utf8;
10c5ecbb 264 push @Encode::utf8::ISA, 'Encode::Encoding';
b536bf57
DK
265 #
266 if ($use_xs){
8f139f4c 267 Encode::DEBUG and warn __PACKAGE__, " XS on";
b536bf57
DK
268 *decode = \&decode_xs;
269 *encode = \&encode_xs;
270 }else{
8f139f4c 271 Encode::DEBUG and warn __PACKAGE__, " XS off";
b536bf57
DK
# Pure-Perl decode: the check value is not forwarded to
# Encode::decode_utf8; it only decides whether the caller's source is
# emptied after a defined result.
272 *decode = sub{
273 my ($obj,$octets,$chk) = @_;
274 my $str = Encode::decode_utf8($octets);
275 if (defined $str) {
276 $_[1] = '' if $chk;
277 return $str;
278 }
279 return undef;
280 };
281 *encode = sub {
282 my ($obj,$string,$chk) = @_;
283 my $octets = Encode::encode_utf8($string);
284 $_[1] = '' if $chk;
285 return $octets;
286 };
287 }
220e2d4e
IH
# cat_decode: append to the destination the part of the source that runs
# from $pos up to and including the terminator $trm, and advance $pos
# past it; returns 1.  When the terminator is not found, the rest of the
# source is appended, $pos is set to the source length and '' is
# returned.  Destination, source and position are updated through
# references to the caller's arguments.
288 *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
289 my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
290 my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
# index/substr/length below operate with byte semantics.
291 use bytes;
292 if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
293 $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
294 $$rpos = $npos + length($trm);
295 return 1;
296 }
297 $$rdst .= substr($$rsrc, $pos);
298 $$rpos = length($$rsrc);
299 return '';
300 };
b7a5c9de 301 $Encode::Encoding{utf8} =
c731e18e 302 bless {Name => "utf8"} => "Encode::utf8";
7f0d54d7
RGS
303 $Encode::Encoding{"utf-8-strict"} =
304 bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
f2a2953c 305 }
f2a2953c
JH
306}
307
656753f8
NIS
3081;
309
2a936312
NIS
310__END__
311
4411f3b6
NIS
312=head1 NAME
313
314Encode - character encodings
315
316=head1 SYNOPSIS
317
318 use Encode;
319
67d7b5ef
JH
320=head2 Table of Contents
321
0ab8f81e 322Encode consists of a collection of modules whose details are too big
67d7b5ef 323to fit in one document. This POD itself explains the top-level APIs
6d1c0808 324and general topics at a glance. For other topics and more details,
0ab8f81e 325see the PODs below:
67d7b5ef
JH
326
327 Name Description
328 --------------------------------------------------------
6d1c0808 329 Encode::Alias Alias definitions to encodings
67d7b5ef
JH
330 Encode::Encoding Encode Implementation Base Class
331 Encode::Supported List of Supported Encodings
332 Encode::CN Simplified Chinese Encodings
333 Encode::JP Japanese Encodings
334 Encode::KR Korean Encodings
335 Encode::TW Traditional Chinese Encodings
336 --------------------------------------------------------
337
4411f3b6
NIS
338=head1 DESCRIPTION
339
47bfe92f 340The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
341and the rest of the system. Perl strings are sequences of
342B<characters>.
343
344The repertoire of characters that Perl can represent is at least that
345defined by the Unicode Consortium. On most platforms the ordinal
346values of the characters (as returned by C<ord(ch)>) is the "Unicode
347codepoint" for the character (the exceptions are those platforms where
348the legacy encoding is some variant of EBCDIC rather than a super-set
349of ASCII - see L<perlebcdic>).
350
0ab8f81e 351Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef
JH
352often called "bytes". These chunks are also known as "octets" in
353networking standards. Perl is widely used to manipulate data of many
354types - not only strings of characters representing human or computer
0ab8f81e 355languages but also "binary" data being the machine's representation of
67d7b5ef
JH
356numbers, pixels in an image - or just about anything.
357
0ab8f81e 358When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 359process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 360byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
361"logical character".
362
363=head2 TERMINOLOGY
4411f3b6 364
7e19fb92 365=over 2
21938dfa 366
67d7b5ef
JH
367=item *
368
369I<character>: a character in the range 0..(2**32-1) (or more).
370(What Perl's strings are made of.)
371
372=item *
373
374I<byte>: a character in the range 0..255
375(A special case of a Perl character.)
376
377=item *
378
379I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 380(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef
JH
381
382=back
4411f3b6 383
67d7b5ef 384=head1 PERL ENCODING API
4411f3b6 385
7e19fb92 386=over 2
4411f3b6 387
b7a5c9de 388=item $octets = encode(ENCODING, $string [, CHECK])
4411f3b6 389
0ab8f81e 390Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 391a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e
JH
392an alias. For encoding names and aliases, see L</"Defining Aliases">.
393For CHECK, see L</"Handling Malformed Data">.
4411f3b6 394
b7a5c9de 395For example, to convert a string from Perl's internal format to
6d1c0808 396iso-8859-1 (also known as Latin1),
681a7c68 397
b7a5c9de 398 $octets = encode("iso-8859-1", $string);
7e19fb92 399
b7a5c9de
JH
400B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
401B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
7e19fb92
JH
402for $octets is B<always> off. When you encode anything, utf8 flag of
403the result is always off, even when it contains completely valid utf8
404string. See L</"The UTF-8 flag"> below.
681a7c68 405
7f0d54d7 406If the $string is C<undef> then C<undef> is returned.
4089adc4 407
b7a5c9de 408=item $string = decode(ENCODING, $octets [, CHECK])
4411f3b6 409
0ab8f81e
JH
410Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
411internal form and returns the resulting string. As in encode(),
412ENCODING can be either a canonical name or an alias. For encoding names
413and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f
JH
414L</"Handling Malformed Data">.
415
b7a5c9de 416For example, to convert ISO-8859-1 data to a string in Perl's internal format:
681a7c68 417
b7a5c9de 418 $string = decode("iso-8859-1", $octets);
681a7c68 419
b7a5c9de
JH
420B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
421B<may not be equal to> $octets. Though they both contain the same data,
422the utf8 flag for $string is on unless $octets entirely consists of
7e19fb92
JH
423ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
424below.
47bfe92f 425
7f0d54d7 426If $octets is C<undef> then C<undef> is returned.
4089adc4 427
b7a5c9de 428=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
7e19fb92 429
b7a5c9de
JH
430Converts B<in-place> data between two encodings. The data in $octets
431must be encoded as octets and not as characters in Perl's internal
f9d05ba3
RGS
432format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
433encoding:
2b106fbe 434
b7a5c9de 435 from_to($octets, "iso-8859-1", "cp1250");
2b106fbe
JH
436
437and to convert it back:
438
b7a5c9de 439 from_to($octets, "cp1250", "iso-8859-1");
4411f3b6 440
ab97ca19 441Note that because the conversion happens in place, the data to be
0ab8f81e 442converted cannot be a string constant; it must be a scalar variable.
ab97ca19 443
f9d05ba3
RGS
444from_to() returns the length of the converted string in octets on
445success, I<undef> on error.
3ef515df 446
b7a5c9de 447B<CAVEAT>: The following operations look the same but are not quite so;
7e19fb92 448
b7a5c9de 449 from_to($data, "iso-8859-1", "utf8"); #1
7e19fb92 450 $data = decode("iso-8859-1", $data); #2
4411f3b6 451
b7a5c9de 452Both #1 and #2 make $data consist of a completely valid UTF-8 string
7e19fb92 453but only #2 turns utf8 flag on. #1 is equivalent to
f2a2953c 454
7e19fb92 455 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 456
7e19fb92 457See L</"The UTF-8 flag"> below.
f2a2953c
JH
458
459=item $octets = encode_utf8($string);
460
7e19fb92 461Equivalent to C<$octets = encode("utf8", $string);> The characters
b7a5c9de
JH
462that comprise $string are encoded in Perl's internal format and the
463result is returned as a sequence of octets. All possible
7e19fb92
JH
464characters have a UTF-8 representation so this function cannot fail.
465
f2a2953c
JH
466
467=item $string = decode_utf8($octets [, CHECK]);
468
7e19fb92 469Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
b7a5c9de 470The sequence of octets represented by
7e19fb92
JH
471$octets is decoded from UTF-8 into a sequence of logical
472characters. Not all sequences of octets form valid UTF-8 encodings, so
473it is possible for this call to fail. For CHECK, see
474L</"Handling Malformed Data">.
f2a2953c
JH
475
476=back
477
51ef4e11
NIS
478=head2 Listing available encodings
479
5129552c
JH
480 use Encode;
481 @list = Encode->encodings();
482
483Returns a list of the canonical names of the available encodings that
484are loaded. To get a list of all available encodings including the
485ones that are not loaded yet, say
486
487 @all_encodings = Encode->encodings(":all");
488
0ab8f81e 489Or you can give the name of a specific module.
5129552c 490
c731e18e
JH
491 @with_jp = Encode->encodings("Encode::JP");
492
493When "::" is not in the name, "Encode::" is assumed.
51ef4e11 494
c731e18e 495 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 496
0ab8f81e 497To find out in detail which encodings are supported by this package,
5d030b67 498see L<Encode::Supported>.
51ef4e11
NIS
499
500=head2 Defining Aliases
501
0ab8f81e 502To add a new alias to a given encoding, use:
67d7b5ef 503
5129552c
JH
504 use Encode;
505 use Encode::Alias;
a63c962f 506 define_alias(newName => ENCODING);
51ef4e11 507
3ef515df 508After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
509ENCODING may be either the name of an encoding or an
510I<encoding object>
51ef4e11 511
fcb875d4
JH
512But before you do so, make sure the alias is nonexistent with
513C<resolve_alias()>, which returns the canonical name thereof.
514i.e.
515
516 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
517 Encode::resolve_alias("iso-8859-12") # false; nonexistent
518 Encode::resolve_alias($name) eq $name # true if $name is canonical
519
0ab8f81e
JH
520resolve_alias() does not need C<use Encode::Alias>; it can be
521exported via C<use Encode qw(resolve_alias)>.
fcb875d4 522
0ab8f81e 523See L<Encode::Alias> for details.
51ef4e11 524
85982a32 525=head1 Encoding via PerlIO
4411f3b6 526
b7a5c9de 527If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
0ab8f81e
JH
528and encode directly via a filehandle. The following two examples
529are totally identical in their functionality.
4411f3b6 530
85982a32
JH
531 # via PerlIO
532 open my $in, "<:encoding(shiftjis)", $infile or die;
533 open my $out, ">:encoding(euc-jp)", $outfile or die;
b7a5c9de 534 while(<$in>){ print $out $_; }
8e86646e 535
85982a32 536 # via from_to
0ab8f81e
JH
537 open my $in, "<", $infile or die;
538 open my $out, ">", $outfile or die;
b7a5c9de 539 while(<$in>){
0ab8f81e 540 from_to($_, "shiftjis", "euc-jp", 1);
b7a5c9de 541 print $out $_;
85982a32 542 }
4411f3b6 543
b7a5c9de 544Unfortunately, not all encodings are PerlIO-savvy. You can check
0ab8f81e
JH
545if your encoding is supported by PerlIO by calling the C<perlio_ok>
546method.
547
548 Encode::perlio_ok("hz"); # False
549 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
550
551 use Encode qw(perlio_ok); # exported upon request
552 perlio_ok("euc-jp")
4411f3b6 553
0ab8f81e 554Fortunately, all encodings that come with Encode core are PerlIO-savvy
f9d05ba3
RGS
555except for hz and ISO-2022-kr. For gory details, see
556L<Encode::Encoding> and L<Encode::PerlIO>.
4411f3b6 557
85982a32 558=head1 Handling Malformed Data
4411f3b6 559
8e180e82
SP
560The optional I<CHECK> argument tells Encode what to do when it
561encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
562is assumed.
563
564As of version 2.12 Encode supports coderef values for CHECK. See below.
f9d05ba3
RGS
565
566=over 2
567
3c4b39be 568=item B<NOTE:> Not all encodings support this feature
f9d05ba3
RGS
569
570Some encodings ignore I<CHECK> argument. For example,
571L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
572
573=back
574
575Now here is the list of I<CHECK> values available
47bfe92f 576
151b5d36
JH
577=over 2
578
85982a32 579=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 580
f9d05ba3 581If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
78589665
RGS
582place of a malformed character. When you encode, E<lt>subcharE<gt>
583will be used. When you decode the code point C<0xFFFD> is used. If
584the data is supposed to be UTF-8, an optional lexical warning
585(category utf8) is given.
e9692b5b 586
7e19fb92 587=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 588
b7a5c9de 589If I<CHECK> is 1, methods will die on error immediately with an error
0ab8f81e 590message. Therefore, when I<CHECK> is set to 1, you should trap the
f9d05ba3 591error with eval{} unless you really want to let it die.
47bfe92f 592
85982a32 593=item I<CHECK> = Encode::FB_QUIET
47bfe92f 594
85982a32 595If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
f9d05ba3
RGS
596return the portion of the data that has been processed so far when an
597error occurs. The data argument will be overwritten with everything
598after that point (that is, the unprocessed part of data). This is
599handy when you have to call decode repeatedly in the case where your
600source data may contain partial multi-byte character sequences,
601(i.e. you are reading with a fixed-width buffer). Here is a sample
602code that does exactly this:
4411f3b6 603
78589665
RGS
604 my $buffer = ''; my $string = '';
605 while(read $fh, $buffer, 256, length($buffer)){
606 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
607 # $buffer now contains the unprocessed partial character
85982a32 608 }
1768d7eb 609
85982a32 610=item I<CHECK> = Encode::FB_WARN
67d7b5ef 611
0ab8f81e
JH
612This is the same as above, except that it warns on error. Handy when
613you are debugging the mode above.
85982a32
JH
614
615=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
616
af1f55d9
JH
617=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
618
619=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
620
85982a32
JH
621For encodings that are implemented by Encode::XS, CHECK ==
622Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
623
b7a5c9de
JH
624When you decode, C<\xI<HH>> will be inserted for a malformed character,
625where I<HH> is the hex representation of the octet that could not be
626decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
627where I<HHHH> is the Unicode ID of the character that cannot be found
0ab8f81e 628in the character repertoire of the encoding.
85982a32 629
af1f55d9 630HTML/XML character reference modes are about the same, in place of
78589665
RGS
631C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
632XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
af1f55d9 633
7f0d54d7
RGS
634In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
635
85982a32
JH
636=item The bitmask
637
0ab8f81e
JH
638These modes are actually set via a bitmask. Here is how the FB_XX
639constants are laid out. You can import the FB_XX constants via
640C<use Encode qw(:fallbacks)>; you can import the generic bitmask
641constants via C<use Encode qw(:fallback_all)>.
85982a32 642
b0b300a3
JH
643 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
644 DIE_ON_ERR 0x0001 X
4089adc4 645 WARN_ON_ERR 0x0002 X
b0b300a3 646 RETURN_ON_ERR 0x0004 X X
7f0d54d7 647 LEAVE_SRC 0x0008 X
b0b300a3 648 PERLQQ 0x0100 X
b7a5c9de
JH
649 HTMLCREF 0x0200
650 XMLCREF 0x0400
67d7b5ef 651
151b5d36
JH
652=back
653
8e180e82
SP
654=head2 coderef for CHECK
655
656As of Encode 2.12 CHECK can also be a code reference which takes the
657ord value of the unmapped character as an argument and returns a string
658that represents the fallback character. For instance,
67d7b5ef 659
8e180e82 660 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
67d7b5ef 661
8e180e82
SP
662Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
663\x{I<XXXX>}.
982a4085 664
67d7b5ef
JH
665=head1 Defining Encodings
666
667To define a new encoding, use:
668
b7a5c9de 669 use Encode qw(define_encoding);
67d7b5ef
JH
670 define_encoding($object, 'canonicalName' [, alias...]);
671
672I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 673should provide the interface described in L<Encode::Encoding>.
67d7b5ef 674If more than two arguments are provided then additional
b7a5c9de 675arguments are taken as aliases for I<$object>.
67d7b5ef 676
f2a2953c
JH
677See L<Encode::Encoding> for more details.
678
7e19fb92
JH
679=head1 The UTF-8 flag
680
681Before the introduction of utf8 support in perl, the C<eq> operator
b7a5c9de
JH
682just compared the strings represented by two scalars. Beginning with
683perl 5.8, C<eq> compares two strings with simultaneous consideration
684of I<the utf8 flag>. To explain why we made it so, I will quote page
685402 of C<Programming Perl, 3rd ed.>
7e19fb92
JH
686
687=over 2
688
689=item Goal #1:
690
691Old byte-oriented programs should not spontaneously break on the old
692byte-oriented data they used to work on.
693
694=item Goal #2:
695
696Old byte-oriented programs should magically start working on the new
697character-oriented data when appropriate.
698
699=item Goal #3:
700
701Programs should run just as fast in the new character-oriented mode
702as in the old byte-oriented mode.
703
704=item Goal #4:
705
706Perl should remain one language, rather than forking into a
707byte-oriented Perl and a character-oriented Perl.
708
709=back
710
711Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
712was born and many features documented in the book remained
b7a5c9de
JH
713unimplemented for a long time. Perl 5.8 corrected this and the introduction
714of the UTF-8 flag is one of them. You can think of this perl notion as of a
715byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
7e19fb92
JH
716flag on).
717
718Here is how Encode takes care of the utf8 flag.
719
4bdf5738 720=over 2
7e19fb92
JH
721
722=item *
723
724When you encode, the resulting utf8 flag is always off.
725
151b5d36 726=item *
7e19fb92 727
b7a5c9de 728When you decode, the resulting utf8 flag is on unless you can
7e19fb92
JH
729unambiguously represent data. Here is the definition of
730dis-ambiguity.
731
b7a5c9de 732After C<$utf8 = decode('foo', $octet);>,
7e19fb92
JH
733
734 When $octet is... The utf8 flag in $utf8 is
735 ---------------------------------------------
736 In ASCII only (or EBCDIC only) OFF
737 In ISO-8859-1 ON
738 In any other Encoding ON
739 ---------------------------------------------
740
3c4b39be 741As you see, there is one exception: ASCII. That way you can assume
7e19fb92
JH
742Goal #1. And with Encode Goal #2 is assumed but you still have to be
743careful in such cases mentioned in B<CAVEAT> paragraphs.
744
745This utf8 flag is not visible in perl scripts, exactly for the same
746reason you cannot (or you I<don't have to>) see if a scalar contains a
747string, integer, or floating point number. But you can still peek
748and poke these if you will. See the section below.
749
750=back
751
752=head2 Messing with Perl's Internals
4411f3b6 753
47bfe92f 754The following API uses parts of Perl's internals in the current
0ab8f81e 755implementation. As such, they are efficient but may change.
4411f3b6 756
7e19fb92 757=over 2
4411f3b6 758
a63c962f 759=item is_utf8(STRING [, CHECK])
4411f3b6 760
0ab8f81e 761[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f
JH
762If CHECK is true, also checks the data in STRING for being well-formed
763UTF-8. Returns true if successful, false otherwise.
4411f3b6 764
2c246b25 765As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
b5ab1f6f 766
a63c962f 767=item _utf8_on(STRING)
4411f3b6 768
0ab8f81e 769[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6
NIS
770B<not> checked for being well-formed UTF-8. Do not use unless you
771B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e
JH
772state of the UTF-8 flag (so please don't treat the return value as
773indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 774
a63c962f 775=item _utf8_off(STRING)
4411f3b6 776
0ab8f81e
JH
777[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
778Returns the previous state of the UTF-8 flag (so please don't treat the
779return value as indicating success or failure), or C<undef> if STRING is
4411f3b6
NIS
780not a string.
781
782=back
783
7f0d54d7
RGS
784=head1 UTF-8 vs. utf8
785
786 ....We now view strings not as sequences of bytes, but as sequences
787 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
788 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
789
790That has been perl's notion of UTF-8, but official UTF-8 is more
791strict: its range is much narrower (0 .. 10FFFF) and some sequences are
792not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
793
794Now that is overruled by Larry Wall himself.
795
796 From: Larry Wall <larry@wall.org>
797 Date: December 04, 2004 11:51:58 JST
798 To: perl-unicode@perl.org
799 Subject: Re: Make Encode.pm support the real UTF-8
800 Message-Id: <20041204025158.GA28754@wall.org>
801
802 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
803 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
804 : but "UTF-8" is the name of the standard and should give the
805 : corresponding behaviour.
806
807 For what it's worth, that's how I've always kept them straight in my
808 head.
8e180e82 809
7f0d54d7
RGS
810 Also for what it's worth, Perl 6 will mostly default to strict but
811 make it easy to switch back to lax.
812
813 Larry
814
815Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
816while B<utf8> means liberal, lax, version thereof. And Encode version
8172.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
818
819 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
820 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
821
822C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
823Yes, the hyphen between "UTF" and "8" is important. Without it Encode
824goes "liberal"
825
826 find_encoding("UTF-8")->name # is 'utf-8-strict'
827 find_encoding("utf-8")->name # ditto. names are case insensitive
50c1ac04 828 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
7f0d54d7
RGS
829 find_encoding("UTF8")->name # is 'utf8'.
830
831
4411f3b6
NIS
832=head1 SEE ALSO
833
5d030b67
JH
834L<Encode::Encoding>,
835L<Encode::Supported>,
6d1c0808 836L<Encode::PerlIO>,
5d030b67 837L<encoding>,
6d1c0808
JH
838L<perlebcdic>,
839L<perlfunc/open>,
840L<perlunicode>,
841L<utf8>,
5d030b67 842the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 843
85982a32 844=head1 MAINTAINER
aae85ceb
DK
845
846This project was originated by Nick Ing-Simmons and later maintained
7e19fb92
JH
847by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
848list of people involved. For any questions, use
b7a5c9de 849E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 850
4411f3b6 851=cut