This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Changes.
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
package Encode;
use strict;

# The version number is computed from the RCS revision keyword below.
our $VERSION = do { my @r = (q$Revision: 1.62 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# When true, encodings() warns each module name it is asked to scan.
our $DEBUG = 0;

use XSLoader ();
XSLoader::load 'Encode';

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default
our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8
  encodings  find_encoding
);

# Names of the fallback flags and fallback constants offered for export.
our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
                    PERLQQ HTMLCREF XMLCREF);
our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
                    FB_PERLQQ FB_HTMLCREF FB_XMLCREF);

# Everything that can be imported on request.
our @EXPORT_OK = qw(
  _utf8_off  _utf8_on  define_encoding  from_to  is_16bit  is_8bit
  is_utf8  perlio_ok  resolve_alias  utf8_downgrade  utf8_upgrade
);
push @EXPORT_OK, @FB_FLAGS, @FB_CONSTS;

our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);

# Documentation moved after __END__ for speed - NI-S

use Carp;

# True when "A" has ordinal 193, i.e. on an EBCDIC platform.
our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;     # encoding name => encoding object
our %ExtModule;    # encoding name => module name that getEncoding() requires
require Encode::Config;
eval { require Encode::ConfigLocal };    # optional; failure to load is ignored
656753f8
NIS
# encodings([":all" | MODULE, ...])
#
# Class method.  Returns the names held in %Encoding, sorted
# case-insensitively, with the entries Internal, Unicode and Guess
# filtered out.
#   ":all"  - also include every name listed in %ExtModule.
#   MODULE  - also include the %ExtModule names that map to that module;
#             "Encode::" is prepended when MODULE contains no "::".
sub encodings
{
    my $class = shift;
    my %known;
    if (@_ and $_[0] eq ":all"){
        %known = ( %Encoding, %ExtModule );
    }else{
        %known = %Encoding;
        # Qualify every requested module name up front, then scan for each.
        my @modules = map {m/::/o ? $_ : "Encode::$_" } @_;
        for my $mod (@modules){
            $DEBUG and warn $mod;
            my @provided = grep { $ExtModule{$_} eq $mod } keys %ExtModule;
            @known{@provided} = ($mod) x @provided;
        }
    }
    return
        sort { lc $a cmp lc $b }
            grep {!/^(?:Internal|Unicode|Guess)$/o} keys %known;
}
72
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object (any reference) or a name that
# is looked up with find_encoding().  Returns whatever the object's own
# perlio_ok() method returns, or 0 when the encoding cannot be found or
# the object has no such method.
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    # An unknown name yields no object; answer "no" rather than dying on
    # a method call against an undefined value.
    defined $obj or return 0;
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}
78
51ef4e11
NIS
# define_encoding(OBJECT, NAME [, ALIAS...])
#
# Stores OBJECT in %Encoding under NAME.  When NAME is not already all
# lower case, its lower-cased form is registered as an alias, followed
# by every explicit ALIAS.  Returns OBJECT.
sub define_encoding
{
    my ($obj, $name, @aliases) = @_;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    $lc eq $name or define_alias($lc => $obj);
    for my $alias (@aliases)
    {
        define_alias($alias,$obj);
    }
    return $obj;
}
93
656753f8
NIS
# getEncoding(CLASS, NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME itself, when it is a reference whose can('new_sequence')
#      is true - returned unchanged;
#   2. %Encoding under NAME, then under lc(NAME);
#   3. CLASS->find_alias for NAME, then for lc(NAME);
#   4. unless SKIP_EXTERNAL is true, the module listed in %ExtModule
#      for NAME or lc(NAME) is required and %Encoding is consulted again.
# Returns the object, or nothing (bare return) when no step succeeds.
sub getEncoding
{
    my ($class,$name,$skip_external) = @_;
    if (ref($name) && $name->can('new_sequence'))
    {
        return $name;
    }
    my $lc = lc $name;
    if (exists $Encoding{$name})
    {
        return $Encoding{$name};
    }
    if (exists $Encoding{$lc})
    {
        return $Encoding{$lc};
    }

    my $oc = $class->find_alias($name);
    return $oc if defined $oc;

    $oc = $class->find_alias($lc) if $lc ne $name;
    return $oc if defined $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            # A failed load is deliberately ignored; the lookup below
            # simply finds nothing in that case.
            eval{ require $mod; };
            # The module may have been located through the lower-cased
            # key, so retry both spellings, mirroring the first lookups.
            for my $key ($name, $lc){
                return $Encoding{$key} if exists $Encoding{$key};
            }
        }
    }
    return;
}
128
4411f3b6
NIS
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end: forwards both arguments to getEncoding(),
# invoked as a class method on this package, and returns its result.
sub find_encoding
{
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name,$skip_external);
}
134
fcb875d4
JH
# resolve_alias(NAME)
#
# Looks NAME up with find_encoding() and returns the name() of the
# encoding object found, or nothing (bare return) when there is none.
sub resolve_alias {
    my $obj = find_encoding(shift);
    return unless defined $obj;
    return $obj->name;
}
140
# encode(NAME, STRING [, CHECK])
#
# Finds the encoding for NAME (croaks when it is unknown) and returns
# the result of its encode() method applied to a private copy of STRING.
# CHECK defaults to 0.  When CHECK is true and that copy still has
# content after the call, undef is returned instead.  The caller's own
# STRING is not written to by this function.
sub encode($$;$)
{
    my $name   = $_[0];
    my $string = $_[1];
    my $check  = $_[2] || 0;
    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");
    my $octets = $enc->encode($string,$check);
    if ($check && length($string)) {
        return undef;
    }
    return $octets;
}
151
# decode(NAME, OCTETS [, CHECK])
#
# Finds the encoding for NAME (croaks when it is unknown) and returns
# the result of its decode() method applied to a private copy of OCTETS.
# CHECK defaults to 0.  When CHECK is true, the caller's OCTETS variable
# is overwritten with that copy as the method left it.
sub decode($$;$)
{
    my $name   = $_[0];
    my $octets = $_[1];
    my $check  = $_[2] || 0;
    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");
    my $string = $enc->decode($octets,$check);
    if ($check) {
        $_[1] = $octets;
    }
    return $string;
}
162
# from_to(STRING, FROM, TO [, CHECK])
#
# Converts STRING in place: it is decoded with the FROM encoding and the
# result encoded with the TO encoding.  Croaks when either encoding is
# unknown.  CHECK defaults to 0; when it is true and either step leaves
# input behind, undef is returned and the caller's STRING is left alone.
# Otherwise the caller's STRING is overwritten with the converted data
# and its length is returned (undef when the converted data is undef).
sub from_to($$$;$)
{
    my $string = $_[0];
    my $from   = $_[1];
    my $to     = $_[2];
    my $check  = $_[3] || 0;

    my $f = find_encoding($from);
    defined $f or croak("Unknown encoding '$from'");
    my $t = find_encoding($to);
    defined $t or croak("Unknown encoding '$to'");

    my $uni = $f->decode($string,$check);
    if ($check && length($string)) {
        return undef;
    }
    $string = $t->encode($uni,$check);
    if ($check && length($uni)) {
        return undef;
    }
    return undef unless defined($_[0] = $string);
    return length($string);
}
177
# encode_utf8(STRING)
#
# Returns a copy of STRING after utf8::encode() has been applied to it.
# The caller's STRING is not modified.
sub encode_utf8($)
{
    my $octets = shift;
    utf8::encode($octets);
    return $octets;
}
184
# decode_utf8(OCTETS)
#
# Applies utf8::decode() to a copy of OCTETS.  Returns that copy when
# utf8::decode() reports success and undef when it does not.  The
# caller's OCTETS is not modified.
sub decode_utf8($)
{
    my $chars = shift;
    utf8::decode($chars) or return undef;
    return $chars;
}
191
f2a2953c
JH
predefine_encodings();

#
# This is to restore %Encoding if really needed;
#
# Installs the methods of the built-in encoding classes and registers
# one object of each in %Encode::Encoding:
#   "Unicode" - Encode::UTF_EBCDIC when $ON_EBCDIC is true,
#               Encode::Internal otherwise;
#   "utf8"    - Encode::utf8.
#
sub predefine_encodings{
    # Methods whose bodies are the same in every built-in class.
    my $name_of     = sub{ shift->{'Name'} };
    my $same_object = sub{ return $_[0] };
    my $never       = sub{ 0 };
    my $has_perlio  = sub {
        eval{ require PerlIO::encoding };
        return $@ ? 0 : 1;
    };

    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        *name         = $name_of;
        *new_sequence = $same_object;
        *needs_lines  = $never;
        *perlio_ok    = $has_perlio;
        # Both directions map the string one character at a time and,
        # when $chk is true, empty the caller's source argument.
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for my $i (0 .. length($str) - 1) {
                my $ord = ord(substr($str,$i,1));
                $res .= chr(utf8::unicode_to_native($ord));
            }
            $_[1] = '' if $chk;
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for my $i (0 .. length($str) - 1) {
                my $ord = ord(substr($str,$i,1));
                $res .= chr(utf8::native_to_unicode($ord));
            }
            $_[1] = '' if $chk;
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        # was in Encode::Internal
        package Encode::Internal;
        *name         = $name_of;
        *new_sequence = $same_object;
        *needs_lines  = $never;
        *perlio_ok    = $has_perlio;
        # decode and encode are one and the same routine here: return an
        # upgraded copy and, when $chk is true, empty the caller's source.
        my $upgraded_copy = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;
            return $str;
        };
        *decode = $upgraded_copy;
        *encode = $upgraded_copy;
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        *name         = $name_of;
        *new_sequence = $same_object;
        *needs_lines  = $never;
        *perlio_ok    = $has_perlio;
        *decode = sub{
            my ($obj,$octets,$chk) = @_;
            my $str = Encode::decode_utf8($octets);
            # On failure the caller's source is left untouched.
            defined $str or return undef;
            $_[1] = '' if $chk;
            return $str;
        };
        *encode = sub {
            my ($obj,$string,$chk) = @_;
            my $octets = Encode::encode_utf8($string);
            $_[1] = '' if $chk;
            return $octets;
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
    }
}
280
656753f8
NIS
2811;
282
2a936312
NIS
283__END__
284
4411f3b6
NIS
285=head1 NAME
286
287Encode - character encodings
288
289=head1 SYNOPSIS
290
291 use Encode;
292
67d7b5ef
JH
293=head2 Table of Contents
294
0ab8f81e 295Encode consists of a collection of modules whose details are too big
67d7b5ef 296to fit in one document. This POD itself explains the top-level APIs
6d1c0808 297and general topics at a glance. For other topics and more details,
0ab8f81e 298see the PODs below:
67d7b5ef
JH
299
300 Name Description
301 --------------------------------------------------------
6d1c0808 302 Encode::Alias Alias definitions to encodings
67d7b5ef
JH
303 Encode::Encoding Encode Implementation Base Class
304 Encode::Supported List of Supported Encodings
305 Encode::CN Simplified Chinese Encodings
306 Encode::JP Japanese Encodings
307 Encode::KR Korean Encodings
308 Encode::TW Traditional Chinese Encodings
309 --------------------------------------------------------
310
4411f3b6
NIS
311=head1 DESCRIPTION
312
47bfe92f 313The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
314and the rest of the system. Perl strings are sequences of
315B<characters>.
316
317The repertoire of characters that Perl can represent is at least that
318defined by the Unicode Consortium. On most platforms the ordinal
319values of the characters (as returned by C<ord(ch)>) is the "Unicode
320codepoint" for the character (the exceptions are those platforms where
321the legacy encoding is some variant of EBCDIC rather than a super-set
322of ASCII - see L<perlebcdic>).
323
0ab8f81e 324Traditionally, computer data has been moved around in 8-bit chunks
67d7b5ef
JH
325often called "bytes". These chunks are also known as "octets" in
326networking standards. Perl is widely used to manipulate data of many
327types - not only strings of characters representing human or computer
0ab8f81e 328languages but also "binary" data being the machine's representation of
67d7b5ef
JH
329numbers, pixels in an image - or just about anything.
330
0ab8f81e 331When Perl is processing "binary data", the programmer wants Perl to
67d7b5ef 332process "sequences of bytes". This is not a problem for Perl - as a
0ab8f81e 333byte has 256 possible values, it easily fits in Perl's much larger
67d7b5ef
JH
334"logical character".
335
336=head2 TERMINOLOGY
4411f3b6 337
7e19fb92 338=over 2
21938dfa 339
67d7b5ef
JH
340=item *
341
342I<character>: a character in the range 0..(2**32-1) (or more).
343(What Perl's strings are made of.)
344
345=item *
346
347I<byte>: a character in the range 0..255
348(A special case of a Perl character.)
349
350=item *
351
352I<octet>: 8 bits of data, with ordinal values 0..255
0ab8f81e 353(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
67d7b5ef
JH
354
355=back
4411f3b6 356
67d7b5ef
JH
357The marker [INTERNAL] marks Internal Implementation Details, in
358general meant only for those who think they know what they are doing,
359and such details may change in future releases.
360
361=head1 PERL ENCODING API
4411f3b6 362
7e19fb92 363=over 2
4411f3b6 364
f2a2953c 365=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 366
0ab8f81e 367Encodes a string from Perl's internal form into I<ENCODING> and returns
67d7b5ef 368a sequence of octets. ENCODING can be either a canonical name or
0ab8f81e
JH
369an alias. For encoding names and aliases, see L</"Defining Aliases">.
370For CHECK, see L</"Handling Malformed Data">.
4411f3b6 371
0ab8f81e 372For example, to convert (internally UTF-8 encoded) Unicode string to
6d1c0808 373iso-8859-1 (also known as Latin1),
681a7c68 374
7e19fb92
JH
375 $octets = encode("iso-8859-1", $utf8);
376
377B<CAVEAT>: When you C<$octets = encode("utf8", $utf8)>, then $octets
378B<ne> $utf8. Though they both contain the same data, the utf8 flag
379for $octets is B<always> off. When you encode anything, utf8 flag of
380the result is always off, even when it contains completely valid utf8
381string. See L</"The UTF-8 flag"> below.
681a7c68 382
f2a2953c 383=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 384
0ab8f81e
JH
385Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
386internal form and returns the resulting string. As in encode(),
387ENCODING can be either a canonical name or an alias. For encoding names
388and aliases, see L</"Defining Aliases">. For CHECK, see
47bfe92f
JH
389L</"Handling Malformed Data">.
390
0ab8f81e 391For example, to convert ISO-8859-1 data to UTF-8:
681a7c68 392
67d7b5ef 393 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 394
7e19fb92
JH
B<CAVEAT>: When you C<$utf8 = decode("utf8", $octets)>, then $utf8
B<may not be equal to> $octets.  Though they both contain the same data,
the utf8 flag for $utf8 is on unless $octets entirely consists of
ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF-8 flag">
below.
47bfe92f 400
7e19fb92
JH
401=item [$length =] from_to($string, FROM_ENC, TO_ENC [, CHECK])
402
403Converts B<in-place> data between two encodings. For example, to
404convert ISO-8859-1 data to UTF-8:
2b106fbe 405
7e19fb92 406 from_to($data, "iso-8859-1", "utf8");
2b106fbe
JH
407
408and to convert it back:
409
7e19fb92 410 from_to($data, "utf8", "iso-8859-1");
4411f3b6 411
ab97ca19 412Note that because the conversion happens in place, the data to be
0ab8f81e 413converted cannot be a string constant; it must be a scalar variable.
ab97ca19 414
0ab8f81e 415from_to() returns the length of the converted string on success, undef
3ef515df
JH
416otherwise.
417
7e19fb92
JH
B<CAVEAT>: The following operations look the same but are not quite the same:
419
420 from_to($data, "iso-8859-1", "utf8"); #1
421 $data = decode("iso-8859-1", $data); #2
4411f3b6 422
7e19fb92
JH
Both #1 and #2 make $data consist of a completely valid UTF-8 string,
but only #2 turns the utf8 flag on.  #1 is equivalent to
f2a2953c 425
7e19fb92 426 $data = encode("utf8", decode("iso-8859-1", $data));
f2a2953c 427
7e19fb92 428See L</"The UTF-8 flag"> below.
f2a2953c
JH
429
430=item $octets = encode_utf8($string);
431
7e19fb92
JH
432Equivalent to C<$octets = encode("utf8", $string);> The characters
433that comprise $string are encoded in Perl's superset of UTF-8 and the
434resulting octets are returned as a sequence of bytes. All possible
435characters have a UTF-8 representation so this function cannot fail.
436
f2a2953c
JH
437
438=item $string = decode_utf8($octets [, CHECK]);
439
7e19fb92
JH
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by
$octets is decoded from UTF-8 into a sequence of logical
characters.  Not all sequences of octets form valid UTF-8 encodings, so
it is possible for this call to fail.  For CHECK, see
L</"Handling Malformed Data">.
f2a2953c
JH
446
447=back
448
51ef4e11
NIS
449=head2 Listing available encodings
450
5129552c
JH
451 use Encode;
452 @list = Encode->encodings();
453
454Returns a list of the canonical names of the available encodings that
455are loaded. To get a list of all available encodings including the
456ones that are not loaded yet, say
457
458 @all_encodings = Encode->encodings(":all");
459
0ab8f81e 460Or you can give the name of a specific module.
5129552c 461
c731e18e
JH
462 @with_jp = Encode->encodings("Encode::JP");
463
464When "::" is not in the name, "Encode::" is assumed.
51ef4e11 465
c731e18e 466 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 467
0ab8f81e 468To find out in detail which encodings are supported by this package,
5d030b67 469see L<Encode::Supported>.
51ef4e11
NIS
470
471=head2 Defining Aliases
472
0ab8f81e 473To add a new alias to a given encoding, use:
67d7b5ef 474
5129552c
JH
475 use Encode;
476 use Encode::Alias;
a63c962f 477 define_alias(newName => ENCODING);
51ef4e11 478
3ef515df 479After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
480ENCODING may be either the name of an encoding or an
481I<encoding object>
51ef4e11 482
fcb875d4
JH
483But before you do so, make sure the alias is nonexistent with
484C<resolve_alias()>, which returns the canonical name thereof.
485i.e.
486
487 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
488 Encode::resolve_alias("iso-8859-12") # false; nonexistent
489 Encode::resolve_alias($name) eq $name # true if $name is canonical
490
0ab8f81e
JH
491resolve_alias() does not need C<use Encode::Alias>; it can be
492exported via C<use Encode qw(resolve_alias)>.
fcb875d4 493
0ab8f81e 494See L<Encode::Alias> for details.
51ef4e11 495
85982a32 496=head1 Encoding via PerlIO
4411f3b6 497
0ab8f81e
JH
498If your perl supports I<PerlIO>, you can use a PerlIO layer to decode
499and encode directly via a filehandle. The following two examples
500are totally identical in their functionality.
4411f3b6 501
85982a32
JH
  # via PerlIO
  open my $in,  "<:encoding(shiftjis)", $infile  or die;
  open my $out, ">:encoding(euc-jp)",   $outfile or die;
  while(<$in>){ print $out $_; }

  # via from_to
  open my $in,  "<", $infile  or die;
  open my $out, ">", $outfile or die;
  while(<$in>){
    from_to($_, "shiftjis", "euc-jp", 1);
    print $out $_;
  }
4411f3b6 513
0ab8f81e
JH
Unfortunately, there may be encodings that are not PerlIO-savvy.  You
can check if your encoding is supported by PerlIO by calling the
C<perlio_ok> method.
517
518 Encode::perlio_ok("hz"); # False
519 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
520
521 use Encode qw(perlio_ok); # exported upon request
522 perlio_ok("euc-jp")
4411f3b6 523
0ab8f81e
JH
524Fortunately, all encodings that come with Encode core are PerlIO-savvy
525except for hz and ISO-2022-kr. See L<Encode::Encoding> for details.
4411f3b6 526
0ab8f81e 527For gory details, see L<Encode::PerlIO>.
4411f3b6 528
85982a32 529=head1 Handling Malformed Data
4411f3b6 530
7e19fb92 531=over 2
47bfe92f 532
0ab8f81e
JH
533The I<CHECK> argument is used as follows. When you omit it,
534the behaviour is the same as if you had passed a value of 0 for
535I<CHECK>.
47bfe92f 536
85982a32 537=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
47bfe92f 538
0ab8f81e
JH
539If I<CHECK> is 0, (en|de)code will put a I<substitution character>
540in place of a malformed character. For UCM-based encodings,
541E<lt>subcharE<gt> will be used. For Unicode, "\x{FFFD}" is used.
542If the data is supposed to be UTF-8, an optional lexical warning
543(category utf8) is given.
e9692b5b 544
7e19fb92 545=item I<CHECK> = Encode::FB_CROAK ( == 1)
e9692b5b 546
0ab8f81e
JH
547If I<CHECK> is 1, methods will die immediately with an error
548message. Therefore, when I<CHECK> is set to 1, you should trap the
549fatal error with eval{} unless you really want to let it die on error.
47bfe92f 550
85982a32 551=item I<CHECK> = Encode::FB_QUIET
47bfe92f 552
85982a32 553If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
0ab8f81e
JH
554return the portion of the data that has been processed so far when
555an error occurs. The data argument will be overwritten with
556everything after that point (that is, the unprocessed part of data).
557This is handy when you have to call decode repeatedly in the case
558where your source data may contain partial multi-byte character
559sequences, for example because you are reading with a fixed-width
560buffer. Here is some sample code that does exactly this:
4411f3b6 561
85982a32
JH
  my $data = '';
  while(read $fh, $buffer, 256){
    # buffer may end in a partial character so we append
    $data .= $buffer;
    $utf8 .= decode($encoding, $data, Encode::FB_QUIET);
    # $data now contains the unprocessed partial character
  }
1768d7eb 569
85982a32 570=item I<CHECK> = Encode::FB_WARN
67d7b5ef 571
0ab8f81e
JH
572This is the same as above, except that it warns on error. Handy when
573you are debugging the mode above.
85982a32
JH
574
575=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
576
af1f55d9
JH
577=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
578
579=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
580
85982a32
JH
581For encodings that are implemented by Encode::XS, CHECK ==
582Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
583
0ab8f81e
JH
584When you decode, '\xI<XX>' will be inserted for a malformed character,
585where I<XX> is the hex representation of the octet that could not be
586decoded to utf8. And when you encode, '\x{I<xxxx>}' will be inserted,
587where I<xxxx> is the Unicode ID of the character that cannot be found
588in the character repertoire of the encoding.
85982a32 589
af1f55d9
JH
HTML/XML character reference modes are about the same; in place of
\x{I<xxxx>}, HTML uses &#I<1234>; where I<1234> is a decimal number and
XML uses &#xI<abcd>; where I<abcd> is a hexadecimal number.
593
85982a32
JH
594=item The bitmask
595
0ab8f81e
JH
596These modes are actually set via a bitmask. Here is how the FB_XX
597constants are laid out. You can import the FB_XX constants via
598C<use Encode qw(:fallbacks)>; you can import the generic bitmask
599constants via C<use Encode qw(:fallback_all)>.
85982a32 600
b0b300a3
JH
                       FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400
67d7b5ef 609
0ab8f81e 610=head2 Unimplemented fallback schemes
67d7b5ef 611
0ab8f81e 612In the future, you will be able to use a code reference to a callback
f2a2953c 613function for the value of I<CHECK> but its API is still undecided.
67d7b5ef
JH
614
615=head1 Defining Encodings
616
617To define a new encoding, use:
618
    use Encode qw(define_encoding);
620 define_encoding($object, 'canonicalName' [, alias...]);
621
622I<canonicalName> will be associated with I<$object>. The object
0ab8f81e 623should provide the interface described in L<Encode::Encoding>.
67d7b5ef 624If more than two arguments are provided then additional
0ab8f81e 625arguments are taken as aliases for I<$object>, as for C<define_alias>.
67d7b5ef 626
f2a2953c
JH
627See L<Encode::Encoding> for more details.
628
7e19fb92
JH
629=head1 The UTF-8 flag
630
Before the introduction of utf8 support in perl, the C<eq> operator
just compared the internal data of the scalars.  Now C<eq> means internal
data equality AND I<the utf8 flag>.  To explain why we made it so, I
will quote page 402 of C<Programming Perl, 3rd ed.>
635
636=over 2
637
638=item Goal #1:
639
640Old byte-oriented programs should not spontaneously break on the old
641byte-oriented data they used to work on.
642
643=item Goal #2:
644
645Old byte-oriented programs should magically start working on the new
646character-oriented data when appropriate.
647
648=item Goal #3:
649
650Programs should run just as fast in the new character-oriented mode
651as in the old byte-oriented mode.
652
653=item Goal #4:
654
655Perl should remain one language, rather than forking into a
656byte-oriented Perl and a character-oriented Perl.
657
658=back
659
Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
had been born, and many features documented in the book remained
unimplemented.  Perl 5.8 hopefully corrects this, and the introduction
of the UTF-8 flag is one of those corrections.  You can think of this as
perl's notion of byte-oriented mode (utf8 flag off) and
character-oriented mode (utf8 flag on).
666
667Here is how Encode takes care of the utf8 flag.
668
4bdf5738 669=over 2
7e19fb92
JH
670
671=item *
672
673When you encode, the resulting utf8 flag is always off.
674
=item *

When you decode, the resulting utf8 flag is on unless you can
unambiguously represent data.  Here is the definition of
dis-ambiguity.
680
681 After C<$utf8 = decode('foo', $octet);>,
682
683 When $octet is... The utf8 flag in $utf8 is
684 ---------------------------------------------
685 In ASCII only (or EBCDIC only) OFF
686 In ISO-8859-1 ON
687 In any other Encoding ON
688 ---------------------------------------------
689
As you see, there is one exception: ASCII.  That way you can assure
Goal #1.  And with Encode, Goal #2 is assumed, but you still have to be
careful in the cases mentioned in the B<CAVEAT> paragraphs.
693
694This utf8 flag is not visible in perl scripts, exactly for the same
695reason you cannot (or you I<don't have to>) see if a scalar contains a
696string, integer, or floating point number. But you can still peek
697and poke these if you will. See the section below.
698
699=back
700
701=head2 Messing with Perl's Internals
4411f3b6 702
47bfe92f 703The following API uses parts of Perl's internals in the current
0ab8f81e 704implementation. As such, they are efficient but may change.
4411f3b6 705
7e19fb92 706=over 2
4411f3b6 707
a63c962f 708=item is_utf8(STRING [, CHECK])
4411f3b6 709
0ab8f81e 710[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
47bfe92f
JH
711If CHECK is true, also checks the data in STRING for being well-formed
712UTF-8. Returns true if successful, false otherwise.
4411f3b6 713
a63c962f 714=item _utf8_on(STRING)
4411f3b6 715
0ab8f81e 716[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
4411f3b6
NIS
717B<not> checked for being well-formed UTF-8. Do not use unless you
718B<know> that the STRING is well-formed UTF-8. Returns the previous
0ab8f81e
JH
719state of the UTF-8 flag (so please don't treat the return value as
720indicating success or failure), or C<undef> if STRING is not a string.
4411f3b6 721
a63c962f 722=item _utf8_off(STRING)
4411f3b6 723
0ab8f81e
JH
724[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
725Returns the previous state of the UTF-8 flag (so please don't treat the
726return value as indicating success or failure), or C<undef> if STRING is
4411f3b6
NIS
727not a string.
728
729=back
730
731=head1 SEE ALSO
732
5d030b67
JH
733L<Encode::Encoding>,
734L<Encode::Supported>,
6d1c0808 735L<Encode::PerlIO>,
5d030b67 736L<encoding>,
6d1c0808
JH
737L<perlebcdic>,
738L<perlfunc/open>,
739L<perlunicode>,
740L<utf8>,
5d030b67 741the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6 742
85982a32 743=head1 MAINTAINER
aae85ceb
DK
744
745This project was originated by Nick Ing-Simmons and later maintained
7e19fb92
JH
746by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
747list of people involved. For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so we can all share.
aae85ceb 749
4411f3b6 750=cut