This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Encode 1.33, from Dan Kogai.
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
448e90bb 3our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5129552c 4our $DEBUG = 0;
2c674647
JH
5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6 13 decode
4411f3b6 14 decode_utf8
fcb875d4
JH
15 encode
16 encode_utf8
51ef4e11 17 encodings
fcb875d4 18 find_encoding
4411f3b6
NIS
19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
fcb875d4
JH
23 _utf8_off
24 _utf8_on
51ef4e11 25 define_encoding
2c674647 26 from_to
4411f3b6 27 is_16bit
fcb875d4
JH
28 is_8bit
29 is_utf8
30 resolve_alias
a12c0f56 31 utf8_downgrade
fcb875d4 32 utf8_upgrade
2c674647
JH
33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d
NIS
39use Carp;
40
a63c962f 41our $ON_EBCDIC = (ord("A") == 193);
f2a2953c 42
5d030b67
JH
43use Encode::Alias;
44
5129552c
JH
45# Make a %Encoding package variable to allow a certain amount of cheating
46our %Encoding;
fdd579e2 47use Encode::Config;
5129552c 48
656753f8
NIS
49sub encodings
50{
5129552c 51 my $class = shift;
071db25d 52 my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_;
c731e18e
JH
53 for my $mod (@modules){
54 $mod =~ s,::,/,g or $mod = "Encode/$mod";
55 $mod .= '.pm';
56 $DEBUG and warn "about to require $mod;";
57 eval { require $mod; };
5129552c 58 }
c731e18e 59 my %modules = map {$_ => 1} @modules;
5129552c 60 return
ce912cd4
JH
61 sort { lc $a cmp lc $b }
62 grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
51ef4e11
NIS
63}
64
51ef4e11
NIS
65sub define_encoding
66{
18586f54
NIS
67 my $obj = shift;
68 my $name = shift;
5129552c 69 $Encoding{$name} = $obj;
18586f54
NIS
70 my $lc = lc($name);
71 define_alias($lc => $obj) unless $lc eq $name;
72 while (@_)
73 {
74 my $alias = shift;
75 define_alias($alias,$obj);
76 }
77 return $obj;
656753f8
NIS
78}
79
656753f8
NIS
80sub getEncoding
81{
dd9703c9 82 my ($class,$name,$skip_external) = @_;
18586f54
NIS
83 my $enc;
84 if (ref($name) && $name->can('new_sequence'))
85 {
86 return $name;
87 }
88 my $lc = lc $name;
5129552c 89 if (exists $Encoding{$name})
18586f54 90 {
5129552c 91 return $Encoding{$name};
18586f54 92 }
5129552c 93 if (exists $Encoding{$lc})
18586f54 94 {
5129552c 95 return $Encoding{$lc};
18586f54 96 }
c50d192e 97
5129552c 98 my $oc = $class->find_alias($name);
c50d192e
AT
99 return $oc if defined $oc;
100
5129552c 101 $oc = $class->find_alias($lc) if $lc ne $name;
c50d192e
AT
102 return $oc if defined $oc;
103
c731e18e 104 unless ($skip_external)
d1ed7747 105 {
c731e18e
JH
106 if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
107 $mod =~ s,::,/,g ; $mod .= '.pm';
108 eval{ require $mod; };
109 return $Encoding{$name} if exists $Encoding{$name};
110 }
d1ed7747 111 }
18586f54 112 return;
656753f8
NIS
113}
114
4411f3b6
NIS
115sub find_encoding
116{
dd9703c9
AT
117 my ($name,$skip_external) = @_;
118 return __PACKAGE__->getEncoding($name,$skip_external);
4411f3b6
NIS
119}
120
fcb875d4
JH
121sub resolve_alias {
122 my $obj = find_encoding(shift);
123 defined $obj and return $obj->name;
124 return;
125}
126
4411f3b6
NIS
127sub encode
128{
18586f54
NIS
129 my ($name,$string,$check) = @_;
130 my $enc = find_encoding($name);
131 croak("Unknown encoding '$name'") unless defined $enc;
132 my $octets = $enc->encode($string,$check);
133 return undef if ($check && length($string));
134 return $octets;
4411f3b6
NIS
135}
136
137sub decode
138{
18586f54
NIS
139 my ($name,$octets,$check) = @_;
140 my $enc = find_encoding($name);
141 croak("Unknown encoding '$name'") unless defined $enc;
142 my $string = $enc->decode($octets,$check);
143 $_[1] = $octets if $check;
144 return $string;
4411f3b6
NIS
145}
146
147sub from_to
148{
18586f54
NIS
149 my ($string,$from,$to,$check) = @_;
150 my $f = find_encoding($from);
151 croak("Unknown encoding '$from'") unless defined $f;
152 my $t = find_encoding($to);
153 croak("Unknown encoding '$to'") unless defined $t;
154 my $uni = $f->decode($string,$check);
155 return undef if ($check && length($string));
a999c27c 156 $string = $t->encode($uni,$check);
18586f54 157 return undef if ($check && length($uni));
3ef515df 158 return defined($_[0] = $string) ? length($string) : undef ;
4411f3b6
NIS
159}
160
161sub encode_utf8
162{
18586f54 163 my ($str) = @_;
c731e18e 164 utf8::encode($str);
18586f54 165 return $str;
4411f3b6
NIS
166}
167
168sub decode_utf8
169{
18586f54
NIS
170 my ($str) = @_;
171 return undef unless utf8::decode($str);
172 return $str;
5ad8ef52
NIS
173}
174
f2a2953c
JH
175predefine_encodings();
176
177#
178# This is to restore %Encoding if really needed;
179#
180sub predefine_encodings{
181 if ($ON_EBCDIC) {
182 # was in Encode::UTF_EBCDIC
183 package Encode::UTF_EBCDIC;
184 *name = sub{ shift->{'Name'} };
185 *new_sequence = sub{ return $_[0] };
186 *decode = sub{
187 my ($obj,$str,$chk) = @_;
188 my $res = '';
189 for (my $i = 0; $i < length($str); $i++) {
190 $res .=
191 chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
192 }
193 $_[1] = '' if $chk;
194 return $res;
195 };
196 *encode = sub{
197 my ($obj,$str,$chk) = @_;
198 my $res = '';
199 for (my $i = 0; $i < length($str); $i++) {
200 $res .=
201 chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
202 }
203 $_[1] = '' if $chk;
204 return $res;
205 };
c731e18e
JH
206 $Encode::Encoding{Internal} =
207 bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
f2a2953c
JH
208 } else {
209 # was in Encode::UTF_EBCDIC
210 package Encode::Internal;
211 *name = sub{ shift->{'Name'} };
212 *new_sequence = sub{ return $_[0] };
213 *decode = sub{
214 my ($obj,$str,$chk) = @_;
215 utf8::upgrade($str);
216 $_[1] = '' if $chk;
217 return $str;
218 };
219 *encode = \&decode;
220 $Encode::Encoding{Unicode} =
c731e18e 221 bless {Name => "Internal"} => "Encode::Internal";
f2a2953c
JH
222 }
223
224 {
225 # was in Encode::utf8
226 package Encode::utf8;
227 *name = sub{ shift->{'Name'} };
228 *new_sequence = sub{ return $_[0] };
229 *decode = sub{
230 my ($obj,$octets,$chk) = @_;
231 my $str = Encode::decode_utf8($octets);
232 if (defined $str) {
233 $_[1] = '' if $chk;
234 return $str;
235 }
236 return undef;
237 };
238 *encode = sub {
239 my ($obj,$string,$chk) = @_;
240 my $octets = Encode::encode_utf8($string);
241 $_[1] = '' if $chk;
242 return $octets;
243 };
244 $Encode::Encoding{utf8} =
c731e18e 245 bless {Name => "utf8"} => "Encode::utf8";
f2a2953c
JH
246 }
247 # do externals if necessary
248 require File::Basename;
249 require File::Spec;
c731e18e 250 for my $ext (qw()){
f2a2953c
JH
251 my $pm =
252 File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}),
253 "Encode", "$ext.pm");
254 do $pm;
255 }
256}
257
18586f54
NIS
258require Encode::Encoding;
259require Encode::XS;
4411f3b6 260
656753f8
NIS
2611;
262
2a936312
NIS
263__END__
264
4411f3b6
NIS
265=head1 NAME
266
267Encode - character encodings
268
269=head1 SYNOPSIS
270
271 use Encode;
272
67d7b5ef
JH
273
274=head2 Table of Contents
275
276Encode consists of a collection of modules which details are too big
277to fit in one document. This POD itself explains the top-level APIs
278and general topics at a glance. For other topics and more details,
279see the PODs below;
280
281 Name Description
282 --------------------------------------------------------
283 Encode::Alias Alias defintions to encodings
284 Encode::Encoding Encode Implementation Base Class
285 Encode::Supported List of Supported Encodings
286 Encode::CN Simplified Chinese Encodings
287 Encode::JP Japanese Encodings
288 Encode::KR Korean Encodings
289 Encode::TW Traditional Chinese Encodings
290 --------------------------------------------------------
291
4411f3b6
NIS
292=head1 DESCRIPTION
293
47bfe92f 294The C<Encode> module provides the interfaces between Perl's strings
67d7b5ef
JH
295and the rest of the system. Perl strings are sequences of
296B<characters>.
297
298The repertoire of characters that Perl can represent is at least that
299defined by the Unicode Consortium. On most platforms the ordinal
300values of the characters (as returned by C<ord(ch)>) is the "Unicode
301codepoint" for the character (the exceptions are those platforms where
302the legacy encoding is some variant of EBCDIC rather than a super-set
303of ASCII - see L<perlebcdic>).
304
305Traditionally computer data has been moved around in 8-bit chunks
306often called "bytes". These chunks are also known as "octets" in
307networking standards. Perl is widely used to manipulate data of many
308types - not only strings of characters representing human or computer
309languages but also "binary" data being the machines representation of
310numbers, pixels in an image - or just about anything.
311
312When Perl is processing "binary data" the programmer wants Perl to
313process "sequences of bytes". This is not a problem for Perl - as a
314byte has 256 possible values it easily fits in Perl's much larger
315"logical character".
316
317=head2 TERMINOLOGY
4411f3b6 318
67d7b5ef 319=over 4
21938dfa 320
67d7b5ef
JH
321=item *
322
323I<character>: a character in the range 0..(2**32-1) (or more).
324(What Perl's strings are made of.)
325
326=item *
327
328I<byte>: a character in the range 0..255
329(A special case of a Perl character.)
330
331=item *
332
333I<octet>: 8 bits of data, with ordinal values 0..255
334(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
335
336=back
4411f3b6 337
67d7b5ef
JH
338The marker [INTERNAL] marks Internal Implementation Details, in
339general meant only for those who think they know what they are doing,
340and such details may change in future releases.
341
342=head1 PERL ENCODING API
4411f3b6
NIS
343
344=over 4
345
f2a2953c 346=item $octets = encode(ENCODING, $string[, CHECK])
4411f3b6 347
47bfe92f 348Encodes string from Perl's internal form into I<ENCODING> and returns
67d7b5ef
JH
349a sequence of octets. ENCODING can be either a canonical name or
350alias. For encoding names and aliases, see L</"Defining Aliases">.
351For CHECK see L</"Handling Malformed Data">.
4411f3b6 352
67d7b5ef
JH
353For example to convert (internally UTF-8 encoded) Unicode string to
354iso-8859-1 (also known as Latin1),
681a7c68 355
67d7b5ef 356 $octets = encode("iso-8859-1", $unicode);
681a7c68 357
f2a2953c 358=item $string = decode(ENCODING, $octets[, CHECK])
4411f3b6 359
47bfe92f 360Decode sequence of octets assumed to be in I<ENCODING> into Perl's
67d7b5ef
JH
361internal form and returns the resulting string. as in encode(),
362ENCODING can be either a canonical name or alias. For encoding names
363and aliases, see L</"Defining Aliases">. For CHECK see
47bfe92f
JH
364L</"Handling Malformed Data">.
365
1b2c56c8 366For example to convert ISO-8859-1 data to UTF-8:
681a7c68 367
67d7b5ef 368 $utf8 = decode("iso-8859-1", $latin1);
681a7c68 369
f2a2953c 370=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
47bfe92f 371
2b106fbe
JH
372Convert B<in-place> the data between two encodings. How did the data
373in $string originally get to be in FROM_ENCODING? Either using
67d7b5ef
JH
374encode() or through PerlIO: See L</"Encoding and IO">.
375For encoding names and aliases, see L</"Defining Aliases">.
376For CHECK see L</"Handling Malformed Data">.
2b106fbe 377
1b2c56c8 378For example to convert ISO-8859-1 data to UTF-8:
2b106fbe
JH
379
380 from_to($data, "iso-8859-1", "utf-8");
381
382and to convert it back:
383
384 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 385
ab97ca19
JH
386Note that because the conversion happens in place, the data to be
387converted cannot be a string constant, it must be a scalar variable.
388
3ef515df
JH
389from_to() return the length of the converted string on success, undef
390otherwise.
391
4411f3b6
NIS
392=back
393
f2a2953c
JH
394=head2 UTF-8 / utf8
395
396The Unicode consortium defines the UTF-8 standard as a way of encoding
397the entire Unicode repertoire as sequences of octets. This encoding is
398expected to become very widespread. Perl can use this form internally
399to represent strings, so conversions to and from this form are
400particularly efficient (as octets in memory do not have to change,
401just the meta-data that tells Perl how to treat them).
402
403=over 4
404
405=item $octets = encode_utf8($string);
406
407The characters that comprise string are encoded in Perl's superset of UTF-8
408and the resulting octets returned as a sequence of bytes. All possible
409characters have a UTF-8 representation so this function cannot fail.
410
411=item $string = decode_utf8($octets [, CHECK]);
412
413The sequence of octets represented by $octets is decoded from UTF-8
414into a sequence of logical characters. Not all sequences of octets
415form valid UTF-8 encodings, so it is possible for this call to fail.
416For CHECK see L</"Handling Malformed Data">.
417
418=back
419
51ef4e11
NIS
420=head2 Listing available encodings
421
5129552c
JH
422 use Encode;
423 @list = Encode->encodings();
424
425Returns a list of the canonical names of the available encodings that
426are loaded. To get a list of all available encodings including the
427ones that are not loaded yet, say
428
429 @all_encodings = Encode->encodings(":all");
430
431Or you can give the name of specific module.
432
c731e18e
JH
433 @with_jp = Encode->encodings("Encode::JP");
434
435When "::" is not in the name, "Encode::" is assumed.
51ef4e11 436
c731e18e 437 @ebcdic = Encode->encodings("EBCDIC");
5d030b67 438
a63c962f 439To find which encodings are supported by this package in details,
5d030b67 440see L<Encode::Supported>.
51ef4e11
NIS
441
442=head2 Defining Aliases
443
67d7b5ef
JH
444To add new alias to a given encoding, Use;
445
5129552c
JH
446 use Encode;
447 use Encode::Alias;
a63c962f 448 define_alias(newName => ENCODING);
51ef4e11 449
3ef515df 450After that, newName can be used as an alias for ENCODING.
f2a2953c
JH
451ENCODING may be either the name of an encoding or an
452I<encoding object>
51ef4e11 453
fcb875d4
JH
454But before you do so, make sure the alias is nonexistent with
455C<resolve_alias()>, which returns the canonical name thereof.
456i.e.
457
458 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
459 Encode::resolve_alias("iso-8859-12") # false; nonexistent
460 Encode::resolve_alias($name) eq $name # true if $name is canonical
461
462This resolve_alias() does not need C<use Encode::Alias> and is
463exported via C<use encode qw(resolve_alias)>.
464
5d030b67 465See L<Encode::Alias> on details.
51ef4e11 466
4411f3b6
NIS
467=head1 Encoding and IO
468
469It is very common to want to do encoding transformations when
470reading or writing files, network connections, pipes etc.
47bfe92f 471If Perl is configured to use the new 'perlio' IO system then
4411f3b6
NIS
472C<Encode> provides a "layer" (See L<perliol>) which can transform
473data as it is read or written.
474
8e86646e
JH
475Here is how the blind poet would modernise the encoding:
476
42234700 477 use Encode;
8e86646e
JH
478 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
479 open(my $utf8,'>:utf8','iliad.utf8');
480 my @epic = <$iliad>;
481 print $utf8 @epic;
482 close($utf8);
483 close($illiad);
4411f3b6
NIS
484
485In addition the new IO system can also be configured to read/write
486UTF-8 encoded characters (as noted above this is efficient):
487
e9692b5b
JH
488 open(my $fh,'>:utf8','anything');
489 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6
NIS
490
491Either of the above forms of "layer" specifications can be made the default
492for a lexical scope with the C<use open ...> pragma. See L<open>.
493
494Once a handle is open is layers can be altered using C<binmode>.
495
47bfe92f 496Without any such configuration, or if Perl itself is built using
4411f3b6
NIS
497system's own IO, then write operations assume that file handle accepts
498only I<bytes> and will C<die> if a character larger than 255 is
499written to the handle. When reading, each octet from the handle
500becomes a byte-in-a-character. Note that this default is the same
47bfe92f
JH
501behaviour as bytes-only languages (including Perl before v5.6) would
502have, and is sufficient to handle native 8-bit encodings
503e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
504other encodings and binary data.
505
506In other cases it is the programs responsibility to transform
507characters into bytes using the API above before doing writes, and to
508transform the bytes read from a handle into characters before doing
509"character operations" (e.g. C<lc>, C</\W+/>, ...).
510
47bfe92f 511You can also use PerlIO to convert larger amounts of data you don't
1b2c56c8 512want to bring into memory. For example to convert between ISO-8859-1
47bfe92f
JH
513(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
514
e9692b5b
JH
515 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
516 open(G, ">:utf8", "data.utf") or die $!;
517 while (<F>) { print G }
518
519 # Could also do "print G <F>" but that would pull
520 # the whole file into memory just to write it out again.
521
522More examples:
47bfe92f 523
e9692b5b
JH
524 open(my $f, "<:encoding(cp1252)")
525 open(my $g, ">:encoding(iso-8859-2)")
526 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f
JH
527
528See L<PerlIO> for more information.
4411f3b6 529
1768d7eb 530See also L<encoding> for how to change the default encoding of the
d521382b 531data in your script.
1768d7eb 532
67d7b5ef
JH
533=head1 Handling Malformed Data
534
f2a2953c
JH
535If I<CHECK> is not set, (en|de)code will put I<substitution character> in
536place of the malformed character. for UCM-based encodings,
537E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
538data is supposed to be UTF-8, an optional lexical warning (category
539utf8) is given.
67d7b5ef 540
f2a2953c 541If I<CHECK> is true but not a code reference, dies with an error message.
67d7b5ef 542
f2a2953c
JH
543In future you will be able to use a code reference to a callback
544function for the value of I<CHECK> but its API is still undecided.
67d7b5ef
JH
545
546=head1 Defining Encodings
547
548To define a new encoding, use:
549
550 use Encode qw(define_alias);
551 define_encoding($object, 'canonicalName' [, alias...]);
552
553I<canonicalName> will be associated with I<$object>. The object
554should provide the interface described in L<Encode::Encoding>
555If more than two arguments are provided then additional
556arguments are taken as aliases for I<$object> as for C<define_alias>.
557
f2a2953c
JH
558See L<Encode::Encoding> for more details.
559
4411f3b6
NIS
560=head1 Messing with Perl's Internals
561
47bfe92f
JH
562The following API uses parts of Perl's internals in the current
563implementation. As such they are efficient, but may change.
4411f3b6
NIS
564
565=over 4
566
a63c962f 567=item is_utf8(STRING [, CHECK])
4411f3b6
NIS
568
569[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f
JH
570If CHECK is true, also checks the data in STRING for being well-formed
571UTF-8. Returns true if successful, false otherwise.
4411f3b6 572
a63c962f 573=item _utf8_on(STRING)
4411f3b6
NIS
574
575[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
576B<not> checked for being well-formed UTF-8. Do not use unless you
577B<know> that the STRING is well-formed UTF-8. Returns the previous
578state of the UTF-8 flag (so please don't test the return value as
579I<not> success or failure), or C<undef> if STRING is not a string.
580
a63c962f 581=item _utf8_off(STRING)
4411f3b6
NIS
582
583[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
584Returns the previous state of the UTF-8 flag (so please don't test the
585return value as I<not> success or failure), or C<undef> if STRING is
586not a string.
587
588=back
589
590=head1 SEE ALSO
591
5d030b67
JH
592L<Encode::Encoding>,
593L<Encode::Supported>,
594L<PerlIO>,
595L<encoding>,
596L<perlebcdic>,
597L<perlfunc/open>,
598L<perlunicode>,
599L<utf8>,
600the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
4411f3b6
NIS
601
602=cut