8 @ISA = qw(Exporter DynaLoader);
10 # Public, encouraged API is exported by default
34 # Documentation moved after __END__ for speed - NI-S
38 # The global hash is declared in XS code
39 $encoding{Unicode} = bless({},'Encode::Unicode');
40 $encoding{utf8} = bless({},'Encode::utf8');
41 $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
46 foreach my $dir (@INC)
48 if (opendir(my $dh,"$dir/Encode"))
50 while (defined(my $name = readdir($dh)))
52 if ($name =~ /^(.*)\.enc$/)
54 next if exists $encoding{$1};
55 $encoding{$1} = "$dir/$name";
61 return keys %encoding;
66 my ($class,$name,$file) = @_;
67 if (open(my $fh,$file))
73 $type = substr($line,0,1);
74 last unless $type eq '#';
76 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
77 #warn "Loading $file";
78 return $class->read($fh,$name,$type);
88 my ($class,$name) = @_;
90 unless (ref($enc = $encoding{$name}))
92 $enc = $class->loadEncoding($name,$enc) if defined $enc;
95 foreach my $dir (@INC)
97 last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
100 $encoding{$name} = $enc;
108 return __PACKAGE__->getEncoding($name);
113 my ($name,$string,$check) = @_;
114 my $enc = find_encoding($name);
115 croak("Unknown encoding '$name'") unless defined $enc;
116 my $octets = $enc->fromUnicode($string,$check);
117 return undef if ($check && length($string));
123 my ($name,$octets,$check) = @_;
124 my $enc = find_encoding($name);
125 croak("Unknown encoding '$name'") unless defined $enc;
126 my $string = $enc->toUnicode($octets,$check);
127 return undef if ($check && length($octets));
133 my ($string,$from,$to,$check) = @_;
134 my $f = find_encoding($from);
135 croak("Unknown encoding '$from'") unless defined $f;
136 my $t = find_encoding($to);
137 croak("Unknown encoding '$to'") unless defined $t;
138 my $uni = $f->toUnicode($string,$check);
139 return undef if ($check && length($string));
140 $string = $t->fromUnicode($uni,$check);
141 return undef if ($check && length($uni));
142 return length($_[0] = $string);
155 return undef unless utf8_decode($str);
159 package Encode::Unicode;
161 # Dummy package that provides the encode interface but leaves data
162 # as UTF-8 encoded. It is here so that from_to() works.
164 sub name { 'Unicode' } # Returns this encoding's name: the literal string 'Unicode'.
168 my ($obj,$str,$chk) = @_;
169 Encode::utf8_upgrade($str);
174 *fromUnicode = \&toUnicode;
176 package Encode::utf8;
178 # package to allow long-hand
179 # $octets = encode( utf8 => $string );
186 my ($obj,$octets,$chk) = @_;
187 my $str = Encode::decode_utf8($octets);
198 my ($obj,$string,$chk) = @_;
199 my $octets = Encode::encode_utf8($string);
204 package Encode::Table;
208 my ($class,$fh,$name,$type) = @_;
209 my $rep = $class->can("rep_$type");
210 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
219 my $page = hex($line);
221 my $ch = $page * 256;
222 for (my $i = 0; $i < 16; $i++)
225 for (my $j = 0; $j < 16; $j++)
227 my $val = hex(substr($line,0,4,''));
242 $touni[$page] = \@page;
245 return bless {Name => $name,
254 sub name { shift->{'Name'} } # Returns the 'Name' entry stored in the invocant's hash.
260 sub rep_M { ($_[0] > 255) ? 'n' : 'C' } # pack() template for a code: 'n' (16-bit big-endian) above 255, else 'C' (one octet).
265 $ch = 0 unless @_ > 1;
271 my ($obj,$str,$chk) = @_;
272 my $rep = $obj->{'Rep'};
273 my $touni = $obj->{'ToUni'};
277 my $ch = ord(substr($str,0,1,''));
279 if (&$rep($ch) eq 'C')
281 $x = $touni->[0][$ch];
285 $x = $touni->[$ch][ord(substr($str,0,1,''))];
290 # What do we do here ?
295 $_[1] = $str if $chk;
301 my ($obj,$uni,$chk) = @_;
302 my $fmuni = $obj->{'FmUni'};
304 my $def = $obj->{'Def'};
305 my $rep = $obj->{'Rep'};
308 my $ch = substr($uni,0,1,'');
309 my $x = $fmuni->{chr(ord($ch))};
315 $str .= pack(&$rep($x),$x);
317 $_[1] = $uni if $chk;
321 package Encode::iso10646_1;
322 # Encoding is 16-bit network order Unicode
323 # Used for X font encodings
325 sub name { 'iso10646-1' } # Returns this encoding's name: the literal string 'iso10646-1'.
329 my ($obj,$str,$chk) = @_;
333 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
336 $_[1] = $str if $chk;
337 Encode::utf8_upgrade($uni);
343 my ($obj,$uni,$chk) = @_;
347 my $ch = substr($uni,0,1,'');
354 $str .= pack('n',$x);
356 $_[1] = $uni if $chk;
361 package Encode::Escape;
366 my ($class,$fh,$name) = @_;
367 my %self = (Name => $name, Num => 0);
370 my ($key,$val) = /^(\S+)\s+(.*)$/;
371 $val =~ s/^\{(.*?)\}/$1/g;
372 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
375 return bless \%self,$class;
378 sub name { shift->{'Name'} } # Returns the 'Name' entry stored in the invocant's hash.
382 croak("Not implemented yet");
387 croak("Not implemented yet");
390 # switch back to Encode package in case we ever add AutoLoader
399 Encode - character encodings
407 The C<Encode> module provides the interfaces between perl's strings
408 and the rest of the system. Perl strings are sequences of B<characters>.
410 The repertoire of characters that Perl can represent is at least that
411 defined by the Unicode Consortium. On most platforms the ordinal values
412 of the characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for
413 the character (the exceptions are those platforms where the legacy
414 encoding is some variant of EBCDIC rather than a super-set of ASCII
415 - see L<perlebcdic>).
417 Traditionally computer data has been moved around in 8-bit chunks
418 often called "bytes". These chunks are also known as "octets" in
419 networking standards. Perl is widely used to manipulate data of
420 many types - not only strings of characters representing human or
421 computer languages but also "binary" data being the machine's representation
422 of numbers, pixels in an image - or just about anything.
424 When perl is processing "binary data" the programmer wants perl to process
425 "sequences of bytes". This is not a problem for perl - as a byte has 256
426 possible values it easily fits in perl's much larger "logical character".
434 I<character>: a character in the range 0..(2**32-1) (or more).
435 (What perl's strings are made of.)
439 I<byte>: a character in the range 0..255
440 (A special case of a perl character.)
444 I<octet>: 8 bits of data, with ordinal values 0..255
445 (Term for bytes passed to or from a non-perl context, e.g. disk file.)
449 The marker [INTERNAL] marks Internal Implementation Details, in
450 general meant only for those who think they know what they are doing,
451 and such details may change in future releases.
455 =head2 Characteristics of an Encoding
457 An encoding has a "repertoire" of characters that it can represent,
458 and for each representable character there is at least one sequence of
459 octets that represents it.
461 =head2 Types of Encodings
463 Encodings can be divided into the following types:
467 =item * Fixed length 8-bit (or less) encodings.
469 Each character is a single octet so may have a repertoire of up to
470 256 characters. ASCII and iso-8859-* are typical examples.
472 =item * Fixed length 16-bit encodings
474 Each character is two octets so may have a repertoire of up to
475 65,536 characters. Unicode's UCS-2 is an example. Also used for
476 encodings for East Asian languages.
478 =item * Fixed length 32-bit encodings.
480 Not really very "encoded" encodings. The Unicode code points
481 are just represented as 4-octet integers. None the less because
482 different architectures use different representations of integers
483 (so called "endian") there are at least two distinct encodings.
485 =item * Multi-byte encodings
487 The number of octets needed to represent a character varies.
488 UTF-8 is a particularly complex but regular case of a multi-byte
489 encoding. Several East Asian countries use a multi-byte encoding
490 where 1-octet is used to cover western roman characters and Asian
491 characters get 2-octets.
492 (UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
493 to represent a Unicode code point.)
495 =item * "Escape" encodings.
497 These encodings embed "escape sequences" into the octet sequence
498 which describe how the following octets are to be interpreted.
499 The iso-2022-* family is typical. Following the escape sequence
500 octets are encoded by an "embedded" encoding (which will be one
501 of the above types) until another escape sequence switches to
502 a different "embedded" encoding.
504 These schemes are very flexible and can handle mixed languages but are
505 very complex to process (and have state).
506 No escape encodings are implemented for perl yet.
510 =head2 Specifying Encodings
512 Encodings can be specified to the API described below in two ways:
518 Encoding names are strings with characters taken from a restricted repertoire.
519 See L</"Encoding Names">.
521 =item 2. As an object
523 Encoding objects are returned by C<find_encoding($name)>.
527 =head2 Encoding Names
529 Encoding names are case insensitive. White space in names is ignored.
530 In addition an encoding may have aliases. Each encoding has one "canonical" name.
531 The "canonical" name is chosen from the names of the encoding by picking
532 the first in the following sequence:
536 =item * The MIME name as defined in IETF RFC-XXXX.
538 =item * The name in the IANA registry.
540 =item * The name used by the organization that defined it.
544 Because of all the alias issues, and because in the general case
545 encodings have state C<Encode> uses the encoding object internally
546 once an operation is in progress.
548 I<Aliasing is not yet implemented.>
550 =head1 PERL ENCODING API
552 =head2 Generic Encoding Interface
558 $bytes = encode(ENCODING, $string[, CHECK])
560 Encodes string from perl's internal form into I<ENCODING> and returns a
562 See L</"Handling Malformed Data">.
566 $string = decode(ENCODING, $bytes[, CHECK])
568 Decodes a sequence of octets assumed to be in I<ENCODING> into perl's internal
569 form and returns the resulting string.
570 See L</"Handling Malformed Data">.
574 =head2 Handling Malformed Data
576 If CHECK is not set, C<undef> is returned. If the data is supposed to
577 be UTF-8, an optional lexical warning (category utf8) is given.
578 If CHECK is true but not a code reference, dies.
580 It would be desirable to have a way to indicate that transform should use the
581 encoding's "replacement character" - no such mechanism is defined yet.
583 It is also planned to allow I<CHECK> to be a code reference.
585 This is not yet implemented as there are design issues with what its arguments
586 should be and how it returns its results.
592 Passed remaining fragment of string being processed.
593 Modifies it in place to remove bytes/characters it can understand
594 and returns a string used to represent them.
598 my $ch = substr($_[0],0,1,'');
599 return sprintf("\x{%02X}",ord($ch));
602 This scheme is close to how underlying C code for Encode works, but gives
603 the fixup routine very little context.
607 Passed original string, and an index into it of the problem area,
608 and output string so far.
609 Appends what it will to output string and returns new index into
614 # my ($s,$i,$d) = @_;
615 my $ch = substr($_[0],$_[1],1);
616 $_[2] .= sprintf("\x{%02X}",ord($ch));
620 This scheme gives maximal control to the fixup routine but is more complicated
621 to code, and may need internals of Encode to be tweaked to keep original
628 Multiple return values rather than in-place modifications.
630 Index into the string could be pos($str) allowing s/\G...//.
636 The Unicode consortium defines the UTF-8 standard as a way of encoding
637 the entire Unicode repertoire as sequences of octets. This encoding
638 is expected to become very widespread. Perl can use this form internally
639 to represent strings, so conversions to and from this form are particularly
640 efficient (as octets in memory do not have to change, just the meta-data
641 that tells perl how to treat them).
647 $bytes = encode_utf8($string);
649 The characters that comprise string are encoded in perl's superset of UTF-8
650 and the resulting octets returned as a sequence of bytes. All possible
651 characters have a UTF-8 representation so this function cannot fail.
655 $string = decode_utf8($bytes [,CHECK]);
657 The sequence of octets represented by $bytes is decoded from UTF-8 into
658 a sequence of logical characters. Not all sequences of octets form valid
659 UTF-8 encodings, so it is possible for this call to fail.
660 See L</"Handling Malformed Data">.
664 =head2 Other Encodings of Unicode
666 UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
667 UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surrogate pair"
668 scheme which allows it to cover the whole Unicode range.
670 Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that
671 happens to be the name used by that representation when used with X11 fonts.
673 UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
674 can be considered as being in this form without encoding. An encoding
675 to transfer strings in this form (e.g. to write them to a file) would need to
677 pack('L*',map(ord($_),split(//,$string))); # native
679 pack('V*',map(ord($_),split(//,$string))); # little-endian
681 pack('N*',map(ord($_),split(//,$string))); # big-endian
683 depending on the endian required.
685 No UTF-32 encodings are implemented yet.
687 Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
688 the code point 0xFEFF as the very first thing in a file.
690 =head1 Encoding and IO
692 It is very common to want to do encoding transformations when
693 reading or writing files, network connections, pipes etc.
694 If perl is configured to use the new 'perlio' IO system then
695 C<Encode> provides a "layer" (See L<perliol>) which can transform
696 data as it is read or written.
698 open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek');
701 In addition the new IO system can also be configured to read/write
702 UTF-8 encoded characters (as noted above this is efficient):
704 open(my $fh,'>:utf8','anything');
705 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
707 Either of the above forms of "layer" specifications can be made the default
708 for a lexical scope with the C<use open ...> pragma. See L<open>.
710 Once a handle is open, its layers can be altered using C<binmode>.
712 Without any such configuration, or if perl itself is built using
713 system's own IO, then write operations assume that file handle accepts
714 only I<bytes> and will C<die> if a character larger than 255 is
715 written to the handle. When reading, each octet from the handle
716 becomes a byte-in-a-character. Note that this default is the same
717 behaviour as bytes-only languages (including perl before v5.6) would have,
718 and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
719 EBCDIC etc. and any legacy mechanisms for handling other encodings
722 In other cases it is the program's responsibility
723 to transform characters into bytes using the API above before
724 doing writes, and to transform the bytes read from a handle into characters
725 before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
727 =head1 Encoding How to ...
733 =item * IO with mixed content (faking iso-2022-*)
735 =item * MIME's Content-Length:
737 =item * UTF-8 strings in binary data.
739 =item * perl/Encode wrappers on non-Unicode XS modules.
743 =head1 Messing with Perl's Internals
745 The following API uses parts of perl's internals in the current implementation.
746 As such they are efficient, but may change.
752 $num_octets = utf8_upgrade($string);
754 Converts internal representation of string to the UTF-8 form.
755 Returns the number of octets necessary to represent the string as UTF-8.
757 =item * utf8_downgrade($string[, CHECK])
759 Converts internal representation of string to be un-encoded bytes.
761 =item * is_utf8(STRING [, CHECK])
763 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
764 If CHECK is true, also checks the data in STRING for being
765 well-formed UTF-8. Returns true if successful, false otherwise.
767 =item * valid_utf8(STRING)
769 [INTERNAL] Test whether STRING is in a consistent state.
770 Will return true if string is held as bytes, or is well-formed UTF-8
771 and has the UTF-8 flag on.
772 Main reason for this routine is to allow perl's testsuite to check
773 that operations have left strings in a consistent state.
779 [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
780 B<not> checked for being well-formed UTF-8. Do not use unless you
781 B<know> that the STRING is well-formed UTF-8. Returns the previous
782 state of the UTF-8 flag (so please don't test the return value as
783 I<not> success or failure), or C<undef> if STRING is not a string.
789 [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
790 Returns the previous state of the UTF-8 flag (so please don't test the
791 return value as I<not> success or failure), or C<undef> if STRING is
798 L<perlunicode>, L<perlebcdic>, L<perlfunc/open>