This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Checked in the wrong one ...
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
2c674647
JH
1package Encode;
2
3$VERSION = 0.01;
4
5require DynaLoader;
6require Exporter;
7
8@ISA = qw(Exporter DynaLoader);
9
4411f3b6
NIS
10# Public, encouraged API is exported by default
11@EXPORT = qw (
12 encode
13 decode
14 encode_utf8
15 decode_utf8
16 find_encoding
17);
18
2c674647
JH
19@EXPORT_OK =
20 qw(
4411f3b6 21 encodings
2c674647
JH
22 from_to
23 is_utf8
4411f3b6
NIS
24 is_8bit
25 is_16bit
a12c0f56
NIS
26 utf8_upgrade
27 utf8_downgrade
4411f3b6
NIS
28 _utf8_on
29 _utf8_off
2c674647
JH
30 );
31
32bootstrap Encode ();
33
4411f3b6 34# Documentation moved after __END__ for speed - NI-S
2c674647 35
bf230f3d
NIS
36use Carp;
37
2f2b4ff2 38# The global hash is declared in XS code
4411f3b6
NIS
39$encoding{Unicode} = bless({},'Encode::Unicode');
40$encoding{utf8} = bless({},'Encode::utf8');
9b37254d 41$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
5345d506 42
656753f8
NIS
43sub encodings
44{
45 my ($class) = @_;
5345d506 46 foreach my $dir (@INC)
656753f8 47 {
5345d506 48 if (opendir(my $dh,"$dir/Encode"))
656753f8 49 {
5345d506
NIS
50 while (defined(my $name = readdir($dh)))
51 {
52 if ($name =~ /^(.*)\.enc$/)
53 {
54 next if exists $encoding{$1};
55 $encoding{$1} = "$dir/$name";
56 }
57 }
58 closedir($dh);
656753f8 59 }
5345d506
NIS
60 }
61 return keys %encoding;
62}
63
64sub loadEncoding
65{
66 my ($class,$name,$file) = @_;
67 if (open(my $fh,$file))
68 {
69 my $type;
70 while (1)
71 {
72 my $line = <$fh>;
73 $type = substr($line,0,1);
74 last unless $type eq '#';
75 }
76 $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
c8991b40 77 #warn "Loading $file";
5345d506 78 return $class->read($fh,$name,$type);
656753f8
NIS
79 }
80 else
81 {
5345d506 82 return undef;
656753f8 83 }
656753f8
NIS
84}
85
656753f8
NIS
86sub getEncoding
87{
88 my ($class,$name) = @_;
5345d506
NIS
89 my $enc;
90 unless (ref($enc = $encoding{$name}))
656753f8 91 {
5345d506
NIS
92 $enc = $class->loadEncoding($name,$enc) if defined $enc;
93 unless (ref($enc))
656753f8 94 {
5345d506 95 foreach my $dir (@INC)
656753f8 96 {
5345d506 97 last if ($enc = $class->loadEncoding($name,"$dir/Encode/$name.enc"));
656753f8 98 }
87714904 99 }
5345d506 100 $encoding{$name} = $enc;
656753f8 101 }
5345d506 102 return $enc;
656753f8
NIS
103}
104
4411f3b6
NIS
105sub find_encoding
106{
107 my ($name) = @_;
108 return __PACKAGE__->getEncoding($name);
109}
110
111sub encode
112{
113 my ($name,$string,$check) = @_;
114 my $enc = find_encoding($name);
115 croak("Unknown encoding '$name'") unless defined $enc;
116 my $octets = $enc->fromUnicode($string,$check);
117 return undef if ($check && length($string));
118 return $octets;
119}
120
121sub decode
122{
123 my ($name,$octets,$check) = @_;
124 my $enc = find_encoding($name);
125 croak("Unknown encoding '$name'") unless defined $enc;
126 my $string = $enc->toUnicode($octets,$check);
127 return undef if ($check && length($octets));
128 return $string;
129}
130
131sub from_to
132{
133 my ($string,$from,$to,$check) = @_;
134 my $f = find_encoding($from);
135 croak("Unknown encoding '$from'") unless defined $f;
136 my $t = find_encoding($to);
137 croak("Unknown encoding '$to'") unless defined $t;
138 my $uni = $f->toUnicode($string,$check);
139 return undef if ($check && length($string));
140 $string = $t->fromUnicode($uni,$check);
141 return undef if ($check && length($uni));
142 return length($_[0] = $string);
143}
144
145sub encode_utf8
146{
147 my ($str) = @_;
148 utf8_encode($str);
149 return $str;
150}
151
152sub decode_utf8
153{
154 my ($str) = @_;
155 return undef unless utf8_decode($str);
156 return $str;
157}
158
656753f8
NIS
159package Encode::Unicode;
160
9b37254d 161# Dummy package that provides the encode interface but leaves data
a12c0f56 162# as UTF-8 encoded. It is here so that from_to() works.
656753f8
NIS
163
164sub name { 'Unicode' }
165
a12c0f56
NIS
166sub toUnicode
167{
168 my ($obj,$str,$chk) = @_;
169 Encode::utf8_upgrade($str);
170 $_[1] = '' if $chk;
171 return $str;
172}
656753f8 173
a12c0f56 174*fromUnicode = \&toUnicode;
656753f8 175
4411f3b6
NIS
176package Encode::utf8;
177
178# package to allow long-hand
179# $octets = encode( utf8 => $string );
180#
181
182sub name { 'utf8' }
183
184sub toUnicode
185{
186 my ($obj,$octets,$chk) = @_;
2a936312 187 my $str = Encode::decode_utf8($octets);
4411f3b6
NIS
188 if (defined $str)
189 {
190 $_[1] = '' if $chk;
191 return $str;
192 }
193 return undef;
194}
195
196sub fromUnicode
197{
198 my ($obj,$string,$chk) = @_;
2a936312 199 my $octets = Encode::encode_utf8($string);
4411f3b6
NIS
200 $_[1] = '' if $chk;
201 return $octets;
4411f3b6
NIS
202}
203
656753f8
NIS
204package Encode::Table;
205
206sub read
207{
208 my ($class,$fh,$name,$type) = @_;
209 my $rep = $class->can("rep_$type");
210 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
211 my @touni;
212 my %fmuni;
213 my $count = 0;
214 $def = hex($def);
656753f8
NIS
215 while ($pages--)
216 {
87714904
NIS
217 my $line = <$fh>;
218 chomp($line);
219 my $page = hex($line);
656753f8
NIS
220 my @page;
221 my $ch = $page * 256;
222 for (my $i = 0; $i < 16; $i++)
223 {
224 my $line = <$fh>;
225 for (my $j = 0; $j < 16; $j++)
226 {
227 my $val = hex(substr($line,0,4,''));
228 if ($val || !$ch)
229 {
230 my $uch = chr($val);
231 push(@page,$uch);
87714904 232 $fmuni{$uch} = $ch;
656753f8
NIS
233 $count++;
234 }
235 else
236 {
237 push(@page,undef);
238 }
239 $ch++;
240 }
241 }
242 $touni[$page] = \@page;
243 }
244
245 return bless {Name => $name,
246 Rep => $rep,
247 ToUni => \@touni,
248 FmUni => \%fmuni,
249 Def => $def,
250 Num => $count,
251 },$class;
252}
253
254sub name { shift->{'Name'} }
255
256sub rep_S { 'C' }
257
5dcbab34 258sub rep_D { 'n' }
656753f8 259
5dcbab34 260sub rep_M { ($_[0] > 255) ? 'n' : 'C' }
656753f8
NIS
261
262sub representation
263{
264 my ($obj,$ch) = @_;
265 $ch = 0 unless @_ > 1;
266 $obj-{'Rep'}->($ch);
267}
268
269sub toUnicode
270{
bf230f3d 271 my ($obj,$str,$chk) = @_;
656753f8
NIS
272 my $rep = $obj->{'Rep'};
273 my $touni = $obj->{'ToUni'};
274 my $uni = '';
275 while (length($str))
276 {
277 my $ch = ord(substr($str,0,1,''));
bf230f3d 278 my $x;
656753f8
NIS
279 if (&$rep($ch) eq 'C')
280 {
bf230f3d 281 $x = $touni->[0][$ch];
656753f8
NIS
282 }
283 else
284 {
bf230f3d 285 $x = $touni->[$ch][ord(substr($str,0,1,''))];
656753f8 286 }
bf230f3d
NIS
287 unless (defined $x)
288 {
289 last if $chk;
290 # What do we do here ?
291 $x = '';
292 }
293 $uni .= $x;
656753f8 294 }
bf230f3d 295 $_[1] = $str if $chk;
656753f8
NIS
296 return $uni;
297}
298
299sub fromUnicode
300{
bf230f3d 301 my ($obj,$uni,$chk) = @_;
656753f8
NIS
302 my $fmuni = $obj->{'FmUni'};
303 my $str = '';
304 my $def = $obj->{'Def'};
87714904 305 my $rep = $obj->{'Rep'};
656753f8
NIS
306 while (length($uni))
307 {
308 my $ch = substr($uni,0,1,'');
63eec5db 309 my $x = $fmuni->{chr(ord($ch))};
bf230f3d
NIS
310 unless (defined $x)
311 {
312 last if ($chk);
313 $x = $def;
314 }
87714904
NIS
315 $str .= pack(&$rep($x),$x);
316 }
317 $_[1] = $uni if $chk;
318 return $str;
319}
320
9b37254d
NIS
321package Encode::iso10646_1;
322# Encoding is 16-bit network order Unicode
323# Used for X font encodings
87714904
NIS
324
325sub name { 'iso10646-1' }
326
327sub toUnicode
328{
329 my ($obj,$str,$chk) = @_;
330 my $uni = '';
331 while (length($str))
332 {
5dcbab34 333 my $code = unpack('n',substr($str,0,2,'')) & 0xffff;
87714904
NIS
334 $uni .= chr($code);
335 }
336 $_[1] = $str if $chk;
a12c0f56 337 Encode::utf8_upgrade($uni);
87714904
NIS
338 return $uni;
339}
340
341sub fromUnicode
342{
343 my ($obj,$uni,$chk) = @_;
344 my $str = '';
345 while (length($uni))
346 {
347 my $ch = substr($uni,0,1,'');
348 my $x = ord($ch);
349 unless ($x < 32768)
350 {
351 last if ($chk);
352 $x = 0;
353 }
5dcbab34 354 $str .= pack('n',$x);
656753f8 355 }
bf230f3d 356 $_[1] = $uni if $chk;
656753f8
NIS
357 return $str;
358}
359
2f2b4ff2 360
656753f8
NIS
361package Encode::Escape;
362use Carp;
363
364sub read
365{
366 my ($class,$fh,$name) = @_;
367 my %self = (Name => $name, Num => 0);
368 while (<$fh>)
369 {
370 my ($key,$val) = /^(\S+)\s+(.*)$/;
371 $val =~ s/^\{(.*?)\}/$1/g;
372 $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge;
373 $self{$key} = $val;
374 }
375 return bless \%self,$class;
376}
377
378sub name { shift->{'Name'} }
379
380sub toUnicode
381{
382 croak("Not implemented yet");
383}
384
385sub fromUnicode
386{
387 croak("Not implemented yet");
388}
389
4411f3b6
NIS
390# switch back to Encode package in case we ever add AutoLoader
391package Encode;
392
656753f8
NIS
3931;
394
2a936312
NIS
395__END__
396
4411f3b6
NIS
397=head1 NAME
398
399Encode - character encodings
400
401=head1 SYNOPSIS
402
403 use Encode;
404
405=head1 DESCRIPTION
406
407The C<Encode> module provides the interfaces between perl's strings
408and the rest of the system. Perl strings are sequences of B<characters>.
409
410The repertoire of characters that Perl can represent is at least that
411defined by the Unicode Consortium. On most platforms the ordinal values
412of the characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for
413the character (the exceptions are those platforms where the legacy
414encoding is some variant of EBCDIC rather than a super-set of ASCII
415- see L<perlebcdic>).
416
417Traditionaly computer data has been moved around in 8-bit chunks
418often called "bytes". These chunks are also known as "octets" in
419networking standards. Perl is widely used to manipulate data of
420many types - not only strings of characters representing human or
421computer languages but also "binary" data being the machines representation
422of numbers, pixels in an image - or just about anything.
423
424When perl is processing "binary data" the programmer wants perl to process
425"sequences of bytes". This is not a problem for perl - as a byte has 256
426possible values it easily fits in perl's much larger "logical character".
427
428=head2 TERMINOLOGY
429
430=over
431
432=item *
433
434I<character>: a character in the range 0..(2**32-1) (or more).
435(What perl's strings are made of.)
436
437=item *
438
439I<byte>: a character in the range 0..255
440(A special case of a perl character.)
441
442=item *
443
444I<octet>: 8 bits of data, with ordinal values 0..255
445(Term for bytes passed to or from a non-perl context, e.g. disk file.)
446
447=back
448
449The marker [INTERNAL] marks Internal Implementation Details, in
450general meant only for those who think they know what they are doing,
451and such details may change in future releases.
452
453=head1 ENCODINGS
454
455=head2 Characteristics of an Encoding
456
457An encoding has a "repertoire" of characters that it can represent,
458and for each representable character there is at least one sequence of
459octets that represents it.
460
461=head2 Types of Encodings
462
463Encodings can be divided into the following types:
464
465=over 4
466
467=item * Fixed length 8-bit (or less) encodings.
468
469Each character is a single octet so may have a repertoire of up to
470256 characters. ASCII and iso-8859-* are typical examples.
471
472=item * Fixed length 16-bit encodings
473
474Each character is two octets so may have a repertoire of up to
47565,536 characters. Unicode's UCS-2 is an example. Also used for
476encodings for East Asian languages.
477
478=item * Fixed length 32-bit encodings.
479
480Not really very "encoded" encodings. The Unicode code points
481are just represented as 4-octet integers. None the less because
482different architectures use different representations of integers
483(so called "endian") there at least two disctinct encodings.
484
485=item * Multi-byte encodings
486
487The number of octets needed to represent a character varies.
488UTF-8 is a particularly complex but regular case of a multi-byte
489encoding. Several East Asian countries use a multi-byte encoding
490where 1-octet is used to cover western roman characters and Asian
491characters get 2-octets.
492(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
493to represent a Unicode code point.)
494
495=item * "Escape" encodings.
496
497These encodings embed "escape sequences" into the octet sequence
498which describe how the following octets are to be interpreted.
499The iso-2022-* family is typical. Following the escape sequence
500octets are encoded by an "embedded" encoding (which will be one
501of the above types) until another escape sequence switches to
502a different "embedded" encoding.
503
504These schemes are very flexible and can handle mixed languages but are
505very complex to process (and have state).
506No escape encodings are implemented for perl yet.
507
508=back
509
510=head2 Specifying Encodings
511
512Encodings can be specified to the API described below in two ways:
513
514=over 4
515
516=item 1. By name
517
518Encoding names are strings with characters taken from a restricted repertoire.
519See L</"Encoding Names">.
520
521=item 2. As an object
522
523Encoding objects are returned by C<find_encoding($name)>.
524
525=back
526
527=head2 Encoding Names
528
529Encoding names are case insensitive. White space in names is ignored.
530In addition an encoding may have aliases. Each encoding has one "canonical" name.
531The "canonical" name is chosen from the names of the encoding by picking
532the first in the following sequence:
533
534=over 4
535
536=item * The MIME name as defined in IETF RFC-XXXX.
537
538=item * The name in the IANA registry.
539
540=item * The name used by the the organization that defined it.
541
542=back
543
544Because of all the alias issues, and because in the general case
545encodings have state C<Encode> uses the encoding object internally
546once an operation is in progress.
547
548I<Aliasing is not yet implemented.>
549
550=head1 PERL ENCODING API
551
552=head2 Generic Encoding Interface
553
554=over 4
555
556=item *
557
558 $bytes = encode(ENCODING, $string[, CHECK])
559
560Encodes string from perl's internal form into I<ENCODING> and returns a
561sequence of octets.
562See L</"Handling Malformed Data">.
563
564=item *
565
566 $string = decode(ENCODING, $bytes[, CHECK])
567
568Decode sequence of octets assumed to be in I<ENCODING> into perls internal
569form and returns the resuting string.
570See L</"Handling Malformed Data">.
571
572=back
573
574=head2 Handling Malformed Data
575
576If CHECK is not set, C<undef> is returned. If the data is supposed to
577be UTF-8, an optional lexical warning (category utf8) is given.
578If CHECK is true but not a code reference, dies.
579
580It would desirable to have a way to indicate that transform should use the
581encodings "replacement character" - no such mechanism is defined yet.
582
583It is also planned to allow I<CHECK> to be a code reference.
584
585This is not yet implemented as there are design issues with what its arguments
586should be and how it returns its results.
587
588=over 4
589
590=item Scheme 1
591
592Passed remaining fragment of string being processed.
593Modifies it in place to remove bytes/characters it can understand
594and returns a string used to represent them.
595e.g.
596
597 sub fixup {
598 my $ch = substr($_[0],0,1,'');
599 return sprintf("\x{%02X}",ord($ch);
600 }
601
602This scheme is close to how underlying C code for Encode works, but gives
603the fixup routine very little context.
604
605=item Scheme 2
606
607Passed original string, and an index into it of the problem area,
608and output string so far.
609Appends what it will to output string and returns new index into
610original string.
611e.g.
612
613 sub fixup {
614 # my ($s,$i,$d) = @_;
615 my $ch = substr($_[0],$_[1],1);
616 $_[2] .= sprintf("\x{%02X}",ord($ch);
617 return $_[1]+1;
618 }
619
620This scheme gives maximal control to the fixup routine but is more complicated
621to code, and may need internals of Encode to be tweaked to keep original
622string intact.
623
624=item Other Schemes
625
626Hybrids of above.
627
628Multiple return values rather than in-place modifications.
629
630Index into the string could be pos($str) allowing s/\G...//.
631
632=back
633
634=head2 UTF-8 / utf8
635
636The Unicode consortium defines the UTF-8 standard as a way of encoding
637the entire Unicode repertiore as sequences of octets. This encoding
638is expected to become very widespread. Perl can use this form internaly
639to represent strings, so conversions to and from this form are particularly
640efficient (as octets in memory do not have to change, just the meta-data
641that tells perl how to treat them).
642
643=over 4
644
645=item *
646
647 $bytes = encode_utf8($string);
648
649The characters that comprise string are encoded in perl's superset of UTF-8
650and the resulting octets returned as a sequence of bytes. All possible
651characters have a UTF-8 representation so this function cannot fail.
652
653=item *
654
655 $string = decode_utf8($bytes [,CHECK]);
656
657The sequence of octets represented by $bytes is decoded from UTF-8 into
658a sequence of logical characters. Not all sequences of octets form valid
659UTF-8 encodings, so it is possible for this call to fail.
660See L</"Handling Malformed Data">.
661
662=back
663
664=head2 Other Encodings of Unicode
665
666UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks.
667UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surogate pair"
668scheme which allows it to cover the whole Unicode range.
669
670Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that
671happens to be the name used by that representation when used with X11 fonts.
672
673UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
674can be considered as being in this form without encoding. An encoding
675to transfer strings in this form (e.g. to write them to a file) would need to
676
677 pack('L',map(chr($_),split(//,$string))); # native
678 or
679 pack('V',map(chr($_),split(//,$string))); # little-endian
680 or
681 pack('N',map(chr($_),split(//,$string))); # big-endian
682
683depending on the endian required.
684
685No UTF-32 encodings are not yet implemented.
686
687Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing
688the code point 0xFFFE as the very first thing in a file.
689
690=head1 Encoding and IO
691
692It is very common to want to do encoding transformations when
693reading or writing files, network connections, pipes etc.
694If perl is configured to use the new 'perlio' IO system then
695C<Encode> provides a "layer" (See L<perliol>) which can transform
696data as it is read or written.
697
698 open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek');
699 print $ilyad @epic;
700
701In addition the new IO system can also be configured to read/write
702UTF-8 encoded characters (as noted above this is efficient):
703
704 open(my $fh,'>:utf8','anything');
705 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
706
707Either of the above forms of "layer" specifications can be made the default
708for a lexical scope with the C<use open ...> pragma. See L<open>.
709
710Once a handle is open is layers can be altered using C<binmode>.
711
712Without any such configuration, or if perl itself is built using
713system's own IO, then write operations assume that file handle accepts
714only I<bytes> and will C<die> if a character larger than 255 is
715written to the handle. When reading, each octet from the handle
716becomes a byte-in-a-character. Note that this default is the same
717behaviour as bytes-only languages (including perl before v5.6) would have,
718and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
719EBCDIC etc. and any legacy mechanisms for handling other encodings
720and binary data.
721
722In other cases it is the programs responsibility
723to transform characters into bytes using the API above before
724doing writes, and to transform the bytes read from a handle into characters
725before doing "character operations" (e.g. C<lc>, C</\W+/>, ...).
726
727=head1 Encoding How to ...
728
729To do:
730
731=over 4
732
733=item * IO with mixed content (faking iso-2020-*)
734
735=item * MIME's Content-Length:
736
737=item * UTF-8 strings in binary data.
738
739=item * perl/Encode wrappers on non-Unicode XS modules.
740
741=back
742
743=head1 Messing with Perl's Internals
744
745The following API uses parts of perl's internals in the current implementation.
746As such they are efficient, but may change.
747
748=over 4
749
750=item *
751
752 $num_octets = utf8_upgrade($string);
753
754Converts internal representation of string to the UTF-8 form.
755Returns the number of octets necessary to represent the string as UTF-8.
756
757=item * utf8_downgrade($string[, CHECK])
758
759Converts internal representation of string to be un-encoded bytes.
760
761=item * is_utf8(STRING [, CHECK])
762
763[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
764If CHECK is true, also checks the data in STRING for being
765well-formed UTF-8. Returns true if successful, false otherwise.
766
767=item * valid_utf8(STRING)
768
769[INTERNAL] Test whether STRING is in a consistent state.
770Will return true if string is held as bytes, or is well-formed UTF-8
771and has the UTF-8 flag on.
772Main reason for this routine is to allow perl's testsuite to check
773that operations have left strings in a consistent state.
774
775=item *
776
777 _utf8_on(STRING)
778
779[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
780B<not> checked for being well-formed UTF-8. Do not use unless you
781B<know> that the STRING is well-formed UTF-8. Returns the previous
782state of the UTF-8 flag (so please don't test the return value as
783I<not> success or failure), or C<undef> if STRING is not a string.
784
785=item *
786
787 _utf8_off(STRING)
788
789[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
790Returns the previous state of the UTF-8 flag (so please don't test the
791return value as I<not> success or failure), or C<undef> if STRING is
792not a string.
793
794=back
795
796=head1 SEE ALSO
797
798L<perlunicode>, L<perlebcdic>, L<perlfunc/open>
799
800=cut
801
802
2a936312 803