This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Set makefile.mk CCHOME etc. for default locations of MinGW and free
[perl5.git] / ext / Encode / Encode.pm
CommitLineData
2c674647 1package Encode;
51ef4e11 2use strict;
2c674647 3
b8a524e9 4our $VERSION = '0.02';
2c674647
JH
5
6require DynaLoader;
7require Exporter;
8
51ef4e11 9our @ISA = qw(Exporter DynaLoader);
2c674647 10
4411f3b6 11# Public, encouraged API is exported by default
51ef4e11 12our @EXPORT = qw (
4411f3b6
NIS
13 encode
14 decode
15 encode_utf8
16 decode_utf8
17 find_encoding
51ef4e11 18 encodings
4411f3b6
NIS
19);
20
51ef4e11 21our @EXPORT_OK =
2c674647 22 qw(
51ef4e11
NIS
23 define_encoding
24 define_alias
2c674647
JH
25 from_to
26 is_utf8
4411f3b6
NIS
27 is_8bit
28 is_16bit
a12c0f56
NIS
29 utf8_upgrade
30 utf8_downgrade
4411f3b6
NIS
31 _utf8_on
32 _utf8_off
2c674647
JH
33 );
34
35bootstrap Encode ();
36
4411f3b6 37# Documentation moved after __END__ for speed - NI-S
2c674647 38
bf230f3d
NIS
39use Carp;
40
51ef4e11
NIS
41# Make a %encoding package variable to allow a certain amount of cheating
42our %encoding;
43my @alias; # ordered matching list
44my %alias; # cached known aliases
f7ac3676 45
6d6a7c8d
NIS
46 # 0 1 2 3 4 5 6 7 8 9 10
47our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
48
f7ac3676
JH
49our %winlatin2cp = (
50 'Latin1' => 1252,
51 'Latin2' => 1250,
52 'Cyrillic' => 1251,
f7ac3676
JH
53 'Greek' => 1253,
54 'Turkish' => 1254,
55 'Hebrew' => 1255,
56 'Arabic' => 1256,
57 'Baltic' => 1257,
58 'Vietnamese' => 1258,
59 );
5345d506 60
656753f8
NIS
# Return the names of all registered encodings, except the private
# 'Internal' one, ordered case-insensitively.  In scalar context the
# number of such names is returned.
sub encodings
{
    my @names = grep { $_ ne 'Internal' } keys %encoding;

    # Pre-compute the lower-cased sort keys once per name.
    my %folded;
    @folded{@names} = map { lc } @names;

    my @ordered = sort { $folded{$a} cmp $folded{$b} } @names;
    return @ordered;
}
71
# Resolve NAME through the ordered alias list and return the encoding
# object it maps to (undef when nothing matches).  Successful lookups
# are cached in %alias.  $_ is localised to NAME for the duration so
# that code-reference aliases can inspect it.
sub findAlias
{
    my $class = shift;
    local $_ = shift;
    # print "# findAlias $_\n";

    # Already resolved on an earlier call.
    return $alias{$_} if exists $alias{$_};

    my $pos = 0;
    while ($pos < @alias)
    {
        # @alias is a flat list of (matcher => target) pairs.
        my ($matcher, $target) = @alias[$pos, $pos + 1];
        $pos += 2;

        my $resolved;
        if (ref($matcher) eq 'Regexp' && $_ =~ $matcher)
        {
            # The target is Perl source; evaluating it here lets it
            # interpolate $1 etc. from the match just performed.
            $resolved = eval $target;
        }
        elsif (ref($matcher) eq 'CODE')
        {
            $resolved = $matcher->($target);
        }
        elsif (lc($_) eq lc($matcher))
        {
            $resolved = $target;
        }

        next unless defined $resolved;
        next if $resolved eq $_; # avoid (direct) recursion on bugs

        my $enc = ref($resolved) ? $resolved : find_encoding($resolved);
        next unless $enc;

        $alias{$_} = $enc;
        last;
    }
    return $alias{$_};
}
110
# Register one or more (alias => encoding) pairs.  The pairs are
# appended to the ordered @alias list in the order given; an odd
# trailing alias is stored with an undefined target.
sub define_alias
{
    while (@_)
    {
        my $alias = shift;
        my $name  = shift;
        push @alias, $alias, $name;
    }
}
119
016cb72c 120# Allow variants of iso-8859-1 etc.
d6089a2a 121define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
016cb72c 122
7faf300d
JH
123# At least HP-UX has these.
124define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
125
f7ac3676
JH
126# More HP stuff.
127define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
128
8a361256
JH
129# The Official name of ASCII.
130define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
131
58d53262
JH
132# This is a font issue, not an encoding issue.
133# (The currency symbol of the Latin 1 upper half
134# has been redefined as the euro symbol.)
135define_alias( qr/^(.+)\@euro$/i => '"$1"' );
136
016cb72c 137# Allow latin-1 style names as well
7faf300d 138define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );
016cb72c 139
f7ac3676 140# Allow winlatin1 style names as well
cf91068f 141define_alias( qr/^win(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|baltic|vietnamese)$/i => '"cp$winlatin2cp{\u$1}"' );
f7ac3676 142
016cb72c
NIS
143# Common names for non-latin preferred MIME names
144define_alias( 'ascii' => 'US-ascii',
145 'cyrillic' => 'iso-8859-5',
146 'arabic' => 'iso-8859-6',
147 'greek' => 'iso-8859-7',
f7ac3676
JH
148 'hebrew' => 'iso-8859-8',
149 'thai' => 'iso-8859-11',
150 'tis620' => 'iso-8859-11',
151 );
016cb72c 152
7faf300d
JH
153# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
154define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"');
155
58d53262
JH
156# Standardize on the dashed versions.
157define_alias( qr/^utf8$/i => 'utf-8' );
7faf300d 158define_alias( qr/^koi8r$/i => 'koi8-r' );
f7ac3676
JH
159define_alias( qr/^koi8u$/i => 'koi8-u' );
160
161# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
162# TODO: HP-UX '15' encodings japanese15 korean15 roi15
163# TODO: Cyrillic encoding ISO-IR-111 (useful?)
164# TODO: Chinese encodings GB18030 GBK Big5-HSKCS EUC-TW
165# TODO: Armenian encoding ARMSCII-8
166# TODO: Hebrew encoding ISO-8859-8-1
167# TODO: Thai encoding TCVN
168# TODO: Korean encoding Johab
56a543c5 169# TODO: Vietnamese encodings VPS
f7ac3676
JH
170# TODO: Japanese encoding JIS (not the same as SJIS)
171# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
172# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
173# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
174# Kannada Khmer Korean Laotian Malayalam Mongolian
175# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
176# TODO: what is the Japanese 'UJIS' encoding seen in some Linuxes?
58d53262 177
016cb72c
NIS
178# Map white space and _ to '-'
179define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
180
51ef4e11
NIS
# Register OBJ under its canonical NAME in %encoding.  The lower-cased
# name (when it differs) and any further arguments are registered as
# aliases for the same object.  Returns OBJ.
sub define_encoding
{
    my ($obj, $name, @extra) = @_;

    $encoding{$name} = $obj;

    my $folded = lc $name;
    define_alias($folded, $obj) if $folded ne $name;

    define_alias($_, $obj) for @extra;

    return $obj;
}
195
656753f8
NIS
# Class method: map NAME to an encoding object.
#
#   NAME may be an encoding object (anything blessed that implements
#   new_sequence), which is returned as-is, or a string that is looked
#   up in %encoding - exactly, then lower-cased - and finally through
#   the alias list.
#
# Returns the encoding object, or nothing (undef in scalar context)
# when NAME is undefined or unknown.
sub getEncoding
{
    my ($class,$name) = @_;

    # An undefined name can never match; bail out before it is used
    # as a hash key or matched against the alias patterns.
    return unless defined $name;

    # Only call ->can on blessed values: invoking a method on a plain
    # (unblessed) reference is a fatal error.
    require Scalar::Util;
    if (Scalar::Util::blessed($name) && $name->can('new_sequence'))
    {
        return $name;
    }

    return $encoding{$name} if exists $encoding{$name};

    my $lc = lc $name;
    return $encoding{$lc} if exists $encoding{$lc};

    my $oc = $class->findAlias($name);
    return $oc if defined $oc;
    return $class->findAlias($lc) if $lc ne $name;

    return;
}
220
4411f3b6
NIS
# Functional front end to getEncoding: return the encoding object for
# NAME (a string or an encoding object), or undef if it is unknown.
sub find_encoding
{
    my $name = $_[0];
    return __PACKAGE__->getEncoding($name);
}
226
# encode(ENCODING, STRING [, CHECK])
# Convert STRING to octets in ENCODING.  Croaks when ENCODING is
# unknown.  With a true CHECK, undef is returned if the encoding
# object left part of the string unprocessed.
sub encode
{
    my ($name, $string, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    # In CHECK mode the encoding object trims what it consumed from
    # $string (its $_[1]), so anything left over means failure.
    my $octets = $enc->encode($string, $check);
    my $leftover = $check && length($string);

    return $leftover ? undef : $octets;
}
236
# decode(ENCODING, OCTETS [, CHECK])
# Convert OCTETS from ENCODING into a Perl string.  Croaks when
# ENCODING is unknown.  With a true CHECK the caller's OCTETS argument
# is overwritten with whatever the encoding object left unprocessed.
sub decode
{
    my ($name, $octets, $check) = @_;

    my $enc = find_encoding($name);
    defined $enc or croak("Unknown encoding '$name'");

    my $string = $enc->decode($octets, $check);
    if ($check)
    {
        # Hand the unconsumed remainder back through the alias in @_.
        $_[1] = $octets;
    }
    return $string;
}
246
# from_to(STRING, FROM, TO [, CHECK])
# Convert STRING in place from encoding FROM to encoding TO.  Croaks
# when either encoding is unknown.  Returns the length of the
# converted data, or undef if CHECK is true and either stage left
# input unprocessed (STRING is then not modified).
sub from_to
{
    my ($string, $from, $to, $check) = @_;

    my $src = find_encoding($from);
    defined $src or croak("Unknown encoding '$from'");
    my $dst = find_encoding($to);
    defined $dst or croak("Unknown encoding '$to'");

    # Stage 1: octets in FROM -> characters.
    my $uni = $src->decode($string, $check);
    if ($check && length($string))
    {
        return undef;
    }

    # Stage 2: characters -> octets in TO.
    $string = $dst->encode($uni, $check);
    if ($check && length($uni))
    {
        return undef;
    }

    # Write the result back through the alias to the caller's variable.
    return length($_[0] = $string);
}
260
# Return a copy of the argument converted from characters to their
# UTF-8 octets.  The caller's string is left untouched.
sub encode_utf8
{
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
267
# Return a copy of the argument decoded from UTF-8 octets into
# characters, or undef when utf8::decode reports failure.
sub decode_utf8
{
    my $chars = $_[0];
    return utf8::decode($chars) ? $chars : undef;
}
274
50d26985
NIS
275package Encode::Encoding;
276# Base class for classes which implement encodings
4edaa979 277
51ef4e11
NIS
# Class or object method: register an encoding under CANONICAL plus
# any further alias names.  When invoked on a class name, a minimal
# object holding the canonical name is blessed into that class first.
# Returns whatever Encode::define_encoding returns.
sub Define
{
    my ($obj, $canonical, @aliases) = @_;
    unless (ref $obj)
    {
        $obj = bless { Name => $canonical }, $obj;
    }
    # warn "$canonical => $obj\n";
    return Encode::define_encoding($obj, $canonical, @aliases);
}
286
# Accessor for the name stored under the object's 'Name' key.
sub name
{
    my ($self) = @_;
    return $self->{'Name'};
}
288
50d26985 289# Temporary legacy methods
4edaa979
NIS
# Legacy spellings: forward to decode/encode on the same object.  The
# remaining arguments are passed on as aliases, so in-place updates
# made by the target method still reach the caller's variables.
sub toUnicode
{
    my $self = shift;
    return $self->decode(@_);
}
sub fromUnicode
{
    my $self = shift;
    return $self->encode(@_);
}
292
# Base implementation simply returns the invocant itself.
sub new_sequence
{
    my ($self) = @_;
    return $self;
}
50d26985
NIS
294
295package Encode::XS;
296use base 'Encode::Encoding';
297
5ad8ef52 298package Encode::Internal;
50d26985 299use base 'Encode::Encoding';
656753f8 300
9b37254d 301# Dummy package that provides the encode interface but leaves data
1b026014 302# as UTF-X encoded. It is here so that from_to() works.
656753f8 303
5ad8ef52
NIS
304__PACKAGE__->Define('Internal');
305
306Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
656753f8 307
# Pass-through conversion: return a copy of the string upgraded to
# Perl's internal representation.  With a true CHECK the caller's
# string is emptied to signal that everything was consumed.
sub decode
{
    my ($obj, $text, $chk) = @_;
    utf8::upgrade($text);
    if ($chk)
    {
        $_[1] = '';
    }
    return $text;
}
656753f8 315
50d26985 316*encode = \&decode;
656753f8 317
5ad8ef52
NIS
318package Encoding::Unicode;
319use base 'Encode::Encoding';
320
321__PACKAGE__->Define('Unicode') unless ord('A') == 65;
322
# Map every character's ordinal through utf8::unicode_to_native and
# return the resulting string.  With a true CHECK the caller's input
# is emptied to signal that everything was consumed.
sub decode
{
    my ($obj, $str, $chk) = @_;
    my $res = join '',
              map { chr(utf8::unicode_to_native(ord($_))) }
              split(//, $str);
    $_[1] = '' if $chk;
    return $res;
}
334
# Map every character's ordinal through utf8::native_to_unicode and
# return the resulting string.  With a true CHECK the caller's input
# is emptied to signal that everything was consumed.
sub encode
{
    my ($obj, $str, $chk) = @_;
    my $res = join '',
              map { chr(utf8::native_to_unicode(ord($_))) }
              split(//, $str);
    $_[1] = '' if $chk;
    return $res;
}
346
347
4411f3b6 348package Encode::utf8;
50d26985 349use base 'Encode::Encoding';
4411f3b6
NIS
350# package to allow long-hand
351# $octets = encode( utf8 => $string );
352#
353
51ef4e11 354__PACKAGE__->Define(qw(UTF-8 utf8));
4411f3b6 355
# Decode UTF-8 octets via Encode::decode_utf8.  Returns the decoded
# string, or undef when decoding fails.  On success with a true CHECK
# the caller's input is emptied; on failure it is left alone.
sub decode
{
    my ($obj, $octets, $chk) = @_;
    my $decoded = Encode::decode_utf8($octets);
    return undef unless defined $decoded;
    $_[1] = '' if $chk;
    return $decoded;
}
367
# Encode a string as UTF-8 octets via Encode::encode_utf8.  With a
# true CHECK the caller's input is emptied to signal that everything
# was consumed.
sub encode
{
    my ($obj, $string, $chk) = @_;
    my $bytes = Encode::encode_utf8($string);
    if ($chk)
    {
        $_[1] = '';
    }
    return $bytes;
}
375
9b37254d 376package Encode::iso10646_1;
50d26985 377use base 'Encode::Encoding';
51ef4e11 378# Encoding is 16-bit network order Unicode (no surrogates)
9b37254d 379# Used for X font encodings
87714904 380
8040349a 381__PACKAGE__->Define(qw(UCS-2 iso-10646-1));
87714904 382
# Decode big-endian UCS-2 octets into a character string.
#
#   $obj - encoding object (unused)
#   $str - octets to decode, two per character
#   $chk - when true, the caller's input is replaced with whatever
#          could not be decoded
#
# Only complete two-octet units are consumed.  A trailing odd octet is
# not a character; it is left as unprocessed input (and handed back in
# CHECK mode) rather than being decoded as a spurious NUL.
sub decode
{
    my ($obj,$str,$chk) = @_;
    my $uni = '';
    while (length($str) >= 2)
    {
        # 'n' = unsigned 16-bit, network (big-endian) order.
        $uni .= chr(unpack('n', substr($str,0,2,'')));
    }
    $_[1] = $str if $chk;
    utf8::upgrade($uni);
    return $uni;
}
396
# Encode a character string as big-endian UCS-2 octets.
#
#   $obj - encoding object (unused)
#   $uni - characters to encode
#   $chk - when true, stop at the first unrepresentable character and
#          replace the caller's input with the unprocessed remainder
#          (which begins with that character)
#
# UCS-2 covers the full 16-bit range 0..0xFFFF.  Characters above that
# are written as 0x0000 when CHECK is false.
sub encode
{
    my ($obj,$uni,$chk) = @_;
    my $str = '';
    while (length($uni))
    {
        # Peek at the first character; it is consumed only once we
        # know it will be written, so a failure leaves it in $uni.
        my $x = ord($uni);
        if ($x > 0xFFFF)
        {
            last if ($chk);
            $x = 0;
        }
        substr($uni,0,1,'');
        # 'n' = unsigned 16-bit, network (big-endian) order.
        $str .= pack('n',$x);
    }
    $_[1] = $uni if $chk;
    return $str;
}
415
79019f4f
MS
416package Encode::ucs_2le;
417use base 'Encode::Encoding';
418
419__PACKAGE__->Define(qw(UCS-2le UCS-2LE ucs-2le));
420
# Decode little-endian UCS-2 octets into a character string.
#
#   $obj - encoding object (unused)
#   $str - octets to decode, two per character
#   $chk - when true, the caller's input is replaced with whatever
#          could not be decoded
#
# Only complete two-octet units are consumed.  A trailing odd octet is
# not a character; it is left as unprocessed input (and handed back in
# CHECK mode) rather than being decoded as a spurious NUL.
sub decode
{
    my ($obj,$str,$chk) = @_;
    my $uni = '';
    while (length($str) >= 2)
    {
        # 'v' = unsigned 16-bit, little-endian ("VAX") order.
        $uni .= chr(unpack('v', substr($str,0,2,'')));
    }
    $_[1] = $str if $chk;
    utf8::upgrade($uni);
    return $uni;
}
434
# Encode a character string as little-endian UCS-2 octets.
#
#   $obj - encoding object (unused)
#   $uni - characters to encode
#   $chk - when true, stop at the first unrepresentable character and
#          replace the caller's input with the unprocessed remainder
#          (which begins with that character)
#
# UCS-2 covers the full 16-bit range 0..0xFFFF.  Characters above that
# are written as 0x0000 when CHECK is false.
sub encode
{
    my ($obj,$uni,$chk) = @_;
    my $str = '';
    while (length($uni))
    {
        # Peek at the first character; it is consumed only once we
        # know it will be written, so a failure leaves it in $uni.
        my $x = ord($uni);
        if ($x > 0xFFFF)
        {
            last if ($chk);
            $x = 0;
        }
        substr($uni,0,1,'');
        # 'v' = unsigned 16-bit, little-endian ("VAX") order.
        $str .= pack('v',$x);
    }
    $_[1] = $uni if $chk;
    return $str;
}
453
4411f3b6
NIS
454# switch back to Encode package in case we ever add AutoLoader
455package Encode;
456
656753f8
NIS
4571;
458
2a936312
NIS
459__END__
460
4411f3b6
NIS
461=head1 NAME
462
463Encode - character encodings
464
465=head1 SYNOPSIS
466
467 use Encode;
468
469=head1 DESCRIPTION
470
47bfe92f
JH
471The C<Encode> module provides the interfaces between Perl's strings
472and the rest of the system. Perl strings are sequences of B<characters>.
4411f3b6
NIS
473
474The repertoire of characters that Perl can represent is at least that
47bfe92f
JH
475defined by the Unicode Consortium. On most platforms the ordinal
476values of the characters (as returned by C<ord(ch)>) is the "Unicode
477codepoint" for the character (the exceptions are those platforms where
478the legacy encoding is some variant of EBCDIC rather than a super-set
479of ASCII - see L<perlebcdic>).
4411f3b6
NIS
480
481Traditionally computer data has been moved around in 8-bit chunks
482often called "bytes". These chunks are also known as "octets" in
483networking standards. Perl is widely used to manipulate data of
484many types - not only strings of characters representing human or
485computer languages but also "binary" data being the machine's representation
486of numbers, pixels in an image - or just about anything.
487
47bfe92f
JH
488When Perl is processing "binary data" the programmer wants Perl to process
489"sequences of bytes". This is not a problem for Perl - as a byte has 256
490possible values it easily fits in Perl's much larger "logical character".
4411f3b6
NIS
491
492=head2 TERMINOLOGY
493
4ac9195f 494=over 4
4411f3b6
NIS
495
496=item *
497
498I<character>: a character in the range 0..(2**32-1) (or more).
47bfe92f 499(What Perl's strings are made of.)
4411f3b6
NIS
500
501=item *
502
503I<byte>: a character in the range 0..255
47bfe92f 504(A special case of a Perl character.)
4411f3b6
NIS
505
506=item *
507
508I<octet>: 8 bits of data, with ordinal values 0..255
47bfe92f 509(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
4411f3b6
NIS
510
511=back
512
513The marker [INTERNAL] marks Internal Implementation Details, in
514general meant only for those who think they know what they are doing,
515and such details may change in future releases.
516
517=head1 ENCODINGS
518
519=head2 Characteristics of an Encoding
520
521An encoding has a "repertoire" of characters that it can represent,
522and for each representable character there is at least one sequence of
523octets that represents it.
524
525=head2 Types of Encodings
526
527Encodings can be divided into the following types:
528
529=over 4
530
531=item * Fixed length 8-bit (or less) encodings.
532
533Each character is a single octet so may have a repertoire of up to
534256 characters. ASCII and iso-8859-* are typical examples.
535
536=item * Fixed length 16-bit encodings
537
538Each character is two octets so may have a repertoire of up to
47bfe92f 53965 536 characters. Unicode's UCS-2 is an example. Also used for
4411f3b6
NIS
540encodings for East Asian languages.
541
542=item * Fixed length 32-bit encodings.
543
544Not really very "encoded" encodings. The Unicode code points
545are just represented as 4-octet integers. None the less because
546different architectures use different representations of integers
547(so called "endian") there are at least two distinct encodings.
548
549=item * Multi-byte encodings
550
551The number of octets needed to represent a character varies.
552UTF-8 is a particularly complex but regular case of a multi-byte
553encoding. Several East Asian countries use a multi-byte encoding
554where 1-octet is used to cover western roman characters and Asian
555characters get 2-octets.
556(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
557to represent a Unicode code point.)
558
559=item * "Escape" encodings.
560
561These encodings embed "escape sequences" into the octet sequence
562which describe how the following octets are to be interpreted.
563The iso-2022-* family is typical. Following the escape sequence
564octets are encoded by an "embedded" encoding (which will be one
565of the above types) until another escape sequence switches to
566a different "embedded" encoding.
567
568These schemes are very flexible and can handle mixed languages but are
47bfe92f
JH
569very complex to process (and have state). No escape encodings are
570implemented for Perl yet.
4411f3b6
NIS
571
572=back
573
574=head2 Specifying Encodings
575
576Encodings can be specified to the API described below in two ways:
577
578=over 4
579
580=item 1. By name
581
47bfe92f
JH
582Encoding names are strings with characters taken from a restricted
583repertoire. See L</"Encoding Names">.
4411f3b6
NIS
584
585=item 2. As an object
586
587Encoding objects are returned by C<find_encoding($name)>.
588
589=back
590
591=head2 Encoding Names
592
593Encoding names are case insensitive. White space in names is ignored.
47bfe92f
JH
594In addition an encoding may have aliases. Each encoding has one
595"canonical" name. The "canonical" name is chosen from the names of
596the encoding by picking the first in the following sequence:
4411f3b6
NIS
597
598=over 4
599
78255929 600=item * The MIME name as defined in IETF RFCs.
4411f3b6
NIS
601
602=item * The name in the IANA registry.
603
d1be9408 604=item * The name used by the organization that defined it.
4411f3b6
NIS
605
606=back
607
608Because of all the alias issues, and because in the general case
609encodings have state C<Encode> uses the encoding object internally
610once an operation is in progress.
611
21938dfa
JH
612As of Perl 5.8.0, at least the following encodings are recognized
613(the => marks aliases):
614
615 ASCII
616
617 US-ASCII => ASCII
618
619The Unicode:
620
621 UTF-8
622 UTF-16
623 UCS-2
624
625 ISO 10646-1 => UCS-2
626
627The ISO 8859 and KOI:
628
629 ISO 8859-1 ISO 8859-6 ISO 8859-11 KOI8-F
630 ISO 8859-2 ISO 8859-7 (12 doesn't exist) KOI8-R
56a543c5 631 ISO 8859-3 ISO 8859-8 ISO 8859-13 KOI8-U
21938dfa
JH
632 ISO 8859-4 ISO 8859-9 ISO 8859-14
633 ISO 8859-5 ISO 8859-10 ISO 8859-15
634 ISO 8859-16
635
636 Latin1 => 8859-1 Latin6 => 8859-10
637 Latin2 => 8859-2 Latin7 => 8859-13
638 Latin3 => 8859-3 Latin8 => 8859-14
639 Latin4 => 8859-4 Latin9 => 8859-15
640 Latin5 => 8859-9 Latin10 => 8859-16
641
642 Cyrillic => 8859-5
643 Arabic => 8859-6
644 Greek => 8859-7
645 Hebrew => 8859-8
646 Thai => 8859-11
647 TIS620 => 8859-11
648
649The CJKV: Chinese, Japanese, Korean, Vietnamese:
650
651 ISO 2022 ISO 2022 JP-1 JIS 0201 GB 1988 Big5 EUC-CN
56a543c5 652 ISO 2022 CN ISO 2022 JP-2 JIS 0208 GB 2312 HZ EUC-JP
21938dfa 653 ISO 2022 JP ISO 2022 KR JIS 0210 GB 12345 CNS 11643 EUC-JP-0212
56a543c5 654 Shift-JIS EUC-KR
21938dfa
JH
655 VISCII
656
657The PC codepages:
658
659 CP37 CP852 CP861 CP866 CP949 CP1251 CP1256
660 CP424 CP855 CP862 CP869 CP950 CP1252 CP1257
661 CP737 CP856 CP863 CP874 CP1006 CP1253 CP1258
662 CP775 CP857 CP864 CP932 CP1047 CP1254
663 CP850 CP860 CP865 CP936 CP1250 CP1255
664
665 WinLatin1 => CP1252
666 WinLatin2 => CP1250
667 WinCyrillic => CP1251
668 WinGreek => CP1253
669 WinTurkish => CP1254
670 WinHebrew => CP1255
671 WinArabic => CP1256
672 WinBaltic => CP1257
673 WinVietnamese => CP1258
674
4a42e14c 675(All the CPI<NNN...> are available also as IBMI<NNN...>.)
21938dfa
JH
676
677The Mac codepages:
678
679 MacCentralEuropean MacJapanese
56a543c5
JH
680 MacCroatian MacRoman
681 MacCyrillic MacRumanian
682 MacDingbats MacSami
683 MacGreek MacThai
684 MacIcelandic MacTurkish
685 MacUkraine
21938dfa
JH
686
687Miscellaneous:
688
689 7bit-greek IR-197
690 7bit-kana NeXTstep
691 7bit-latin1 POSIX-BC
692 DingBats Roman8
693 GSM 0338 Symbol
694
4411f3b6
NIS
695=head1 PERL ENCODING API
696
697=head2 Generic Encoding Interface
698
699=over 4
700
701=item *
702
703 $bytes = encode(ENCODING, $string[, CHECK])
704
47bfe92f
JH
705Encodes string from Perl's internal form into I<ENCODING> and returns
706a sequence of octets. For CHECK see L</"Handling Malformed Data">.
4411f3b6 707
681a7c68
JH
708For example to convert (internally UTF-8 encoded) Unicode data
709to octets:
710
711 $octets = encode("utf8", $unicode);
712
4411f3b6
NIS
713=item *
714
715 $string = decode(ENCODING, $bytes[, CHECK])
716
47bfe92f
JH
717Decode sequence of octets assumed to be in I<ENCODING> into Perl's
718internal form and returns the resulting string. For CHECK see
719L</"Handling Malformed Data">.
720
681a7c68
JH
721For example to convert ISO 8859-1 data to UTF-8:
722
723 $utf8 = decode("latin1", $latin1);
724
47bfe92f
JH
725=item *
726
727 from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
728
2b106fbe
JH
729Convert B<in-place> the data between two encodings. How did the data
730in $string originally get to be in FROM_ENCODING? Either using
e9692b5b 731encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
2b106fbe
JH
732see L</"Handling Malformed Data">.
733
734For example to convert ISO 8859-1 data to UTF-8:
735
736 from_to($data, "iso-8859-1", "utf-8");
737
738and to convert it back:
739
740 from_to($data, "utf-8", "iso-8859-1");
4411f3b6 741
ab97ca19
JH
742Note that because the conversion happens in place, the data to be
743converted cannot be a string constant, it must be a scalar variable.
744
4411f3b6
NIS
745=back
746
747=head2 Handling Malformed Data
748
749If CHECK is not set, C<undef> is returned. If the data is supposed to
47bfe92f
JH
750be UTF-8, an optional lexical warning (category utf8) is given. If
751CHECK is true but not a code reference, dies.
4411f3b6 752
47bfe92f
JH
753It would be desirable to have a way to indicate that transform should use
754the encoding's "replacement character" - no such mechanism is defined yet.
4411f3b6
NIS
755
756It is also planned to allow I<CHECK> to be a code reference.
757
47bfe92f
JH
758This is not yet implemented as there are design issues with what its
759arguments should be and how it returns its results.
4411f3b6
NIS
760
761=over 4
762
763=item Scheme 1
764
765Passed remaining fragment of string being processed.
766Modifies it in place to remove bytes/characters it can understand
767and returns a string used to represent them.
768e.g.
769
770 sub fixup {
771 my $ch = substr($_[0],0,1,'');
772 return sprintf("\x{%02X}",ord($ch));
773 }
774
775This scheme is close to how underlying C code for Encode works, but gives
776the fixup routine very little context.
777
778=item Scheme 2
779
47bfe92f
JH
780Passed original string, and an index into it of the problem area, and
781output string so far. Appends what it will to output string and
782returns new index into original string. For example:
4411f3b6
NIS
783
784 sub fixup {
785 # my ($s,$i,$d) = @_;
786 my $ch = substr($_[0],$_[1],1);
787 $_[2] .= sprintf("\x{%02X}",ord($ch));
788 return $_[1]+1;
789 }
790
47bfe92f
JH
791This scheme gives maximal control to the fixup routine but is more
792complicated to code, and may need internals of Encode to be tweaked to
793keep original string intact.
4411f3b6
NIS
794
795=item Other Schemes
796
797Hybrids of above.
798
799Multiple return values rather than in-place modifications.
800
801Index into the string could be pos($str) allowing s/\G...//.
802
803=back
804
805=head2 UTF-8 / utf8
806
807The Unicode consortium defines the UTF-8 standard as a way of encoding
47bfe92f
JH
808the entire Unicode repertoire as sequences of octets. This encoding is
809expected to become very widespread. Perl can use this form internally
810to represent strings, so conversions to and from this form are
811particularly efficient (as octets in memory do not have to change,
812just the meta-data that tells Perl how to treat them).
4411f3b6
NIS
813
814=over 4
815
816=item *
817
818 $bytes = encode_utf8($string);
819
47bfe92f 820The characters that comprise string are encoded in Perl's superset of UTF-8
4411f3b6
NIS
821and the resulting octets returned as a sequence of bytes. All possible
822characters have a UTF-8 representation so this function cannot fail.
823
824=item *
825
826 $string = decode_utf8($bytes [,CHECK]);
827
47bfe92f
JH
828The sequence of octets represented by $bytes is decoded from UTF-8
829into a sequence of logical characters. Not all sequences of octets
830form valid UTF-8 encodings, so it is possible for this call to fail.
831For CHECK see L</"Handling Malformed Data">.
4411f3b6
NIS
832
833=back
834
835=head2 Other Encodings of Unicode
836
47bfe92f 837UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
7a4efbb2 838represent 0..0xFFFF, while UTF-16 has a I<surrogate pair> scheme which
47bfe92f 839allows it to cover the whole Unicode range.
4411f3b6 840
7a4efbb2
JH
841Surrogates are code points set aside to encode the 0x010000..0x10FFFF
842range of Unicode code points in pairs of 16-bit units. The I<high
843surrogates> are the range 0xD800..0xDBFF, and the I<low surrogates>
844are the range 0xDC00..0xDFFF. The surrogate encoding is
845
846 $hi = ($uni - 0x10000) / 0x400 + 0xD800;
847 $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
848
849and the decoding is
850
851 $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
852
8040349a 853Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
47bfe92f
JH
854happens to be the name used by that representation when used with X11
855fonts.
4411f3b6
NIS
856
857UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
858can be considered as being in this form without encoding. An encoding
47bfe92f
JH
859to transfer strings in this form (e.g. to write them to a file) would
860need to
4411f3b6 861
c079d275 862 pack('L*', unpack('U*', $string)); # native
4411f3b6 863 or
c079d275 864 pack('V*', unpack('U*', $string)); # little-endian
4411f3b6 865 or
c079d275 866 pack('N*', unpack('U*', $string)); # big-endian
4411f3b6 867
c079d275 868depending on the endianness required.
4411f3b6 869
51ef4e11 870No UTF-32 encodings are implemented yet.
4411f3b6 871
47bfe92f
JH
872Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
873representing the code point 0xFEFF as the very first thing in a file.
4411f3b6 874
51ef4e11
NIS
875=head2 Listing available encodings
876
877 use Encode qw(encodings);
878 @list = encodings();
879
880Returns a list of the canonical names of the available encodings.
881
882=head2 Defining Aliases
883
884 use Encode qw(define_alias);
885 define_alias( newName => ENCODING);
886
47bfe92f
JH
887Allows newName to be used as an alias for ENCODING. ENCODING may be
888either the name of an encoding or an encoding object (as above).
51ef4e11
NIS
889
890Currently I<newName> can be specified in the following ways:
891
892=over 4
893
894=item As a simple string.
895
896=item As a qr// compiled regular expression, e.g.:
897
898 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
899
47bfe92f
JH
900In this case if I<ENCODING> is not a reference it is C<eval>-ed to
901allow C<$1> etc. to be substituted. The example is one way to alias
902names as used in X11 font names to the MIME names for the iso-8859-*
903family.
51ef4e11
NIS
904
905=item As a code reference, e.g.:
906
907 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
908
909In this case C<$_> will be set to the name that is being looked up and
47bfe92f
JH
910I<ENCODING> is passed to the sub as its first argument. The example
911is another way to alias names as used in X11 font names to the MIME
912names for the iso-8859-* family.
51ef4e11
NIS
913
914=back
915
916=head2 Defining Encodings
917
e9692b5b
JH
918 use Encode qw(define_encoding);
919 define_encoding( $object, 'canonicalName' [,alias...]);
51ef4e11 920
47bfe92f
JH
921Causes I<canonicalName> to be associated with I<$object>. The object
922should provide the interface described in L</"IMPLEMENTATION CLASSES">
923below. If more than two arguments are provided then additional
924arguments are taken as aliases for I<$object> as for C<define_alias>.
51ef4e11 925
4411f3b6
NIS
926=head1 Encoding and IO
927
928It is very common to want to do encoding transformations when
929reading or writing files, network connections, pipes etc.
47bfe92f 930If Perl is configured to use the new 'perlio' IO system then
4411f3b6
NIS
931C<Encode> provides a "layer" (See L<perliol>) which can transform
932data as it is read or written.
933
8e86646e
JH
934Here is how the blind poet would modernise the encoding:
935
42234700 936 use Encode;
8e86646e
JH
937 open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
938 open(my $utf8,'>:utf8','iliad.utf8');
939 my @epic = <$iliad>;
940 print $utf8 @epic;
941 close($utf8);
942 close($iliad);
4411f3b6
NIS
943
944In addition the new IO system can also be configured to read/write
945UTF-8 encoded characters (as noted above this is efficient):
946
e9692b5b
JH
947 open(my $fh,'>:utf8','anything');
948 print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
4411f3b6
NIS
949
950Either of the above forms of "layer" specifications can be made the default
951for a lexical scope with the C<use open ...> pragma. See L<open>.
952
953Once a handle is open its layers can be altered using C<binmode>.
954
47bfe92f 955Without any such configuration, or if Perl itself is built using
4411f3b6
NIS
956system's own IO, then write operations assume that file handle accepts
957only I<bytes> and will C<die> if a character larger than 255 is
958written to the handle. When reading, each octet from the handle
959becomes a byte-in-a-character. Note that this default is the same
47bfe92f
JH
960behaviour as bytes-only languages (including Perl before v5.6) would
961have, and is sufficient to handle native 8-bit encodings
962e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
963other encodings and binary data.
964
965In other cases it is the program's responsibility to transform
966characters into bytes using the API above before doing writes, and to
967transform the bytes read from a handle into characters before doing
968"character operations" (e.g. C<lc>, C</\W+/>, ...).
969
47bfe92f
JH
970You can also use PerlIO to convert larger amounts of data you don't
971want to bring into memory. For example to convert between ISO 8859-1
972(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
973
e9692b5b
JH
974 open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
975 open(G, ">:utf8", "data.utf") or die $!;
976 while (<F>) { print G }
977
978 # Could also do "print G <F>" but that would pull
979 # the whole file into memory just to write it out again.
980
981More examples:
47bfe92f 982
e9692b5b
JH
983 open(my $f, "<:encoding(cp1252)")
984 open(my $g, ">:encoding(iso-8859-2)")
985 open(my $h, ">:encoding(latin9)") # iso-8859-15
47bfe92f
JH
986
987See L<PerlIO> for more information.
4411f3b6 988
1768d7eb 989See also L<encoding> for how to change the default encoding of the
d521382b 990data in your script.
1768d7eb 991
4411f3b6
NIS
992=head1 Encoding How to ...
993
994To do:
995
996=over 4
997
998=item * IO with mixed content (faking iso-2022-*)
999
1000=item * MIME's Content-Length:
1001
1002=item * UTF-8 strings in binary data.
1003
47bfe92f 1004=item * Perl/Encode wrappers on non-Unicode XS modules.
4411f3b6
NIS
1005
1006=back
1007
1008=head1 Messing with Perl's Internals
1009
47bfe92f
JH
1010The following API uses parts of Perl's internals in the current
1011implementation. As such they are efficient, but may change.
4411f3b6
NIS
1012
1013=over 4
1014
4411f3b6
NIS
1015=item * is_utf8(STRING [, CHECK])
1016
1017[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
47bfe92f
JH
1018If CHECK is true, also checks the data in STRING for being well-formed
1019UTF-8. Returns true if successful, false otherwise.
4411f3b6
NIS
1020
1021=item * valid_utf8(STRING)
1022
47bfe92f
JH
1023[INTERNAL] Test whether STRING is in a consistent state. Will return
1024true if string is held as bytes, or is well-formed UTF-8 and has the
1025UTF-8 flag on. Main reason for this routine is to allow Perl's
1026testsuite to check that operations have left strings in a consistent
1027state.
4411f3b6
NIS
1028
1029=item *
1030
1031 _utf8_on(STRING)
1032
1033[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
1034B<not> checked for being well-formed UTF-8. Do not use unless you
1035B<know> that the STRING is well-formed UTF-8. Returns the previous
1036state of the UTF-8 flag (so please do I<not> treat the return value as
1037indicating success or failure), or C<undef> if STRING is not a string.
1038
1039=item *
1040
1041 _utf8_off(STRING)
1042
1043[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
1044Returns the previous state of the UTF-8 flag (so please do I<not> treat the
1045return value as indicating success or failure), or C<undef> if STRING is
1046not a string.
1047
1048=back
1049
4edaa979
NIS
1050=head1 IMPLEMENTATION CLASSES
1051
1052As mentioned above encodings are (in the current implementation at least)
1053defined by objects. The mapping of encoding name to object is via the
51ef4e11 1054C<%encoding> hash.
4edaa979
NIS
1055
1056The values of the hash can currently be either strings or objects.
1057The string form may go away in the future. The string form occurs
1058when C<encodings()> has scanned C<@INC> for loadable encodings but has
1059not actually loaded the encoding in question. This is because the
47bfe92f 1060current "loading" process is all Perl and a bit slow.
4edaa979 1061
47bfe92f
JH
1062Once an encoding is loaded, the value of the hash is the object which
1063implements the encoding. The object should provide the following
1064interface:
4edaa979
NIS
1065
1066=over 4
1067
1068=item -E<gt>name
1069
1070Should return the string representing the canonical name of the encoding.
1071
1072=item -E<gt>new_sequence
1073
47bfe92f
JH
1074This is a placeholder for encodings with state. It should return an
1075object which implements this interface, all current implementations
1076return the original object.
4edaa979
NIS
1077
1078=item -E<gt>encode($string,$check)
1079
47bfe92f
JH
1080Should return the octet sequence representing I<$string>. If I<$check>
1081is true it should modify I<$string> in place to remove the converted
1082part (i.e. the whole string unless there is an error). If an error
1083occurs it should return the octet sequence for the fragment of string
1084that has been converted, and modify $string in-place to remove the
1085converted part leaving it starting with the problem fragment.
4edaa979 1086
47bfe92f
JH
1087If check is false then C<encode> should make a "best effort" to
1088convert the string - for example by using a replacement character.
4edaa979
NIS
1089
1090=item -E<gt>decode($octets,$check)
1091
47bfe92f
JH
1092Should return the string that I<$octets> represents. If I<$check> is
1093true it should modify I<$octets> in place to remove the converted part
1094(i.e. the whole sequence unless there is an error). If an error
1095occurs it should return the fragment of string that has been
1096converted, and modify $octets in-place to remove the converted part
4edaa979
NIS
1097leaving it starting with the problem fragment.
1098
47bfe92f
JH
1099If check is false then C<decode> should make a "best effort" to
1100convert the string - for example by using Unicode's "\x{FFFD}" as a
1101replacement character.
4edaa979
NIS
1102
1103=back
1104
47bfe92f
JH
1105It should be noted that the check behaviour is different from the
1106outer public API. The logic is that the "unchecked" case is useful
1107when encoding is part of a stream which may be reporting errors
1108(e.g. STDERR). In such cases it is desirable to get everything
1109through somehow without causing additional errors which obscure the
1110original one. Also the encoding is best placed to know what the
1111correct replacement character is, so if that is the desired behaviour
1112then letting low level code do it is the most efficient.
1113
1114In contrast if check is true, the scheme above allows the encoding to
1115do as much as it can and tell the layer above how much that was. What is
1116lacking at present is a mechanism to report what went wrong. The most
1117likely interface will be an additional method call to the object, or
1118perhaps (to avoid forcing per-stream objects on otherwise stateless
1119encodings) an additional parameter.
1120
1121It is also highly desirable that encoding classes inherit from
1122C<Encode::Encoding> as a base class. This allows that class to define
1123additional behaviour for all encoding objects. For example built in
1124Unicode, UCS-2 and UTF-8 classes use:
51ef4e11
NIS
1125
1126 package Encode::MyEncoding;
1127 use base qw(Encode::Encoding);
1128
1129 __PACKAGE__->Define(qw(myCanonical myAlias));
1130
47bfe92f
JH
1131This creates an object with bless {Name => ...},$class, and calls
1132define_encoding. They inherit their C<name> method from
1133C<Encode::Encoding>.
4edaa979
NIS
1134
1135=head2 Compiled Encodings
1136
47bfe92f
JH
1137F<Encode.xs> provides a class C<Encode::XS> which provides the
1138interface described above. It calls a generic octet-sequence to
1139octet-sequence "engine" that is driven by tables (defined in
1140F<encengine.c>). The same engine is used for both encode and
1141decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
1142UTF-8 form and then treats them as just another multibyte
1143encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
1144turns the UTF-8-ness flag on, as that is the form that the tables are
1145defined to produce. For details of the engine see the comments in
1146F<encengine.c>.
1147
1148The tables are produced by the Perl script F<compile> (the name needs
1149to change so we can eventually install it somewhere). F<compile> can
1150currently read two formats:
4edaa979
NIS
1151
1152=over 4
1153
1154=item *.enc
1155
47bfe92f
JH
1156This is a coined format used by Tcl. It is documented in
1157Encode/EncodeFormat.pod.
4edaa979
NIS
1158
1159=item *.ucm
1160
1161This is the semi-standard format used by IBM's ICU package.
1162
1163=back
1164
1165F<compile> can write the following forms:
1166
1167=over 4
1168
1169=item *.ucm
1170
1171See above - the F<Encode/*.ucm> files provided with the distribution have
1172been created from the original Tcl .enc files using this approach.
1173
1174=item *.c
1175
1176Produces tables as C data structures - this is used to build encodings
1177into F<Encode.so>/F<Encode.dll>.
1178
1179=item *.xs
1180
47bfe92f
JH
1181In theory this allows encodings to be stand-alone loadable Perl
1182extensions. The process has not yet been tested. The plan is to use
1183this approach for large East Asian encodings.
4edaa979
NIS
1184
1185=back
1186
47bfe92f
JH
1187The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
1188determined by F<Makefile.PL>. The current set is as follows:
4edaa979
NIS
1189
1190=over 4
1191
1192=item ascii and iso-8859-*
1193
1194That is all the common 8-bit "western" encodings.
1195
1196=item IBM-1047 and two other variants of EBCDIC.
1197
47bfe92f
JH
1198These are the same variants that are supported by EBCDIC Perl as
1199"native" encodings. They are included to prove "reversibility" of
1200some constructs in EBCDIC Perl.
4edaa979
NIS
1201
1202=item symbol and dingbats as used by Tk on X11.
1203
47bfe92f 1204(The reason Encode got started was to support Perl/Tk.)
4edaa979
NIS
1205
1206=back
1207
47bfe92f
JH
1208That set is rather ad hoc and has been driven by the needs of the
1209tests rather than the needs of typical applications. It is likely
1210to be rationalized.
4edaa979 1211
4411f3b6
NIS
1212=head1 SEE ALSO
1213
1768d7eb 1214L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>
4411f3b6
NIS
1215
1216=cut
1217