perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	package Encode;
	2	use strict;
	3
	4	our $VERSION = '0.02';
	5
	6	require DynaLoader;
	7	require Exporter;
	8
	9	our @ISA = qw(Exporter DynaLoader);
	10
	11	# Public, encouraged API is exported by default
	12	our @EXPORT = qw (
	13	encode
	14	decode
	15	encode_utf8
	16	decode_utf8
	17	find_encoding
	18	encodings
	19	);
	20
	21	our @EXPORT_OK =
	22	qw(
	23	define_encoding
	24	define_alias
	25	from_to
	26	is_utf8
	27	is_8bit
	28	is_16bit
	29	utf8_upgrade
	30	utf8_downgrade
	31	_utf8_on
	32	_utf8_off
	33	);
	34
	35	bootstrap Encode ();
	36
	37	# Documentation moved after __END__ for speed - NI-S
	38
	39	use Carp;
	40
	41	# Make a %encoding package variable to allow a certain amount of cheating
	42	our %encoding;
	43	my @alias; # ordered matching list
	44	my %alias; # cached known aliases
	45	# 0 1 2 3 4 5 6 7 8 9 10
	46	our @latin2iso_num = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
	47
	48
	# Return the canonical names of every encoding registered so far,
	# i.e. the keys of %encoding (in hash order; a count in scalar context).
	# Any arguments, such as a class name, are accepted and ignored.
	sub encodings {
	    return keys %encoding;
	}
	54
	# findAlias(CLASS, NAME)
	#
	# Resolve NAME through the ordered alias list (@alias) and return the
	# encoding object it maps to, or undef when nothing matches.  Successful
	# lookups are remembered in %alias; failures are not cached.
	#
	# Each @alias entry is a (key, target) pair; the key may be:
	#   * a qr// object  - on a match the target is string-eval'ed, so it can
	#                      refer to $1, $2, ... from that match;
	#   * a code ref     - called with the target as its only argument while
	#                      $_ holds the name being looked up;
	#   * anything else  - compared with NAME case-insensitively.
	sub findAlias {
	    my $class = shift;
	    local $_ = shift;    # name under lookup; visible to code refs and evals

	    return $alias{$_} if exists $alias{$_};

	    my $idx = 0;
	    while ($idx < @alias) {
	        my ($key, $target) = @alias[$idx, $idx + 1];
	        $idx += 2;

	        my $candidate;
	        if (ref($key) eq 'Regexp' && $_ =~ $key) {
	            # Captures from the match above are still live for the eval.
	            $candidate = eval $target;
	        }
	        elsif (ref($key) eq 'CODE') {
	            $candidate = &{$key}($target);
	        }
	        elsif (lc($_) eq lc($key)) {
	            $candidate = $target;
	        }

	        next unless defined $candidate;
	        next if $candidate eq $_;    # avoid (direct) recursion on bugs

	        # A reference is taken to be the encoding object itself; a plain
	        # string is resolved again, which may recurse back into this sub.
	        my $enc = ref($candidate) ? $candidate : find_encoding($candidate);
	        next unless $enc;

	        $alias{$_} = $enc;
	        last;
	    }

	    return $alias{$_};
	}
	92
	# define_alias(ALIAS => TARGET, ...)
	#
	# Append one or more (alias, target) pairs to the ordered alias list that
	# findAlias() walks.  An odd trailing alias is stored with an undefined
	# target.  The return value is not meaningful.
	sub define_alias {
	    my @pending = @_;
	    while (@pending) {
	        my $alias = shift @pending;
	        my $name  = shift @pending;
	        push @alias, $alias, $name;
	    }
	}
	101
	# Built-in alias table.  Order matters: findAlias() tries the entries in
	# the order they are registered here.
	#
	# NOTE: when the alias is a qr// object, findAlias() string-evals the
	# target, so such targets must be Perl *source* for a string, i.e. they
	# carry their own inner double quotes ('"iso-8859-$1"').  Plain string
	# aliases are used as-is and need no inner quotes.

	# Allow variants of iso-8859-1 etc.
	define_alias( qr/^iso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

	# At least HP-UX has these.
	define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );

	# This is a font issue, not an encoding issue.
	# (The currency symbol of the Latin 1 upper half
	# has been redefined as the euro symbol.)
	define_alias( qr/^(.+)\@euro$/i => '"$1"' );

	# Allow latin-1 style names as well
	define_alias( qr/^(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$latin2iso_num[$1]"' );

	# Common names for non-latin preferred MIME names
	define_alias(
	    'ascii'    => 'US-ascii',
	    'cyrillic' => 'iso-8859-5',
	    'arabic'   => 'iso-8859-6',
	    'greek'    => 'iso-8859-7',
	    'hebrew'   => 'iso-8859-8',
	);

	# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
	define_alias( qr/^ibm[-_]?(\d\d\d\d?)$/i => '"cp$1"' );

	# Standardize on the dashed versions.
	# The targets are quoted inside the string: evaluated bare, utf-8 and
	# koi8-r would parse as barewords and a subtraction, not as names.
	define_alias( qr/^utf8$/i  => '"utf-8"' );
	define_alias( qr/^koi8r$/i => '"koi8-r"' );

	# TODO: the HP-UX '8' encodings: arabic8 greek8 hebrew8 roman8 turkish8
	# TODO: the Thai Encoding tis620
	# TODO: the Chinese Encoding gb18030
	# TODO: what is the Japanese 'ujis' encoding seen in some Linuxes?

	# Map white space and _ to '-'
	define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
	137
	# define_encoding(OBJ, NAME [, ALIAS ...])
	#
	# Register OBJ in %encoding under the canonical NAME.  When NAME is not
	# already all lower case its lower-cased form is added as an alias, and
	# every extra argument becomes a further alias for OBJ.  Returns OBJ.
	sub define_encoding {
	    my ($obj, $name, @aliases) = @_;

	    $encoding{$name} = $obj;

	    my $lower = lc $name;
	    define_alias($lower => $obj) if $lower ne $name;

	    for my $alias (@aliases) {
	        define_alias($alias, $obj);
	    }

	    return $obj;
	}
	152
	# getEncoding(CLASS, NAME)
	#
	# Turn NAME into an encoding object:
	#   * a reference whose class provides new_sequence() is taken to be an
	#     encoding object already and is returned unchanged;
	#   * otherwise NAME is looked up in %encoding (canonical names);
	#   * failing that, the lookup is delegated to CLASS->findAlias(NAME).
	# Returns whatever findAlias() returns when nothing else matches.
	sub getEncoding {
	    my ($class, $name) = @_;

	    if (ref $name) {
	        # Calling ->can on an unblessed reference is fatal, so probe under
	        # eval; such references simply fall through to the name lookup.
	        my $is_encoding = do {
	            local $@;
	            eval { $name->can('new_sequence') };
	        };
	        return $name if $is_encoding;
	    }

	    return $encoding{$name} if exists $encoding{$name};
	    return $class->findAlias($name);
	}
	170
	# find_encoding(NAME)
	#
	# Functional wrapper: hands NAME to getEncoding() with the current
	# package as the class and returns its result.
	sub find_encoding {
	    my $name = $_[0];
	    return __PACKAGE__->getEncoding($name);
	}
	176
	# encode(ENCODING, STRING [, CHECK])
	#
	# Convert STRING to octets in ENCODING (a name or an encoding object).
	# Croaks if the encoding cannot be found.  With a true CHECK, returns
	# undef when the encoding object left part of the string unconverted.
	sub encode {
	    my ($name, $string, $check) = @_;

	    my $enc = find_encoding($name);
	    defined $enc or croak("Unknown encoding '$name'");

	    # In checked mode the object removes what it converted from $string.
	    my $octets = $enc->encode($string, $check);
	    return ($check && length($string)) ? undef : $octets;
	}
	186
	# decode(ENCODING, OCTETS [, CHECK])
	#
	# Convert OCTETS, assumed to be in ENCODING (a name or an encoding
	# object), into a Perl string.  Croaks if the encoding cannot be found.
	# With a true CHECK, returns undef when the encoding object left part of
	# the octets unconverted.
	sub decode {
	    my ($name, $octets, $check) = @_;

	    my $enc = find_encoding($name);
	    defined $enc or croak("Unknown encoding '$name'");

	    # In checked mode the object removes what it converted from $octets.
	    my $string = $enc->decode($octets, $check);
	    return ($check && length($octets)) ? undef : $string;
	}
	196
	# from_to(STRING, FROM, TO [, CHECK])
	#
	# Re-encode STRING in place from encoding FROM to encoding TO by decoding
	# it and encoding the result again.  Croaks if either encoding is
	# unknown.  With a true CHECK, returns undef (leaving the caller's
	# variable untouched) when either step left data unconverted.  On
	# success the caller's variable is overwritten and its new length is
	# returned.
	sub from_to {
	    my ($string, $from, $to, $check) = @_;

	    my $source = find_encoding($from);
	    croak("Unknown encoding '$from'") unless defined $source;
	    my $target = find_encoding($to);
	    croak("Unknown encoding '$to'") unless defined $target;

	    my $chars = $source->decode($string, $check);
	    return undef if $check && length($string);

	    my $converted = $target->encode($chars, $check);
	    return undef if $check && length($chars);

	    $_[0] = $converted;    # write back through the alias to the caller
	    return length($_[0]);
	}
	210
	# encode_utf8(STRING)
	#
	# Return a copy of STRING converted to its UTF-8 octet sequence via
	# utf8::encode.  The caller's variable is not modified.
	sub encode_utf8 {
	    my $octets = shift;
	    utf8::encode($octets);
	    return $octets;
	}
	217
	# decode_utf8(OCTETS)
	#
	# Return a copy of OCTETS decoded from UTF-8 via utf8::decode, or undef
	# when utf8::decode reports failure.  The caller's variable is not
	# modified.
	sub decode_utf8 {
	    my $chars = shift;
	    return utf8::decode($chars) ? $chars : undef;
	}
	224
	package Encode::Encoding;
	# Base class for classes which implement encodings

	# Define(CLASS_OR_OBJECT, CANONICAL [, ALIAS ...])
	#
	# When invoked on a class name, blesses a fresh { Name => CANONICAL }
	# hash into that class; an existing object is used as-is.  The object is
	# then registered through Encode::define_encoding under CANONICAL plus
	# any aliases, and that call's result is returned.
	sub Define {
	    my ($invocant, $canonical, @aliases) = @_;
	    my $obj = ref($invocant)
	        ? $invocant
	        : bless({ Name => $canonical }, $invocant);
	    # warn "$canonical => $obj\n";
	    return Encode::define_encoding($obj, $canonical, @aliases);
	}

	# Canonical name stored in the object's 'Name' slot.
	sub name {
	    my ($self) = @_;
	    return $self->{'Name'};
	}

	# Temporary legacy methods: forward to decode/encode.  @_ is handed on
	# as-is so in-place updates of the data argument reach the caller.
	sub toUnicode {
	    my $self = shift;
	    return $self->decode(@_);
	}

	sub fromUnicode {
	    my $self = shift;
	    return $self->encode(@_);
	}

	# Returns the invocant itself.
	sub new_sequence {
	    my ($self) = @_;
	    return $self;
	}
	244
	245	package Encode::XS;
	246	use base 'Encode::Encoding';
	247
	248	package Encode::Internal;
	249	use base 'Encode::Encoding';
	250
	251	# Dummy package that provides the encode interface but leaves data
	252	# as UTF-X encoded. It is here so that from_to() works.
	253
	254	__PACKAGE__->Define('Internal');
	255
	256	Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
	257
	# decode(OBJ, STRING [, CHECK])  (also installed as encode, see below)
	#
	# Returns a copy of STRING after utf8::upgrade; the characters themselves
	# are not changed.  With a true CHECK the caller's STRING is emptied to
	# signal that everything was consumed.
	sub decode {
	    my $text  = $_[1];
	    my $check = $_[2];
	    utf8::upgrade($text);
	    if ($check) {
	        $_[1] = '';
	    }
	    return $text;
	}
	265
	266	*encode = \&decode;
	267
	268	package Encoding::Unicode;
	269	use base 'Encode::Encoding';
	270
	271	__PACKAGE__->Define('Unicode') unless ord('A') == 65;
	272
	# decode(OBJ, STRING [, CHECK])
	#
	# Maps every character of STRING through utf8::unicode_to_native and
	# returns the resulting string.  With a true CHECK the caller's STRING is
	# emptied to signal that everything was consumed.
	# NOTE(review): the enclosing package is spelled Encoding::Unicode while
	# its siblings live under Encode:: - confirm whether that is intended.
	sub decode {
	    my ($obj, $str, $chk) = @_;
	    my $res = join '',
	        map { chr(utf8::unicode_to_native(ord($_))) } split //, $str;
	    $_[1] = '' if $chk;
	    return $res;
	}
	284
	# encode(OBJ, STRING [, CHECK])
	#
	# Maps every character of STRING through utf8::native_to_unicode and
	# returns the resulting string.  With a true CHECK the caller's STRING is
	# emptied to signal that everything was consumed.
	sub encode {
	    my ($obj, $str, $chk) = @_;
	    my $res = join '',
	        map { chr(utf8::native_to_unicode(ord($_))) } split //, $str;
	    $_[1] = '' if $chk;
	    return $res;
	}
	296
	297
	298	package Encode::utf8;
	299	use base 'Encode::Encoding';
	300	# package to allow long-hand
	301	# $octets = encode( utf8 => $string );
	302	#
	303
	304	__PACKAGE__->Define(qw(UTF-8 utf8));
	305
	# decode(OBJ, OCTETS [, CHECK])
	#
	# Decodes OCTETS with Encode::decode_utf8.  When that fails, returns
	# undef and leaves the caller's OCTETS alone; otherwise returns the
	# decoded string and, with a true CHECK, empties the caller's OCTETS.
	sub decode {
	    my ($obj, $octets, $chk) = @_;
	    my $str = Encode::decode_utf8($octets);
	    return undef unless defined $str;
	    $_[1] = '' if $chk;
	    return $str;
	}
	317
	# encode(OBJ, STRING [, CHECK])
	#
	# Returns STRING converted by Encode::encode_utf8.  With a true CHECK the
	# caller's STRING is emptied to signal that everything was consumed.
	sub encode {
	    my ($obj, $string, $chk) = @_;
	    my $octets = Encode::encode_utf8($string);
	    if ($chk) {
	        $_[1] = '';
	    }
	    return $octets;
	}
	325
	326	package Encode::iso10646_1;
	327	use base 'Encode::Encoding';
	328	# Encoding is 16-bit network order Unicode (no surrogates)
	329	# Used for X font encodings
	330
	331	__PACKAGE__->Define(qw(UCS-2 iso-10646-1));
	332
	# decode(OBJ, OCTETS [, CHECK])
	#
	# Decodes big-endian 16-bit units ('n' in pack terms) into characters
	# and returns the resulting string, upgraded with utf8::upgrade.
	#
	# With a true CHECK the caller's OCTETS is replaced by whatever could not
	# be converted: empty on success, or the dangling final octet when the
	# input has odd length.  Without CHECK a dangling octet is decoded
	# best-effort as chr(0), as before.
	sub decode {
	    my ($obj, $str, $chk) = @_;
	    my $uni = '';
	    while (length($str)) {
	        # Checked mode: stop in front of an incomplete unit so the caller
	        # can see that one octet was left over.
	        last if $chk && length($str) < 2;

	        my $unit = substr($str, 0, 2, '');
	        my $code = length($unit) == 2 ? unpack('n', $unit) : 0;
	        $uni .= chr($code);
	    }
	    $_[1] = $str if $chk;
	    utf8::upgrade($uni);
	    return $uni;
	}
	346
	# encode(OBJ, STRING [, CHECK])
	#
	# Encodes each character of STRING as one big-endian 16-bit unit
	# (pack 'n') and returns the octets.  Every code point up to 0xFFFF fits
	# in one unit; anything above does not.
	#
	# With a true CHECK, conversion stops in front of the first character
	# that does not fit and the caller's STRING is replaced by the
	# unconverted rest, which begins with that character (empty on success).
	# Without CHECK such characters are encoded best-effort as code 0.
	sub encode {
	    my ($obj, $uni, $chk) = @_;
	    my $str = '';
	    while (length($uni)) {
	        my $x = ord($uni);    # code point of the first remaining character
	        if ($x > 0xFFFF) {
	            # Bail out before consuming the character, so that it stays at
	            # the front of what is reported back.
	            last if $chk;
	            $x = 0;
	        }
	        substr($uni, 0, 1, '');
	        $str .= pack('n', $x);
	    }
	    $_[1] = $uni if $chk;
	    return $str;
	}
	365
	366	# switch back to Encode package in case we ever add AutoLoader
	367	package Encode;
	368
	369	1;
	370
	371	__END__
	372
	373	=head1 NAME
	374
	375	Encode - character encodings
	376
	377	=head1 SYNOPSIS
	378
	379	use Encode;
	380
	381	=head1 DESCRIPTION
	382
	383	The C<Encode> module provides the interfaces between Perl's strings
	384	and the rest of the system. Perl strings are sequences of B<characters>.
	385
	386	The repertoire of characters that Perl can represent is at least that
	387	defined by the Unicode Consortium. On most platforms the ordinal
	388	values of the characters (as returned by C<ord(ch)>) is the "Unicode
	389	codepoint" for the character (the exceptions are those platforms where
	390	the legacy encoding is some variant of EBCDIC rather than a super-set
	391	of ASCII - see L<perlebcdic>).
	392
	393	Traditionally computer data has been moved around in 8-bit chunks
	394	often called "bytes". These chunks are also known as "octets" in
	395	networking standards. Perl is widely used to manipulate data of
	396	many types - not only strings of characters representing human or
	397	computer languages but also "binary" data being the machine's representation
	398	of numbers, pixels in an image - or just about anything.
	399
	400	When Perl is processing "binary data" the programmer wants Perl to process
	401	"sequences of bytes". This is not a problem for Perl - as a byte has 256
	402	possible values it easily fits in Perl's much larger "logical character".
	403
	404	=head2 TERMINOLOGY
	405
	406	=over 4
	407
	408	=item *
	409
	410	I<character>: a character in the range 0..(2**32-1) (or more).
	411	(What Perl's strings are made of.)
	412
	413	=item *
	414
	415	I<byte>: a character in the range 0..255
	416	(A special case of a Perl character.)
	417
	418	=item *
	419
	420	I<octet>: 8 bits of data, with ordinal values 0..255
	421	(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
	422
	423	=back
	424
	425	The marker [INTERNAL] marks Internal Implementation Details, in
	426	general meant only for those who think they know what they are doing,
	427	and such details may change in future releases.
	428
	429	=head1 ENCODINGS
	430
	431	=head2 Characteristics of an Encoding
	432
	433	An encoding has a "repertoire" of characters that it can represent,
	434	and for each representable character there is at least one sequence of
	435	octets that represents it.
	436
	437	=head2 Types of Encodings
	438
	439	Encodings can be divided into the following types:
	440
	441	=over 4
	442
	443	=item * Fixed length 8-bit (or less) encodings.
	444
	445	Each character is a single octet so may have a repertoire of up to
	446	256 characters. ASCII and iso-8859-* are typical examples.
	447
	448	=item * Fixed length 16-bit encodings
	449
	450	Each character is two octets so may have a repertoire of up to
	451	65 536 characters. Unicode's UCS-2 is an example. Also used for
	452	encodings for East Asian languages.
	453
	454	=item * Fixed length 32-bit encodings.
	455
	456	Not really very "encoded" encodings. The Unicode code points
	457	are just represented as 4-octet integers. None the less because
	458	different architectures use different representations of integers
	459	(so called "endian") there are at least two distinct encodings.
	460
	461	=item * Multi-byte encodings
	462
	463	The number of octets needed to represent a character varies.
	464	UTF-8 is a particularly complex but regular case of a multi-byte
	465	encoding. Several East Asian countries use a multi-byte encoding
	466	where 1-octet is used to cover western roman characters and Asian
	467	characters get 2-octets.
	468	(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets
	469	to represent a Unicode code point.)
	470
	471	=item * "Escape" encodings.
	472
	473	These encodings embed "escape sequences" into the octet sequence
	474	which describe how the following octets are to be interpreted.
	475	The iso-2022-* family is typical. Following the escape sequence
	476	octets are encoded by an "embedded" encoding (which will be one
	477	of the above types) until another escape sequence switches to
	478	a different "embedded" encoding.
	479
	480	These schemes are very flexible and can handle mixed languages but are
	481	very complex to process (and have state). No escape encodings are
	482	implemented for Perl yet.
	483
	484	=back
	485
	486	=head2 Specifying Encodings
	487
	488	Encodings can be specified to the API described below in two ways:
	489
	490	=over 4
	491
	492	=item 1. By name
	493
	494	Encoding names are strings with characters taken from a restricted
	495	repertoire. See L</"Encoding Names">.
	496
	497	=item 2. As an object
	498
	499	Encoding objects are returned by C<find_encoding($name)>.
	500
	501	=back
	502
	503	=head2 Encoding Names
	504
	505	Encoding names are case insensitive. White space in names is ignored.
	506	In addition an encoding may have aliases. Each encoding has one
	507	"canonical" name. The "canonical" name is chosen from the names of
	508	the encoding by picking the first in the following sequence:
	509
	510	=over 4
	511
	512	=item * The MIME name as defined in IETF RFC-XXXX.
	513
	514	=item * The name in the IANA registry.
	515
	516	=item * The name used by the organization that defined it.
	517
	518	=back
	519
	520	Because of all the alias issues, and because in the general case
	521	encodings have state C<Encode> uses the encoding object internally
	522	once an operation is in progress.
	523
	524	=head1 PERL ENCODING API
	525
	526	=head2 Generic Encoding Interface
	527
	528	=over 4
	529
	530	=item *
	531
	532	$bytes = encode(ENCODING, $string[, CHECK])
	533
	534	Encodes string from Perl's internal form into I<ENCODING> and returns
	535	a sequence of octets. For CHECK see L</"Handling Malformed Data">.
	536
	537	=item *
	538
	539	$string = decode(ENCODING, $bytes[, CHECK])
	540
	541	Decode sequence of octets assumed to be in I<ENCODING> into Perl's
	542	internal form and returns the resulting string. For CHECK see
	543	L</"Handling Malformed Data">.
	544
	545	=item *
	546
	547	from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
	548
	549	Convert B<in-place> the data between two encodings. How did the data
	550	in $string originally get to be in FROM_ENCODING? Either using
	551	encode() or through PerlIO: See L</"Encoding and IO">. For CHECK
	552	see L</"Handling Malformed Data">.
	553
	554	For example to convert ISO 8859-1 data to UTF-8:
	555
	556	from_to($data, "iso-8859-1", "utf-8");
	557
	558	and to convert it back:
	559
	560	from_to($data, "utf-8", "iso-8859-1");
	561
	562	Note that because the conversion happens in place, the data to be
	563	converted cannot be a string constant, it must be a scalar variable.
	564
	565	=back
	566
	567	=head2 Handling Malformed Data
	568
	569	If CHECK is not set, C<undef> is returned. If the data is supposed to
	570	be UTF-8, an optional lexical warning (category utf8) is given. If
	571	CHECK is true but not a code reference, dies.
	572
	573	It would be desirable to have a way to indicate that transform should use
	574	the encoding's "replacement character" - no such mechanism is defined yet.
	575
	576	It is also planned to allow I<CHECK> to be a code reference.
	577
	578	This is not yet implemented as there are design issues with what its
	579	arguments should be and how it returns its results.
	580
	581	=over 4
	582
	583	=item Scheme 1
	584
	585	Passed remaining fragment of string being processed.
	586	Modifies it in place to remove bytes/characters it can understand
	587	and returns a string used to represent them.
	588	e.g.
	589
	590	sub fixup {
	591	my $ch = substr($_[0],0,1,'');
	592	return sprintf("\x{%02X}",ord($ch));
	593	}
	594
	595	This scheme is close to how underlying C code for Encode works, but gives
	596	the fixup routine very little context.
	597
	598	=item Scheme 2
	599
	600	Passed original string, and an index into it of the problem area, and
	601	output string so far. Appends what it will to output string and
	602	returns new index into original string. For example:
	603
	604	sub fixup {
	605	# my ($s,$i,$d) = @_;
	606	my $ch = substr($_[0],$_[1],1);
	607	$_[2] .= sprintf("\x{%02X}",ord($ch));
	608	return $_[1]+1;
	609	}
	610
	611	This scheme gives maximal control to the fixup routine but is more
	612	complicated to code, and may need internals of Encode to be tweaked to
	613	keep original string intact.
	614
	615	=item Other Schemes
	616
	617	Hybrids of above.
	618
	619	Multiple return values rather than in-place modifications.
	620
	621	Index into the string could be pos($str) allowing s/\G...//.
	622
	623	=back
	624
	625	=head2 UTF-8 / utf8
	626
	627	The Unicode consortium defines the UTF-8 standard as a way of encoding
	628	the entire Unicode repertoire as sequences of octets. This encoding is
	629	expected to become very widespread. Perl can use this form internally
	630	to represent strings, so conversions to and from this form are
	631	particularly efficient (as octets in memory do not have to change,
	632	just the meta-data that tells Perl how to treat them).
	633
	634	=over 4
	635
	636	=item *
	637
	638	$bytes = encode_utf8($string);
	639
	640	The characters that comprise string are encoded in Perl's superset of UTF-8
	641	and the resulting octets returned as a sequence of bytes. All possible
	642	characters have a UTF-8 representation so this function cannot fail.
	643
	644	=item *
	645
	646	$string = decode_utf8($bytes [,CHECK]);
	647
	648	The sequence of octets represented by $bytes is decoded from UTF-8
	649	into a sequence of logical characters. Not all sequences of octets
	650	form valid UTF-8 encodings, so it is possible for this call to fail.
	651	For CHECK see L</"Handling Malformed Data">.
	652
	653	=back
	654
	655	=head2 Other Encodings of Unicode
	656
	657	UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. UCS-2 can only
	658	represent 0..0xFFFF, while UTF-16 has a "surrogate pair" scheme which
	659	allows it to cover the whole Unicode range.
	660
	661	Encode implements big-endian UCS-2 aliased to "iso-10646-1" as that
	662	happens to be the name used by that representation when used with X11
	663	fonts.
	664
	665	UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters
	666	can be considered as being in this form without encoding. An encoding
	667	to transfer strings in this form (e.g. to write them to a file) would
	668	need to
	669
	670	pack('L*',map(ord($_),split(//,$string))); # native
	671	or
	672	pack('V*',map(ord($_),split(//,$string))); # little-endian
	673	or
	674	pack('N*',map(ord($_),split(//,$string))); # big-endian
	675
	676	depending on the endian required.
	677
	678	No UTF-32 encodings are implemented yet.
	679
	680	Both UCS-2 and UCS-4 style encodings can have "byte order marks" by
	681	representing the code point 0xFEFF as the very first thing in a file.
	682
	683	=head2 Listing available encodings
	684
	685	use Encode qw(encodings);
	686	@list = encodings();
	687
	688	Returns a list of the canonical names of the available encodings.
	689
	690	=head2 Defining Aliases
	691
	692	use Encode qw(define_alias);
	693	define_alias( newName => ENCODING);
	694
	695	Allows newName to be used as an alias for ENCODING. ENCODING may be
	696	either the name of an encoding or an encoding object (as above).
	697
	698	Currently I<newName> can be specified in the following ways:
	699
	700	=over 4
	701
	702	=item As a simple string.
	703
	704	=item As a qr// compiled regular expression, e.g.:
	705
	706	define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
	707
	708	In this case if I<ENCODING> is not a reference it is C<eval>-ed to
	709	allow C<$1> etc. to be substituted. The example is one way to alias
	710	names as used in X11 font names to the MIME names for the iso-8859-*
	711	family.
	712
	713	=item As a code reference, e.g.:
	714
	715	define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
	716
	717	In this case C<$_> will be set to the name that is being looked up and
	718	I<ENCODING> is passed to the sub as its first argument. The example
	719	is another way to alias names as used in X11 font names to the MIME
	720	names for the iso-8859-* family.
	721
	722	=back
	723
	724	=head2 Defining Encodings
	725
	726	use Encode qw(define_encoding);
	727	define_encoding( $object, 'canonicalName' [,alias...]);
	728
	729	Causes I<canonicalName> to be associated with I<$object>. The object
	730	should provide the interface described in L</"IMPLEMENTATION CLASSES">
	731	below. If more than two arguments are provided then additional
	732	arguments are taken as aliases for I<$object> as for C<define_alias>.
	733
	734	=head1 Encoding and IO
	735
	736	It is very common to want to do encoding transformations when
	737	reading or writing files, network connections, pipes etc.
	738	If Perl is configured to use the new 'perlio' IO system then
	739	C<Encode> provides a "layer" (See L<perliol>) which can transform
	740	data as it is read or written.
	741
	742	use Encode;
	743	open(my $ilyad,'>:encoding(iso-8859-7)','ilyad.greek');
	744	print $ilyad @epic;
	745
	746	In addition the new IO system can also be configured to read/write
	747	UTF-8 encoded characters (as noted above this is efficient):
	748
	749	open(my $fh,'>:utf8','anything');
	750	print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
	751
	752	Either of the above forms of "layer" specifications can be made the default
	753	for a lexical scope with the C<use open ...> pragma. See L<open>.
	754
	755	Once a handle is open its layers can be altered using C<binmode>.
	756
	757	Without any such configuration, or if Perl itself is built using
	758	system's own IO, then write operations assume that file handle accepts
	759	only I<bytes> and will C<die> if a character larger than 255 is
	760	written to the handle. When reading, each octet from the handle
	761	becomes a byte-in-a-character. Note that this default is the same
	762	behaviour as bytes-only languages (including Perl before v5.6) would
	763	have, and is sufficient to handle native 8-bit encodings
	764	e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
	765	other encodings and binary data.
	766
	767	In other cases it is the program's responsibility to transform
	768	characters into bytes using the API above before doing writes, and to
	769	transform the bytes read from a handle into characters before doing
	770	"character operations" (e.g. C<lc>, C</\W+/>, ...).
	771
	772	You can also use PerlIO to convert larger amounts of data you don't
	773	want to bring into memory. For example to convert between ISO 8859-1
	774	(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
	775
	776	open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
	777	open(G, ">:utf8", "data.utf") or die $!;
	778	while (<F>) { print G }
	779
	780	# Could also do "print G <F>" but that would pull
	781	# the whole file into memory just to write it out again.
	782
	783	More examples:
	784
	785	open(my $f, "<:encoding(cp1252)")
	786	open(my $g, ">:encoding(iso-8859-2)")
	787	open(my $h, ">:encoding(latin9)") # iso-8859-15
	788
	789	See L<PerlIO> for more information.
	790
	791	=head1 Encoding How to ...
	792
	793	To do:
	794
	795	=over 4
	796
	797	=item * IO with mixed content (faking iso-2022-*)
	798
	799	=item * MIME's Content-Length:
	800
	801	=item * UTF-8 strings in binary data.
	802
	803	=item * Perl/Encode wrappers on non-Unicode XS modules.
	804
	805	=back
	806
	807	=head1 Messing with Perl's Internals
	808
	809	The following API uses parts of Perl's internals in the current
	810	implementation. As such they are efficient, but may change.
	811
	812	=over 4
	813
	814	=item * is_utf8(STRING [, CHECK])
	815
	816	[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
	817	If CHECK is true, also checks the data in STRING for being well-formed
	818	UTF-8. Returns true if successful, false otherwise.
	819
	820	=item * valid_utf8(STRING)
	821
	822	[INTERNAL] Test whether STRING is in a consistent state. Will return
	823	true if string is held as bytes, or is well-formed UTF-8 and has the
	824	UTF-8 flag on. Main reason for this routine is to allow Perl's
	825	testsuite to check that operations have left strings in a consistent
	826	state.
	827
	828	=item *
	829
	830	_utf8_on(STRING)
	831
	832	[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
	833	B<not> checked for being well-formed UTF-8. Do not use unless you
	834	B<know> that the STRING is well-formed UTF-8. Returns the previous
	835	state of the UTF-8 flag (so please don't test the return value as
	836	I<not> success or failure), or C<undef> if STRING is not a string.
	837
	838	=item *
	839
	840	_utf8_off(STRING)
	841
	842	[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
	843	Returns the previous state of the UTF-8 flag (so please don't test the
	844	return value as I<not> success or failure), or C<undef> if STRING is
	845	not a string.
	846
	847	=back
	848
	849	=head1 IMPLEMENTATION CLASSES
	850
	851	As mentioned above encodings are (in the current implementation at least)
	852	defined by objects. The mapping of encoding name to object is via the
	853	C<%encoding> hash.
	854
	855	The values of the hash can currently be either strings or objects.
	856	The string form may go away in the future. The string form occurs
	857	when C<encodings()> has scanned C<@INC> for loadable encodings but has
	858	not actually loaded the encoding in question. This is because the
	859	current "loading" process is all Perl and a bit slow.
	860
	861	Once an encoding is loaded then the value of the hash is the object which
	862	implements the encoding. The object should provide the following
	863	interface:
	864
	865	=over 4
	866
	867	=item -E<gt>name
	868
	869	Should return the string representing the canonical name of the encoding.
	870
	871	=item -E<gt>new_sequence
	872
	873	This is a placeholder for encodings with state. It should return an
	874	object which implements this interface, all current implementations
	875	return the original object.
	876
	877	=item -E<gt>encode($string,$check)
	878
	879	Should return the octet sequence representing I<$string>. If I<$check>
	880	is true it should modify I<$string> in place to remove the converted
	881	part (i.e. the whole string unless there is an error). If an error
	882	occurs it should return the octet sequence for the fragment of string
	883	that has been converted, and modify $string in-place to remove the
	884	converted part leaving it starting with the problem fragment.
	885
	886	If check is false then C<encode> should make a "best effort" to
	887	convert the string - for example by using a replacement character.
	888
	889	=item -E<gt>decode($octets,$check)
	890
	891	Should return the string that I<$octets> represents. If I<$check> is
	892	true it should modify I<$octets> in place to remove the converted part
	893	(i.e. the whole sequence unless there is an error). If an error
	894	occurs it should return the fragment of string that has been
	895	converted, and modify $octets in-place to remove the converted part
	896	leaving it starting with the problem fragment.
	897
	898	If check is false then C<decode> should make a "best effort" to
	899	convert the string - for example by using Unicode's "\x{FFFD}" as a
	900	replacement character.
	901
	902	=back
	903
	904	It should be noted that the check behaviour is different from the
	905	outer public API. The logic is that the "unchecked" case is useful
	906	when encoding is part of a stream which may be reporting errors
	907	(e.g. STDERR). In such cases it is desirable to get everything
	908	through somehow without causing additional errors which obscure the
	909	original one. Also the encoding is best placed to know what the
	910	correct replacement character is, so if that is the desired behaviour
	911	then letting low level code do it is the most efficient.
	912
	913	In contrast if check is true, the scheme above allows the encoding to
	914	do as much as it can and tell layer above how much that was. What is
	915	lacking at present is a mechanism to report what went wrong. The most
	916	likely interface will be an additional method call to the object, or
	917	perhaps (to avoid forcing per-stream objects on otherwise stateless
	918	encodings) an additional parameter.
	919
	920	It is also highly desirable that encoding classes inherit from
	921	C<Encode::Encoding> as a base class. This allows that class to define
	922	additional behaviour for all encoding objects. For example built in
	923	Unicode, UCS-2 and UTF-8 classes use :
	924
	925	package Encode::MyEncoding;
	926	use base qw(Encode::Encoding);
	927
	928	__PACKAGE__->Define(qw(myCanonical myAlias));
	929
	930	To create an object with bless {Name => ...},$class, and call
	931	define_encoding. They inherit their C<name> method from
	932	C<Encode::Encoding>.
	933
	934	=head2 Compiled Encodings
	935
	936	F<Encode.xs> provides a class C<Encode::XS> which provides the
	937	interface described above. It calls a generic octet-sequence to
	938	octet-sequence "engine" that is driven by tables (defined in
	939	F<encengine.c>). The same engine is used for both encode and
	940	decode. C<Encode::XS>'s C<encode> forces Perl's characters to their
	941	UTF-8 form and then treats them as just another multibyte
	942	encoding. C<Encode::XS>'s C<decode> transforms the sequence and then
	943	turns the UTF-8-ness flag on as that is the form that the tables are
	944	defined to produce. For details of the engine see the comments in
	945	F<encengine.c>.
	946
	947	The tables are produced by the Perl script F<compile> (the name needs
	948	to change so we can eventually install it somewhere). F<compile> can
	949	currently read two formats:
	950
	951	=over 4
	952
	953	=item *.enc
	954
	955	This is a coined format used by Tcl. It is documented in
	956	Encode/EncodeFormat.pod.
	957
	958	=item *.ucm
	959
	960	This is the semi-standard format used by IBM's ICU package.
	961
	962	=back
	963
	964	F<compile> can write the following forms:
	965
	966	=over 4
	967
	968	=item *.ucm
	969
	970	See above - the F<Encode/*.ucm> files provided with the distribution have
	971	been created from the original Tcl .enc files using this approach.
	972
	973	=item *.c
	974
	975	Produces tables as C data structures - this is used to build in encodings
	976	into F<Encode.so>/F<Encode.dll>.
	977
	978	=item *.xs
	979
	980	In theory this allows encodings to be stand-alone loadable Perl
	981	extensions. The process has not yet been tested. The plan is to use
	982	this approach for large East Asian encodings.
	983
	984	=back
	985
	986	The set of encodings built-in to F<Encode.so>/F<Encode.dll> is
	987	determined by F<Makefile.PL>. The current set is as follows:
	988
	989	=over 4
	990
	991	=item ascii and iso-8859-*
	992
	993	That is all the common 8-bit "western" encodings.
	994
	995	=item IBM-1047 and two other variants of EBCDIC.
	996
	997	These are the same variants that are supported by EBCDIC Perl as
	998	"native" encodings. They are included to prove "reversibility" of
	999	some constructs in EBCDIC Perl.
	1000
	1001	=item symbol and dingbats as used by Tk on X11.
	1002
	1003	(The reason Encode got started was to support Perl/Tk.)
	1004
	1005	=back
	1006
	1007	That set is rather ad hoc and has been driven by the needs of the
	1008	tests rather than the needs of typical applications. It is likely
	1009	to be rationalized.
	1010
	1011	=head1 SEE ALSO
	1012
	1013	L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>
	1014
	1015	=cut
	1016