[perl5.git] / cpan / Encode / encengine.c

/*
Data structures for encoding transformations.

Perl works internally in either a native 'byte' encoding or
in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
representation. When we do we can use utf8_to_uv().

Most character encodings are either simple byte mappings or
variable length multi-byte encodings. UTF-8 can be viewed as a
rather extreme case of the latter.

So to solve an important part of perl's encode needs we need to solve the
"multi-byte -> multi-byte" case. The simple byte forms are then just a
degenerate case. (Where one of the multi-byte forms will usually be UTF-8.)

The other type of encoding is a shift encoding where a prefix sequence
determines what subsequent bytes mean. Such encodings have state.

We also need to handle the case where a character in one encoding has to be
represented as multiple characters in the other, e.g. letter+diacritic.

The process can be considered as pseudo perl:

my $dst = '';
while (length($src))
 {
  my $size    = src_count($src);
  my $in_seq  = substr($src,0,$size,'');
  my $out_seq = $s2d_hash{$in_seq};
  if (defined $out_seq)
   {
    $dst .= $out_seq;
   }
  else
   {
    # an error condition
   }
 }
return $dst;

That has the following components:
 &src_count - a "rule" for how many bytes make up the next character in the
              source.
 %s2d_hash  - a mapping from input sequences to output sequences

The problem with that scheme is that it does not allow the output
character repertoire to affect the characters considered from the
input.

So we use a "trie" representation which can also be considered
a state machine:

my $dst   = '';
my $seq   = \@s2d_seq;
my $next  = \@s2d_next;
while (length($src))
 {
  my $byte    = substr($src,0,1,'');
  my $out_seq = $seq->[$byte];
  if (defined $out_seq)
   {
    $dst .= $out_seq;
   }
  else
   {
    # an error condition
   }
  ($next,$seq) = @$next->[$byte] if $next;
 }
return $dst;

There is now a pair of data structures to represent everything.
It is valid for the output sequence at a particular point to
be defined but zero length; that just means "don't know yet".
For the single byte case there is no 'next' so the new tables will be the same
as the original tables. For a multi-byte case a prefix byte will flip to the
tables for the next page (adding nothing to the output), then the tables for
the page will provide the actual output and set the tables back to the
original base page.

This scheme can also handle shift encodings.

A slight enhancement to the scheme also allows for look-ahead - if
we add a flag to re-add the removed byte to the source we could handle
  a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
  ab -> a (and take b back please)

*/

#define PERL_NO_GET_CONTEXT
#include <EXTERN.h>
#include <perl.h>
#include "encode.h"

/*
 * do_encode - drive the encoding state machine over a source buffer.
 *
 * The machine consumes one source byte per iteration.  For each byte it
 * selects the entry of the current page whose [min,max] range covers the
 * byte, appends that entry's output bytes (if any) to dst, and switches to
 * the entry's 'next' page.
 *
 *   enc    - page table to start from.
 *   src    - source bytes.
 *   slen   - in: number of bytes available at src.
 *            out: number of source bytes consumed, counted up to the end of
 *            the last completely processed source character.
 *   dst    - destination buffer; must not be NULL.
 *   dlen   - capacity of dst in bytes.
 *   dout   - out: number of bytes written to dst.
 *   approx - when non-zero, entries whose slen has the 0x80 bit set may be
 *            used; using one makes the result ENCODE_FALLBACK.  When zero
 *            such entries are treated as unrepresentable.
 *   term   - optional terminator sequence (may be NULL).
 *   tlen   - length of term in bytes.
 *
 * Returns 0 when the whole source was processed, otherwise:
 *   ENCODE_NOSPACE    - dst is NULL, or the next output does not fit in dst;
 *   ENCODE_PARTIAL    - the source ends in the middle of a character;
 *   ENCODE_NOREP      - the current byte has no usable table entry;
 *   ENCODE_FALLBACK   - input exhausted, but at least one 0x80-flagged
 *                       entry was used (only possible with approx set);
 *   ENCODE_FOUND_TERM - the output produced for the character just
 *                       completed is exactly term.
 *
 * NOTE: when dst is NULL the function returns immediately and leaves *slen
 * and *dout untouched.
 */
int
do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
      STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
{
    const U8 *s = src;
    const U8 *send = s + *slen;
    const U8 *last = s;         /* end of last fully consumed source char */
    U8 *d = dst;
    U8 *dend;
    U8 *dlast;                  /* start of output for the current char */
    int code = 0;

    /* Reject a missing buffer before doing any arithmetic on it. */
    if (!dst)
        return ENCODE_NOSPACE;

    dend = d + dlen;
    dlast = d;

    while (s < send) {
        const encpage_t *e = enc;
        const U8 byte = *s;
        STRLEN clen;
        STRLEN n;

        /* Find the entry covering this byte.  Presumably every page ends
         * with an entry whose max is high enough to stop this scan -
         * that guarantee comes from the table generator, not from here. */
        while (byte > e->max)
            e++;

        if (byte < e->min || !e->slen || (!approx && (e->slen & 0x80))) {
            /* Cannot represent */
            code = ENCODE_NOREP;
            break;
        }

        /* Low 7 bits of slen: distance from this byte to the end of the
         * source character.  Compare lengths rather than building a
         * pointer that might lie beyond the end of the source buffer. */
        clen = (STRLEN)(e->slen & 0x7f);
        if (clen > (STRLEN)(send - s)) {
            /* partial source character */
            code = ENCODE_PARTIAL;
            break;
        }

        n = e->dlen;
        if (n) {
            const U8 *out = e->seq + n * (byte - e->min);
            U8 *oend;

            if (n > (STRLEN)(dend - d)) {
                /* Out of space */
                code = ENCODE_NOSPACE;
                break;
            }
            oend = d + n;
            while (d < oend)
                *d++ = *out++;
        }

        enc = e->next;
        s++;

        /* clen == 1 means the byte just consumed ended a source character. */
        if (clen == 1) {
            if (approx && (e->slen & 0x80))
                code = ENCODE_FALLBACK;
            last = s;
            if (term && (STRLEN)(d - dlast) == tlen && memEQ(dlast, term, tlen)) {
                code = ENCODE_FOUND_TERM;
                break;
            }
            dlast = d;
        }
    }

    *slen = last - src;
    *dout = d - dst;
    return code;
}
Commit	Line	Data
017e2add NIS	1	/*
	2	Data structures for encoding transformations.
	3
	4	Perl works internally in either a native 'byte' encoding or
	5	in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
	6	representation. When we do we can use utf8_to_uv().
	7
	8	Most character encodings are either simple byte mappings or
	9	variable length multi-byte encodings. UTF-8 can be viewed as a
	10	rather extreme case of the latter.
	11
	12	So to solve an important part of perl's encode needs we need to solve the
	13	"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
	14	case. (Where one of multi-bytes will usually be UTF-8.)
	15
	16	The other type of encoding is a shift encoding where a prefix sequence
	17	determines what subsequent bytes mean. Such encodings have state.
	18
	19	We also need to handle case where a character in one encoding has to be
	20	represented as multiple characters in the other. e.g. letter+diacritic.
	21
	22	The process can be considered as pseudo perl:
	23
	24	my $dst = '';
	25	while (length($src))
	26	{
20797ee1	27	my $size = src_count($src);
017e2add NIS	28	my $in_seq = substr($src,0,$size,'');
	29	my $out_seq = $s2d_hash{$in_seq};
	30	if (defined $out_seq)
	31	{
	32	$dst .= $out_seq;
	33	}
	34	else
	35	{
	36	# an error condition
	37	}
	38	}
	39	return $dst;
	40
	41	That has the following components:
	42	&src_count - a "rule" for how many bytes make up the next character in the
	43	source.
	44	%s2d_hash - a mapping from input sequences to output sequences
	45
	46	The problem with that scheme is that it does not allow the output
	47	character repertoire to affect the characters considered from the
	48	input.
	49
	50	So we use a "trie" representation which can also be considered
	51	a state machine:
	52
	53	my $dst = '';
	54	my $seq = \@s2d_seq;
	55	my $next = \@s2d_next;
	56	while (length($src))
	57	{
	58	my $byte = $substr($src,0,1,'');
	59	my $out_seq = $seq->[$byte];
	60	if (defined $out_seq)
	61	{
	62	$dst .= $out_seq;
	63	}
	64	else
	65	{
	66	# an error condition
	67	}
	68	($next,$seq) = @$next->[$byte] if $next;
	69	}
	70	return $dst;
	71
	72	There is now a pair of data structures to represent everything.
	73	It is valid for output sequence at a particular point to
	74	be defined but zero length, that just means "don't know yet".
	75	For the single byte case there is no 'next' so new tables will be the same as
	76	the original tables. For a multi-byte case a prefix byte will flip to the tables
	77	for the next page (adding nothing to the output), then the tables for the page
	78	will provide the actual output and set tables back to original base page.
	79
	80	This scheme can also handle shift encodings.
	81
	82	A slight enhancement to the scheme also allows for look-ahead - if
	83	we add a flag to re-add the removed byte to the source we could handle
3cd3edd2	84	a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
017e2add NIS	85	ab -> a (and take b back please)
	86
	87	*/
	88
c9955564	89	#define PERL_NO_GET_CONTEXT
017e2add NIS	90	#include <EXTERN.h>
017e2add NIS	91	#include <perl.h>
017e2add NIS	92	#include "encode.h"
017e2add NIS	93
2f2b4ff2	94	int
0629a5b3	95	do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
d1256cb1	96	STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
017e2add	97	{
dd88d393 JH	98	const U8 *s = src;
	99	const U8 send = s + slen;
	100	const U8 *last = s;
	101	U8 *d = dst;
220e2d4e	102	U8 dend = d + dlen, dlast = d;
dd88d393	103	int code = 0;
20797ee1 DK	104	if (!dst)
20797ee1 DK	105	return ENCODE_NOSPACE;
dd88d393	106	while (s < send) {
c31ca201 SH	107	const encpage_t *e = enc;
	108	U8 byte = *s;
	109	while (byte > e->max)
	110	e++;
	111	if (byte >= e->min && e->slen && (approx \|\| !(e->slen & 0x80))) {
	112	const U8 *cend = s + (e->slen & 0x7f);
	113	if (cend <= send) {
	114	STRLEN n;
	115	if ((n = e->dlen)) {
	116	const U8 out = e->seq + n (byte - e->min);
	117	U8 *oend = d + n;
	118	if (dst) {
	119	if (oend <= dend) {
	120	while (d < oend)
	121	d++ = out++;
	122	}
	123	else {
	124	/* Out of space */
	125	code = ENCODE_NOSPACE;
	126	break;
	127	}
	128	}
	129	else
	130	d = oend;
	131	}
	132	enc = e->next;
	133	s++;
	134	if (s == cend) {
	135	if (approx && (e->slen & 0x80))
	136	code = ENCODE_FALLBACK;
	137	last = s;
	138	if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
	139	code = ENCODE_FOUND_TERM;
	140	break;
	141	}
	142	dlast = d;
	143	}
d1256cb1 RGS	144	}
d1256cb1 RGS	145	else {
c31ca201 SH	146	/* partial source character */
c31ca201 SH	147	code = ENCODE_PARTIAL;
d1256cb1 RGS	148	break;
d1256cb1 RGS	149	}
d1256cb1 RGS	150	}
d1256cb1 RGS	151	else {
c31ca201 SH	152	/* Cannot represent */
	153	code = ENCODE_NOREP;
	154	break;
d1256cb1 RGS	155	}
d1256cb1 RGS	156	}
dd88d393 JH	157	*slen = last - src;
	158	*dout = d - dst;
	159	return code;
017e2add	160	}