[perl5.git] / regen / charset_translations.pl

#!/usr/bin/perl -w
use strict;
use warnings;

# Utilities for various character set issues.  Currently handles ASCII and
# EBCDIC only.  It is trivial to add support for a new EBCDIC code page: just
# add a mapping table to %ebcdic_translations and regen everything that uses
# this.  (That holds unless the new page has the same variant character
# signature as an existing one, or some other glitch arises.)

# Translation tables from ASCII/Latin1 to each supported EBCDIC code page.
# Each table is indexed by the ASCII/Latin1 ordinal (0 .. 255) and yields the
# ordinal of the same character in the named code page.  The tables are laid
# out as 16 rows of 16 entries, so row N (counting from 0) holds the mappings
# for ordinals N*16 .. N*16+15.  get_supported_code_pages() additionally
# stores an identity table for ASCII itself under the key held in $ascii_key.
my %ebcdic_translations = (
    # Keys are code page name; values are arrays that map ASCII ordinals to
    # the code page's ordinals

    'EBCDIC 1047' =>
      [ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
        0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
        0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC,
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59,
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
      ],

    'EBCDIC POSIX-BC' =>
      [
        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D,
        0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07,
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F,
        0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1,
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59,
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
      ],

    'EBCDIC 037' =>
      [
        0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
        0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
        0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
        0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
        0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D,
        0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
        0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
        0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
        0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
        0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC,
        0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
        0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
        0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59,
        0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
        0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
      ],
);

# Name used for the non-EBCDIC case.  get_supported_code_pages() stores an
# identity table under this key in %ebcdic_translations, and the functions
# below compare against it to recognize a request for ASCII.
my $ascii_key = 'ASCII/Latin1';

# Cache used by get_I8_2_utf(): keyed by code page name; each value is a
# reference to the array generated for that code page on first request.
my %I8_TO_NATIVE_UTF8;  # Maps I8 UTF to final UTF-EBCDIC
                        # See http://www.unicode.org/reports/tr16/

sub get_supported_code_pages() {
    # Returns an ordered list of the names of the currently supported code
    # pages: ASCII is the 0th element, the 1047 code page is the 1st, and the
    # others follow, sorted lexically by code page name.
    #
    # Side effect: if %ebcdic_translations has no entry for ASCII yet, an
    # identity table (ordinal N maps to N, for 0 .. 255) is added under
    # $ascii_key, so that get_a2n() and friends also work for ASCII.

    # Create an ASCII table.
    $ebcdic_translations{$ascii_key} = [ 0 .. 255 ]
                            unless exists $ebcdic_translations{$ascii_key};

    # Each name gets a rank, and names of equal rank are ordered lexically.
    # Comparing ranks (rather than returning -1 whenever the left operand
    # looks special) keeps the comparison consistent no matter which way
    # round sort() presents a pair, even should more than one name ever
    # match /1047/.
    my $rank = sub {
        my ($name) = @_;
        return 0 if $name eq $ascii_key;
        return 1 if $name =~ /1047/;
        return 2;
    };

    return sort { $rank->($a) <=> $rank->($b) || $a cmp $b }
           keys %ebcdic_translations;
}

sub get_a2n($) {
    # Given the name of a code page, returns a reference to the array that
    # maps each ASCII ordinal to that code page's ordinal.  This is the array
    # stored in %ebcdic_translations itself, not a copy.  Dies if the name is
    # not a key of %ebcdic_translations.

    my ($code_page) = @_;

    die "Unknown character set '$code_page'"
                        unless exists $ebcdic_translations{$code_page};

    return $ebcdic_translations{$code_page};
}

sub get_I8_2_utf($) {
    # Returns the mapping array for I8 to code page UTF-EBCDIC for the code
    # page named by the input parameter.  This is Table 2 of TR16 customized
    # for the code page.  See utfebcdic.h for why, contrary to TR16, it has to
    # be code-page-specific.
    #
    # The return value is a reference to a 256-element array indexed by I8
    # byte value.  It is generated on the first request for a given code page
    # and cached in %I8_TO_NATIVE_UTF8; later calls return the same reference.
    #
    # Dies if the code page is ASCII or unknown, or if the code page's
    # translation table is missing an entry for, or does not map to distinct
    # values in 0 .. 255, the ordinals 0 .. 0x9F.

    my $charset = shift;

    die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
    die "'$charset' unknown" unless exists $ebcdic_translations{$charset};

    # Nothing to do if the table has already been generated
    return $I8_TO_NATIVE_UTF8{$charset}
                                if exists $I8_TO_NATIVE_UTF8{$charset};

    # Only needed by the commented-out debugging printf's below
    use charnames ();

    my $a2n = $ebcdic_translations{$charset};

    # The table is built here and only cached once it is complete, so that a
    # failure part way through cannot leave a truncated table behind to be
    # handed out by a later call.
    my @i8_to_native;

    # The code points not used for invariants.  Initialized to everything,
    # then entries are removed as we go along.
    my %unused_cps;
    @unused_cps{0 .. 255} = (1) x 256;

    # These are the invariants.  The output has them mapped to the
    # original EBCDIC code point.
    for my $i (0 .. 0x9F) {
        my $ebcdic_value = $a2n->[$i];
        die "No mapping for code point $i in '$charset'"
                                            unless defined $ebcdic_value;
        #printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
        $i8_to_native[$i] = $ebcdic_value;

        # delete() returns undef when the value has already been claimed by
        # an earlier $i (or is not in 0 .. 255 in the first place)
        if (! defined delete $unused_cps{$ebcdic_value}) {
            die "Two code points map to $ebcdic_value; one is $i";
        }
    }

    # Put the unused code points in order
    my @unused_cps = sort { $a <=> $b } keys %unused_cps;

    # Fill in the rest of the map with these ordered code points, as TR16
    # specifies
    for my $i (0xA0 .. 255) {
        $i8_to_native[$i] = shift @unused_cps;
        #printf "$charset: filling in %02x which is %02x ascii, %s\n", $i8_to_native[$i], $i, charnames::viacode($i);
    }

    if (@unused_cps) {
        die "Left-over code points";
    }

    $I8_TO_NATIVE_UTF8{$charset} = \@i8_to_native;

    return $I8_TO_NATIVE_UTF8{$charset};
}

{ # Closure

    # State shared by the two functions below, used to check that every #if
    # handed out is matched by an #endif before the next #if is requested.
    # Both are undefined when no #if is open.
    my $charset;    # Code page name of the currently open #if
    my $indent;     # The blanks that go between the '#' and the directive

    sub get_conditional_compile_line_start($;$) {
        # Returns the '#if' line to put into C code to compile for the code
        # page given by the first parameter.  The second parameter, if
        # present, is the indentation level, like '#   if ...'
        #
        # For an EBCDIC code page the result spans several lines joined by
        # backslash continuations, and looks like
        #   #if 'A' == 193 /* EBCDIC 1047 */ \
        #        && '\\' == 224 && '[' == 173 ...
        # For ASCII it is the single test on 'A'.
        #
        # Dies if the previous #if has not been closed by a call to
        # get_conditional_compile_line_end(), if not running on an ASCII
        # platform, or if the code page is unknown.

        if (defined $charset || defined $indent) {
            die "Missing call to get_conditional_compile_line_end()"
        }

        my $new_charset = shift;
        my $indent_level = shift // 0;

        die "This is designed to run only on an ASCII platform" unless ord "A" == 65;

        die "Unknown character set '$new_charset'" unless exists $ebcdic_translations{$new_charset};

        # Only now that the arguments are known to be good is the #if
        # recorded as open.  Otherwise a caller that traps the failure above
        # would find every later call dying with "Missing call to ...".
        $charset = $new_charset;

        # Level N gives 4*N-1 blanks, so that with the '#' the directive
        # begins in column 4*N
        $indent = ($indent_level == 0)
                  ? ""
                  : " " x (($indent_level * 4) - 1);

        my $return = "";
        {
            no warnings 'qw';   # The list below has '#' in it
            my $count = -1;

            # We use all the typical variant characters to construct the #if,
            # so that it is unlikely that a different code page will match
            # this #if.  (The second element is two backslashes, so that it
            # comes out as the C character constant '\\'; ord() looks only at
            # the first of them.)
            for my $char (qw/A \\\ [ ] { } ^ ~ ! # | $ @ `/) {
                my $first_time = $return eq "";
                my $compare = $ebcdic_translations{$charset}[ord $char];

                $return .=  " && " unless $first_time;
                $return .= "'$char' == $compare";
                $return .= " /* $charset */" if $first_time;

                # The test on 'A' alone is enough to identify ASCII
                last if $charset eq $ascii_key;

                # Continue on a new line after the first test, and then after
                # every fifth one
                $count++;
                $return .= " \\\n    " if $first_time || $count % 5 == 0;
            }
        }

        return "#${indent}if $return\n";
    }

    sub get_conditional_compile_line_end () {
        # Returns the #endif for the currently open #if, indented the same
        # and with the code page name in a trailing comment; then marks the
        # #if as closed.  Dies if there is no open #if.

        if (! defined $charset || ! defined $indent) {
            die "Missing call to get_conditional_compile_line_start()";
        }

        my $return = "#${indent}endif\t/* $charset */\n";
        undef $charset;
        undef $indent;
        return $return;
    }
}

sub _UTF_START_MASK($) {
    # Internal.  Given the number of bytes in a multi-byte sequence, returns
    # the mask that cp_2_utfbytes() applies to the code point bits that go
    # into the sequence's start byte: 0x01 for lengths of 6 or more,
    # otherwise 0x1F shifted right by (length - 2).
    my ($byte_count) = @_;

    return 0x01 if $byte_count >= 6;
    return 0x1F >> ($byte_count - 2);
}

sub _UTF_START_MARK($) {
    # Internal.  Given the number of bytes in a multi-byte sequence, returns
    # the fixed bits that cp_2_utfbytes() ORs into the sequence's start byte:
    # 0xFE shifted left by (7 - length), truncated to a single byte.
    my ($byte_count) = @_;

    my $shifted = 0xFE << (7 - $byte_count);
    return $shifted & 0xFF;
}

sub cp_2_utfbytes($$) {
    # Takes a Unicode code point and the name of a code page, in that order,
    # and returns the string of bytes that represents the code point in that
    # code page's UTF-EBCDIC, constructed using the UTF-MOD algorithm
    # published in TR16.  (When the "code page" is ASCII, the result is
    # plain UTF-8 instead.)  Dies if the code page is not known.

    my ($ucp, $charset) = @_;

    # For ASCII, let Perl itself do the encoding
    if ($charset eq $ascii_key) {
        my $utf8 = chr $ucp;
        utf8::upgrade($utf8);
        utf8::encode($utf8);
        return $utf8;
    }

    die "Unknown character set '$charset'"
                            unless exists $ebcdic_translations{$charset};

    # Code points below 0xA0 are a single byte, taken straight from the
    # code page's translation table
    return chr $ebcdic_translations{$charset}[$ucp] if $ucp < 0xA0;

    my $i8_to_native = get_I8_2_utf($charset);

    # Anything else takes at least 2 bytes, and one more for each of these
    # thresholds that the code point reaches, to a maximum of 7
    my $byte_count = 2;
    for my $limit (0x400, 0x4000, 0x40000, 0x400000, 0x4000000) {
        last if $ucp < $limit;
        $byte_count++;
    }

    # Work from the low-order end of the code point: each byte after the
    # first carries 5 bits of it, marked with 0xA0, and is then translated
    # from I8 to the code page
    my $remaining = $ucp;
    my $trailing = "";
    for (2 .. $byte_count) {
        $trailing = chr($i8_to_native->[($remaining & 0x1f) | 0xA0])
                  . $trailing;
        $remaining >>= 5;
    }

    # What is left goes in the start byte, combined with the marker bits for
    # a sequence of this length
    my $start_i8 =   ($remaining & _UTF_START_MASK($byte_count))
                   | _UTF_START_MARK($byte_count);

    return chr($i8_to_native->[$start_i8]) . $trailing;
}

1;
Commit	Line	Data
6ff677df KW	1	#!/usr/bin/perl -w
	2	use strict;
	3	use warnings;
	4
	5	# Utilities for various character set issues. Currently handles ASCII and
	6	# EBCDIC only. It is trivial to add support for new EBCDIC code pages (unless
	7	# they have identical variant character signatures as existing ones, and there
	8	# aren't other glitches that arise): just add a mapping table to
	9	# %ebcdic_translations and regen everything that uses this.
	10
	11	my %ebcdic_translations = (
	12	# Keys are code page name; values are arrays that map ASCII ordinals to
	13	# the code page's ordinals
	14
	15	'EBCDIC 1047' =>
	16	[ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	17	0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
	18	0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
	19	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
	20	0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
	21	0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
	22	0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
	23	0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
	24	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
	25	0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
	26	0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC,
	27	0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
	28	0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
	29	0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59,
	30	0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
	31	0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
	32	],
	33
	34	'EBCDIC POSIX-BC' =>
	35	[
	36	0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	37	0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
	38	0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
	39	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
	40	0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
	41	0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D,
	42	0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
	43	0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07,
	44	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
	45	0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F,
	46	0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1,
	47	0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
	48	0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
	49	0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59,
	50	0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
	51	0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
	52	],
	53
	54	'EBCDIC 037' =>
	55	[
	56	0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
	57	0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
	58	0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
	59	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
	60	0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
	61	0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D,
	62	0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
	63	0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
	64	0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B,
65	0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF,
66	0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC,
67	0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB,
68	0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
69	0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59,
70	0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
71	0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
72	],
73	);
74
75	my $ascii_key = 'ASCII/Latin1';
76
77	my %I8_TO_NATIVE_UTF8; # Maps I8 UTF to final UTF-EBCDIC
78	# See http://www.unicode.org/reports/tr16/
79
80	sub get_supported_code_pages() {
81	# Returns an ordered array of the currently supported code pages,
82	# including ASCII as the 0th element, 1047 as the 1th, and the others
83	# sorted lexically by code page name.
84
85	# Create an ASCII table.
86	unless (exists $ebcdic_translations{$ascii_key}) {
87	for my $i (0 .. 255) {
88	$ebcdic_translations{$ascii_key}->[$i] = $i;
89	}
90	}
91
92	return sort {
93	$a eq $ascii_key
94	? -1
95	: $b eq $ascii_key
96	? 1
97	: $a =~ /1047/
98	? -1
99	: $b =~ /1047/
100	? 1
101	: $a cmp $b
102	} keys %ebcdic_translations;
103	}
104
105	sub get_a2n($) {
106	# Returns the mapping array for ASCII to code page for the code page named
107	# by the input parameter.
108
109	my $charset = shift;
110
111	if (! exists $ebcdic_translations{$charset}) {
112	die "Unknown character set '$charset'";
113	}
114
c30a0cf2	115	return $ebcdic_translations{$charset};
6ff677df KW	116	}
	117
	118	sub get_I8_2_utf($) {
	119	# Returns the mapping array for I8 to code page UTF-EBCDIC for the code
	120	# page named by the input parameter. This is Table 2 of TR16 customized
	121	# for the code page. See utfebcdic.h for why, contrary to TR16, it has to
	122	# be code-page-specific.
	123
	124	my $charset = shift;
	125
	126	die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
	127	die "'$charset' unknown" unless exists $ebcdic_translations{$charset};
	128
	129	# Generate the table if not already present
	130	if (! exists $I8_TO_NATIVE_UTF8{$charset}) {
	131
	132	# The code points not used for invariants. Initialized to everything,
	133	# then entries are removed as we go along.
	134	my %unused_cps;
	135	for my $i (0 .. 255) {
	136	$unused_cps{$i} = 1;
	137	}
	138
	139	# These are the invariants. The output has them mapped to the
	140	# original EBCDIC code point.
	141	for my $i (0 .. 0x9F) {
	142	use charnames ();
	143	my $ebcdic_value = $ebcdic_translations{$charset}[$i];
	144	#printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
	145	$I8_TO_NATIVE_UTF8{$charset}[$i] = $ebcdic_value;
	146	if (! defined delete $unused_cps{$ebcdic_value}) {
	147	die "Two code points map to $ebcdic_value; one is $i";
	148	}
	149	}
	150
	151	# Put the unused code points in order
	152	my @unused_cps = sort { $a <=> $b } keys %unused_cps;
	153
	154	# Fill in the rest of the map with these ordered code points, as TR16
	155	# specifies
	156	for my $i (0xA0 .. 255) {
	157	$I8_TO_NATIVE_UTF8{$charset}[$i] = shift @unused_cps;
	158	#printf "$charset: filling in %02x which is %02x ascii, %s\n", $I8_TO_NATIVE_UTF8{$charset}[$i], $i, charnames::viacode($i);
	159	}
	160
	161	if (@unused_cps) {
	162	die "Left-over code points";
	163	}
	164	}
	165
e0dcdb0a	166	return $I8_TO_NATIVE_UTF8{$charset};
6ff677df KW	167	}
	168
	169	{ # Closure
	170
	171	my $charset; # We use these to do some error checking that the #if and
	172	# #endif are matched.
	173	my $indent;
	174
	175	sub get_conditional_compile_line_start($;$) {
	176	# Returns the '#if' line to put into C code to compile for the code
	177	# page given by the first parameter. The second parameter, if
	178	# present, is the indentation level, like '# if ...'
	179
	180	if (defined $charset \|\| defined $indent) {
	181	die "Missing call to get_conditional_compile_line_end()"
	182	}
	183
	184	$charset = shift;
	185	my $indent_level = shift // 0;
	186
	187	die "This is designed to run only on an ASCII platform" unless ord "A" == 65;
	188
	189	if ($indent_level == 0) {
	190	$indent = "";
	191	}
	192	else {
	193	$indent = " " x (($indent_level * 4) - 1);
	194	}
	195
	196	die "Unknown character set '$charset'" unless exists $ebcdic_translations{$charset};
	197
	198	my $return = "";
	199	{
	200	no warnings 'qw';
	201	my $count = -1;
	202
	203	# We use all the typical variant characters to construct the #if,
	204	# so that it is unlikely that a different code page will match
	205	# this #if
	206	for my $char (qw/A \\\ [ ] { } ^ ~ ! # \| $ @ `/) {
	207	my $compare;
	208	my $ascii_ord = ord $char;
	209	my $first_time = $return eq "";
	210
	211	$compare = $ebcdic_translations{$charset}[$ascii_ord];
	212	$return .= " && " unless $first_time;
	213	$return .= "'$char' == $compare";
	214	$return .= " /* $charset */" if $first_time;
	215	last if $charset eq $ascii_key;
	216	$count++;
	217	$return .= " \\\n " if $first_time \|\| $count % 5 == 0;
	218	}
	219	}
	220
	221	return "#${indent}if $return\n";
	222	}
	223
	224	sub get_conditional_compile_line_end () {
	225	# Returns the #endif for the currently open #if
	226
	227	my $return = "#${indent}endif\t/* $charset */\n";
	228	undef $charset;
	229	undef $indent;
	230	return $return;
231	}
232	}
233
234	sub _UTF_START_MASK($) {
235	# Internal
236	my $len = shift;
237	return ((($len) >= 6) ? 0x01 : (0x1F >> (($len)-2)));
238	}
239
240	sub _UTF_START_MARK($) {
241	# Internal
242	return (0xFF & (0xFE << (7-(shift))));
243	}
244
245	sub cp_2_utfbytes($$) {
246	# Returns a string consisting of the UTF-EBCDIC for the code page given by
7799b1c9 KW	247	# the 2nd parameter, of the Unicode code point given by the first
	248	# parameter, using the UTF-MOD algorithm published in TR16. (If the "code
	249	# page" is ASCII, straight UTF-8 is returned.)
6ff677df KW	250
	251	my ($ucp, $charset) = @_;
	252
	253	if ($charset eq $ascii_key) {
	254	my $str = chr $ucp;
	255	utf8::upgrade($str);
	256	utf8::encode($str);
	257	return $str;
	258	}
	259	elsif (exists $ebcdic_translations{$charset}) {
	260
	261	if ($ucp < 0xA0) {
	262	return chr $ebcdic_translations{$charset}[$ucp];
	263	}
	264
e0dcdb0a	265	my $I8_2_utf = get_I8_2_utf($charset);
6ff677df KW	266
	267	my $len = $ucp < 0xA0 ? 1 :
	268	$ucp < 0x400 ? 2 :
	269	$ucp < 0x4000 ? 3 :
	270	$ucp < 0x40000 ? 4 :
	271	$ucp < 0x400000 ? 5 :
	272	$ucp < 0x4000000 ? 6 : 7;
	273
	274	my @str;
	275	for (1 .. $len - 1) {
e0dcdb0a	276	unshift @str, chr $I8_2_utf->[($ucp & 0x1f) \| 0xA0];
6ff677df KW	277	$ucp >>= 5;
	278	}
	279
e0dcdb0a	280	unshift @str, chr $I8_2_utf->[($ucp & _UTF_START_MASK($len)) \| _UTF_START_MARK($len)];
6ff677df KW	281
	282	return join "", @str;
	283	}
	284	else {
	285	die "Unknown character set '$charset'";
	286	}
	287	}
	288
	289	1;