[perl5.git] / regen / ebcdic.pl

use v5.16.0;
use strict;
use warnings;

BEGIN { unshift @INC, '.' }

require './regen/regen_lib.pl';
require './regen/charset_translations.pl';

# Generates the EBCDIC translation tables that were formerly hard-coded into
# utfebcdic.h

my $out_fh = open_new('ebcdic_tables.h', '>',
        {style => '*', by => $0, });

sub output_table ($$;$) {
    my $table_ref = shift;
    my $name = shift;

    # Tables in hex easier to debug, but don't fit into 80 columns
    my $print_in_hex = shift // 1;

    die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;

    print $out_fh "EXTCONST U8 $name\[\] = {\n";

    my $column_numbers= "/*_0   _1   _2   _3   _4   _5   _6   _7   _8   _9   _A   _B   _C   _D   _E  _F*/\n";
    print $out_fh $column_numbers if $print_in_hex;
    for my $i (0 .. 255) {
        if ($print_in_hex) {
            # No row headings, so will fit in 80 cols.
            #printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
            printf $out_fh "0x%02X", $table_ref->[$i];
        }
        else {
            printf $out_fh "%4d", $table_ref->[$i];
        }
        print $out_fh ",", if $i < 255;
        #print $out_fh ($i < 255) ? "," : " ";
        #printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
        print $out_fh "\n" if $i % 16 == 15;
    }
    print $out_fh $column_numbers if $print_in_hex;
    print $out_fh "};\n\n";
}

print $out_fh <<END;

#ifndef H_EBCDIC_TABLES   /* Guard against nested #includes */
#define H_EBCDIC_TABLES   1

/* This file contains definitions for various tables used in EBCDIC handling.
 * More info is in utfebcdic.h */
END

my @charsets = get_supported_code_pages();
shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
foreach my $charset (@charsets) {
    # we process the whole array several times, make a copy
    my @a2e = @{get_a2n($charset)};

    print $out_fh "\n" . get_conditional_compile_line_start($charset);
    print $out_fh "\n";

    print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
    output_table(\@a2e, "PL_a2e");

    { # Construct the inverse
        my @e2a;
        for my $i (0 .. 255) {
            $e2a[$a2e[$i]] = $i;
        }
        print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
        output_table(\@e2a, "PL_e2a");
    }

    my @i82utf = @{get_I8_2_utf($charset)};
    print $out_fh <<END;
/* (Confusingly named) Index is $charset I8 byte; value is
 * $charset UTF-EBCDIC equivalent */
END
    output_table(\@i82utf, "PL_utf2e");

    { #Construct the inverse
        my @utf2i8;
        for my $i (0 .. 255) {
            $utf2i8[$i82utf[$i]] = $i;
        }
        print $out_fh <<END;
/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 * $charset I8 equivalent */
END
        output_table(\@utf2i8, "PL_e2utf");
    }

    {
        my @utf8skip;

        # These are invariants or continuation bytes.
        for my $i (0 .. 0xBF) {
            $utf8skip[$i82utf[$i]] = 1;
        }

        # These are start bytes;  The skip is the number of consecutive highest
        # order 1-bits (up to 7)
        for my $i (0xC0 .. 255) {
            my $count;
            if ($i == 0b11111111) {
                no warnings 'once';
                $count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
            }
            elsif (($i & 0b11111110) == 0b11111110) {
                $count= 7;
            }
            elsif (($i & 0b11111100) == 0b11111100) {
                $count= 6;
            }
            elsif (($i & 0b11111000) == 0b11111000) {
                $count= 5;
            }
            elsif (($i & 0b11110000) == 0b11110000) {
                $count= 4;
            }
            elsif (($i & 0b11100000) == 0b11100000) {
                $count= 3;
            }
            elsif (($i & 0b11000000) == 0b11000000) {
                $count= 2;
            }
            else {
                die "Something wrong for UTF8SKIP calculation for $i";
            }
            $utf8skip[$i82utf[$i]] = $count;
        }

        print $out_fh <<END;
/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
 * 1 for continuation.  Adapted from the shadow flags table in tr16.  The
 * entries marked 9 in tr16 are continuation bytes and are marked as length 1
 * here so that we can recover. */
END
        output_table(\@utf8skip, "PL_utf8skip", 0);  # The 0 means don't print
                                                     # in hex
    }

    use feature 'unicode_strings';

    {
        my @lc;
        for my $i (0 .. 255) {
            $lc[$a2e[$i]] = $a2e[ord lc chr $i];
        }
        print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
        output_table(\@lc, "PL_latin1_lc");
    }

    {
        my @uc;
        for my $i (0 .. 255) {
            my $uc = uc chr $i;
            if (length $uc > 1 || ord $uc > 255) {
                $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
            }
            $uc[$a2e[$i]] = $a2e[ord $uc];
        }
        print $out_fh <<END;
/* Index is $charset code point; value is its uppercase equivalent.
 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
END
        output_table(\@uc, "PL_mod_latin1_uc");
    }

    { # PL_fold
        my @ascii_fold;
        for my $i (0 .. 255) {  # Initialise to identity map
            $ascii_fold[$i] = $i;
        }

        # Overwrite the entries that aren't identity
        for my $chr ('A' .. 'Z') {
            $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
        }
        for my $chr ('a' .. 'z') {
            $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
        }
        print $out_fh <<END;
/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 * is A-Z; all other code points map to themselves */
END
        output_table(\@ascii_fold, "PL_fold");
    }

    {
        my @latin1_fold;
        for my $i (0 .. 255) {
            my $char = chr $i;
            my $lc = lc $char;

            # lc and uc adequately proxy for fold-case pairs in this 0-255
            # range
            my $uc = uc $char;
            $uc = $char if length $uc > 1 || ord $uc > 255;
            if ($lc ne $char) {
                $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
            }
            elsif ($uc ne $char) {
                $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
            }
            else {
                $latin1_fold[$a2e[$i]] = $a2e[$i];
            }
        }
        print $out_fh <<END;
/* Index is $charset code point; value is its other fold-pair equivalent
 * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 * the code point itself */
END
        output_table(\@latin1_fold, "PL_fold_latin1");
    }

    print $out_fh get_conditional_compile_line_end();
}

print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";

read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
4bc3dcfa KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
3d7c117d MB	4
	5	BEGIN { unshift @INC, '.' }
	6
	7	require './regen/regen_lib.pl';
	8	require './regen/charset_translations.pl';
4bc3dcfa KW	9
	10	# Generates the EBCDIC translation tables that were formerly hard-coded into
	11	# utfebcdic.h
	12
	13	my $out_fh = open_new('ebcdic_tables.h', '>',
	14	{style => '*', by => $0, });
	15
702cfe48	16	sub output_table ($$;$) {
4bc3dcfa KW	17	my $table_ref = shift;
	18	my $name = shift;
	19
43bf0d50	20	# Tables in hex easier to debug, but don't fit into 80 columns
702cfe48	21	my $print_in_hex = shift // 1;
43bf0d50	22
4bc3dcfa KW	23	die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
	24
	25	print $out_fh "EXTCONST U8 $name\[\] = {\n";
	26
702cfe48 KW	27	my $column_numbers= "/_0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F/\n";
702cfe48 KW	28	print $out_fh $column_numbers if $print_in_hex;
4bc3dcfa	29	for my $i (0 .. 255) {
43bf0d50	30	if ($print_in_hex) {
702cfe48 KW	31	# No row headings, so will fit in 80 cols.
	32	#printf $out_fh "/* %X_ */ ", $i / 16 if $i % 16 == 0;
	33	printf $out_fh "0x%02X", $table_ref->[$i];
43bf0d50 KW	34	}
	35	else {
	36	printf $out_fh "%4d", $table_ref->[$i];
	37	}
4bc3dcfa	38	print $out_fh ",", if $i < 255;
702cfe48 KW	39	#print $out_fh ($i < 255) ? "," : " ";
702cfe48 KW	40	#printf $out_fh " /* %X_ */", $i / 16 if $print_in_hex && $i % 16 == 15;
4bc3dcfa KW	41	print $out_fh "\n" if $i % 16 == 15;
4bc3dcfa KW	42	}
702cfe48	43	print $out_fh $column_numbers if $print_in_hex;
4bc3dcfa KW	44	print $out_fh "};\n\n";
	45	}
	46
	47	print $out_fh <<END;
	48
	49	#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
	50	#define H_EBCDIC_TABLES 1
	51
	52	/* This file contains definitions for various tables used in EBCDIC handling.
	53	* More info is in utfebcdic.h */
	54	END
	55
	56	my @charsets = get_supported_code_pages();
	57	shift @charsets; # ASCII is the 0th, and we don't deal with that here.
	58	foreach my $charset (@charsets) {
c30a0cf2 TC	59	# we process the whole array several times, make a copy
c30a0cf2 TC	60	my @a2e = @{get_a2n($charset)};
4bc3dcfa KW	61
	62	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	63	print $out_fh "\n";
	64
	65	print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
	66	output_table(\@a2e, "PL_a2e");
	67
	68	{ # Construct the inverse
	69	my @e2a;
	70	for my $i (0 .. 255) {
	71	$e2a[$a2e[$i]] = $i;
	72	}
	73	print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
	74	output_table(\@e2a, "PL_e2a");
	75	}
	76
e0dcdb0a	77	my @i82utf = @{get_I8_2_utf($charset)};
4bc3dcfa KW	78	print $out_fh <<END;
	79	/* (Confusingly named) Index is $charset I8 byte; value is
	80	* $charset UTF-EBCDIC equivalent */
	81	END
	82	output_table(\@i82utf, "PL_utf2e");
	83
	84	{ #Construct the inverse
	85	my @utf2i8;
	86	for my $i (0 .. 255) {
	87	$utf2i8[$i82utf[$i]] = $i;
	88	}
	89	print $out_fh <<END;
	90	/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
	91	* $charset I8 equivalent */
	92	END
	93	output_table(\@utf2i8, "PL_e2utf");
	94	}
	95
	96	{
	97	my @utf8skip;
	98
	99	# These are invariants or continuation bytes.
	100	for my $i (0 .. 0xBF) {
	101	$utf8skip[$i82utf[$i]] = 1;
	102	}
	103
	104	# These are start bytes; The skip is the number of consecutive highest
	105	# order 1-bits (up to 7)
	106	for my $i (0xC0 .. 255) {
	107	my $count;
c0236afe KW	108	if ($i == 0b11111111) {
	109	no warnings 'once';
	110	$count = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES;
	111	}
	112	elsif (($i & 0b11111110) == 0b11111110) {
4bc3dcfa KW	113	$count= 7;
	114	}
	115	elsif (($i & 0b11111100) == 0b11111100) {
	116	$count= 6;
	117	}
	118	elsif (($i & 0b11111000) == 0b11111000) {
	119	$count= 5;
	120	}
	121	elsif (($i & 0b11110000) == 0b11110000) {
	122	$count= 4;
	123	}
	124	elsif (($i & 0b11100000) == 0b11100000) {
	125	$count= 3;
	126	}
	127	elsif (($i & 0b11000000) == 0b11000000) {
	128	$count= 2;
	129	}
	130	else {
	131	die "Something wrong for UTF8SKIP calculation for $i";
	132	}
	133	$utf8skip[$i82utf[$i]] = $count;
	134	}
	135
	136	print $out_fh <<END;
	137	/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
	138	* 1 for continuation. Adapted from the shadow flags table in tr16. The
	139	* entries marked 9 in tr16 are continuation bytes and are marked as length 1
	140	* here so that we can recover. */
	141	END
702cfe48 KW	142	output_table(\@utf8skip, "PL_utf8skip", 0); # The 0 means don't print
702cfe48 KW	143	# in hex
4bc3dcfa KW	144	}
	145
	146	use feature 'unicode_strings';
	147
	148	{
	149	my @lc;
	150	for my $i (0 .. 255) {
	151	$lc[$a2e[$i]] = $a2e[ord lc chr $i];
	152	}
	153	print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
	154	output_table(\@lc, "PL_latin1_lc");
	155	}
	156
	157	{
	158	my @uc;
	159	for my $i (0 .. 255) {
	160	my $uc = uc chr $i;
	161	if (length $uc > 1 \|\| ord $uc > 255) {
	162	$uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
	163	}
	164	$uc[$a2e[$i]] = $a2e[ord $uc];
	165	}
	166	print $out_fh <<END;
	167	/* Index is $charset code point; value is its uppercase equivalent.
	168	* The 'mod' in the name means that codepoints whose uppercase is above 255 or
	169	* longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
	170	END
	171	output_table(\@uc, "PL_mod_latin1_uc");
	172	}
	173
	174	{ # PL_fold
	175	my @ascii_fold;
	176	for my $i (0 .. 255) { # Initialise to identity map
	177	$ascii_fold[$i] = $i;
	178	}
	179
	180	# Overwrite the entries that aren't identity
	181	for my $chr ('A' .. 'Z') {
	182	$ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
	183	}
	184	for my $chr ('a' .. 'z') {
	185	$ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
	186	}
	187	print $out_fh <<END;
	188	/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
	189	* is A-Z; all other code points map to themselves */
	190	END
	191	output_table(\@ascii_fold, "PL_fold");
	192	}
	193
	194	{
	195	my @latin1_fold;
	196	for my $i (0 .. 255) {
	197	my $char = chr $i;
	198	my $lc = lc $char;
	199
	200	# lc and uc adequately proxy for fold-case pairs in this 0-255
	201	# range
	202	my $uc = uc $char;
	203	$uc = $char if length $uc > 1 \|\| ord $uc > 255;
	204	if ($lc ne $char) {
	205	$latin1_fold[$a2e[$i]] = $a2e[ord $lc];
	206	}
	207	elsif ($uc ne $char) {
208	$latin1_fold[$a2e[$i]] = $a2e[ord $uc];
209	}
210	else {
211	$latin1_fold[$a2e[$i]] = $a2e[$i];
212	}
213	}
214	print $out_fh <<END;
215	/* Index is $charset code point; value is its other fold-pair equivalent
216	* (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
217	* the code point itself */
218	END
219	output_table(\@latin1_fold, "PL_fold_latin1");
220	}
221
222	print $out_fh get_conditional_compile_line_end();
223	}
224
225	print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
226
227	read_only_bottom_close_and_rename($out_fh);