[perl5.git] / regen / ebcdic.pl

use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';

# Generates the EBCDIC translation tables that were formerly hard-coded into
# utfebcdic.h

my $out_fh = open_new('ebcdic_tables.h', '>',
        {style => '*', by => $0, });

sub output_table ($$) {
    my $table_ref = shift;
    my $name = shift;

    die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;

    print $out_fh "EXTCONST U8 $name\[\] = {\n";

    for my $i (0 .. 255) {
        printf $out_fh "%4d", $table_ref->[$i];
        #printf $out_fh " 0x%02X", $table_ref->[$i];
        print $out_fh ",", if $i < 255;
        print $out_fh "\n" if $i % 16 == 15;
    }
    print $out_fh "};\n\n";
}

print $out_fh <<END;

#ifndef H_EBCDIC_TABLES   /* Guard against nested #includes */
#define H_EBCDIC_TABLES   1

/* This file contains definitions for various tables used in EBCDIC handling.
 * More info is in utfebcdic.h */
END

my @charsets = get_supported_code_pages();
shift @charsets;    # ASCII is the 0th, and we don't deal with that here.
foreach my $charset (@charsets) {
    my @a2e = get_a2n($charset);

    print $out_fh "\n" . get_conditional_compile_line_start($charset);
    print $out_fh "\n";

    print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
    output_table(\@a2e, "PL_a2e");

    { # Construct the inverse
        my @e2a;
        for my $i (0 .. 255) {
            $e2a[$a2e[$i]] = $i;
        }
        print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
        output_table(\@e2a, "PL_e2a");
    }

    my @i82utf = get_I8_2_utf($charset);
    print $out_fh <<END;
/* (Confusingly named) Index is $charset I8 byte; value is
 * $charset UTF-EBCDIC equivalent */
END
    output_table(\@i82utf, "PL_utf2e");

    { #Construct the inverse
        my @utf2i8;
        for my $i (0 .. 255) {
            $utf2i8[$i82utf[$i]] = $i;
        }
        print $out_fh <<END;
/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
 * $charset I8 equivalent */
END
        output_table(\@utf2i8, "PL_e2utf");
    }

    {
        my @utf8skip;

        # These are invariants or continuation bytes.
        for my $i (0 .. 0xBF) {
            $utf8skip[$i82utf[$i]] = 1;
        }

        # These are start bytes;  The skip is the number of consecutive highest
        # order 1-bits (up to 7)
        for my $i (0xC0 .. 255) {
            my $count;
            if (($i & 0b11111110) == 0b11111110) {
                $count= 7;
            }
            elsif (($i & 0b11111100) == 0b11111100) {
                $count= 6;
            }
            elsif (($i & 0b11111000) == 0b11111000) {
                $count= 5;
            }
            elsif (($i & 0b11110000) == 0b11110000) {
                $count= 4;
            }
            elsif (($i & 0b11100000) == 0b11100000) {
                $count= 3;
            }
            elsif (($i & 0b11000000) == 0b11000000) {
                $count= 2;
            }
            else {
                die "Something wrong for UTF8SKIP calculation for $i";
            }
            $utf8skip[$i82utf[$i]] = $count;
        }

        print $out_fh <<END;
/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
 * 1 for continuation.  Adapted from the shadow flags table in tr16.  The
 * entries marked 9 in tr16 are continuation bytes and are marked as length 1
 * here so that we can recover. */
END
        output_table(\@utf8skip, "PL_utf8skip");
    }

    use feature 'unicode_strings';

    {
        my @lc;
        for my $i (0 .. 255) {
            $lc[$a2e[$i]] = $a2e[ord lc chr $i];
        }
        print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
        output_table(\@lc, "PL_latin1_lc");
    }

    {
        my @uc;
        for my $i (0 .. 255) {
            my $uc = uc chr $i;
            if (length $uc > 1 || ord $uc > 255) {
                $uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
            }
            $uc[$a2e[$i]] = $a2e[ord $uc];
        }
        print $out_fh <<END;
/* Index is $charset code point; value is its uppercase equivalent.
 * The 'mod' in the name means that codepoints whose uppercase is above 255 or
 * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
END
        output_table(\@uc, "PL_mod_latin1_uc");
    }

    { # PL_fold
        my @ascii_fold;
        for my $i (0 .. 255) {  # Initialise to identity map
            $ascii_fold[$i] = $i;
        }

        # Overwrite the entries that aren't identity
        for my $chr ('A' .. 'Z') {
            $ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
        }
        for my $chr ('a' .. 'z') {
            $ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
        }
        print $out_fh <<END;
/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
 * is A-Z; all other code points map to themselves */
END
        output_table(\@ascii_fold, "PL_fold");
    }

    {
        my @latin1_fold;
        for my $i (0 .. 255) {
            my $char = chr $i;
            my $lc = lc $char;

            # lc and uc adequately proxy for fold-case pairs in this 0-255
            # range
            my $uc = uc $char;
            $uc = $char if length $uc > 1 || ord $uc > 255;
            if ($lc ne $char) {
                $latin1_fold[$a2e[$i]] = $a2e[ord $lc];
            }
            elsif ($uc ne $char) {
                $latin1_fold[$a2e[$i]] = $a2e[ord $uc];
            }
            else {
                $latin1_fold[$a2e[$i]] = $a2e[$i];
            }
        }
        print $out_fh <<END;
/* Index is $charset code point; value is its other fold-pair equivalent
 * (A => a; a => A, etc) in the 0-255 range.  If no such equivalent, value is
 * the code point itself */
END
        output_table(\@latin1_fold, "PL_fold_latin1");
    }

    print $out_fh get_conditional_compile_line_end();
}

print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";

read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
4bc3dcfa KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
	4	require 'regen/regen_lib.pl';
	5	require 'regen/charset_translations.pl';
	6
	7	# Generates the EBCDIC translation tables that were formerly hard-coded into
	8	# utfebcdic.h
	9
	10	my $out_fh = open_new('ebcdic_tables.h', '>',
	11	{style => '*', by => $0, });
	12
	13	sub output_table ($$) {
	14	my $table_ref = shift;
	15	my $name = shift;
	16
	17	die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
	18
	19	print $out_fh "EXTCONST U8 $name\[\] = {\n";
	20
	21	for my $i (0 .. 255) {
	22	printf $out_fh "%4d", $table_ref->[$i];
	23	#printf $out_fh " 0x%02X", $table_ref->[$i];
	24	print $out_fh ",", if $i < 255;
	25	print $out_fh "\n" if $i % 16 == 15;
	26	}
	27	print $out_fh "};\n\n";
	28	}
	29
	30	print $out_fh <<END;
	31
	32	#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
	33	#define H_EBCDIC_TABLES 1
	34
	35	/* This file contains definitions for various tables used in EBCDIC handling.
	36	* More info is in utfebcdic.h */
	37	END
	38
	39	my @charsets = get_supported_code_pages();
	40	shift @charsets; # ASCII is the 0th, and we don't deal with that here.
	41	foreach my $charset (@charsets) {
	42	my @a2e = get_a2n($charset);
	43
	44	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	45	print $out_fh "\n";
	46
	47	print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
	48	output_table(\@a2e, "PL_a2e");
	49
	50	{ # Construct the inverse
	51	my @e2a;
	52	for my $i (0 .. 255) {
	53	$e2a[$a2e[$i]] = $i;
	54	}
	55	print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
	56	output_table(\@e2a, "PL_e2a");
	57	}
	58
	59	my @i82utf = get_I8_2_utf($charset);
	60	print $out_fh <<END;
	61	/* (Confusingly named) Index is $charset I8 byte; value is
	62	* $charset UTF-EBCDIC equivalent */
	63	END
	64	output_table(\@i82utf, "PL_utf2e");
65
66	{ #Construct the inverse
67	my @utf2i8;
68	for my $i (0 .. 255) {
69	$utf2i8[$i82utf[$i]] = $i;
70	}
71	print $out_fh <<END;
72	/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
73	* $charset I8 equivalent */
74	END
75	output_table(\@utf2i8, "PL_e2utf");
76	}
77
78	{
79	my @utf8skip;
80
81	# These are invariants or continuation bytes.
82	for my $i (0 .. 0xBF) {
83	$utf8skip[$i82utf[$i]] = 1;
84	}
85
86	# These are start bytes; The skip is the number of consecutive highest
87	# order 1-bits (up to 7)
88	for my $i (0xC0 .. 255) {
89	my $count;
90	if (($i & 0b11111110) == 0b11111110) {
91	$count= 7;
92	}
93	elsif (($i & 0b11111100) == 0b11111100) {
94	$count= 6;
95	}
96	elsif (($i & 0b11111000) == 0b11111000) {
97	$count= 5;
98	}
99	elsif (($i & 0b11110000) == 0b11110000) {
100	$count= 4;
101	}
102	elsif (($i & 0b11100000) == 0b11100000) {
103	$count= 3;
104	}
105	elsif (($i & 0b11000000) == 0b11000000) {
106	$count= 2;
107	}
108	else {
109	die "Something wrong for UTF8SKIP calculation for $i";
110	}
111	$utf8skip[$i82utf[$i]] = $count;
112	}
113
114	print $out_fh <<END;
115	/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
116	* 1 for continuation. Adapted from the shadow flags table in tr16. The
117	* entries marked 9 in tr16 are continuation bytes and are marked as length 1
118	* here so that we can recover. */
119	END
120	output_table(\@utf8skip, "PL_utf8skip");
121	}
122
123	use feature 'unicode_strings';
124
125	{
126	my @lc;
127	for my $i (0 .. 255) {
128	$lc[$a2e[$i]] = $a2e[ord lc chr $i];
129	}
130	print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
131	output_table(\@lc, "PL_latin1_lc");
132	}
133
134	{
135	my @uc;
136	for my $i (0 .. 255) {
137	my $uc = uc chr $i;
138	if (length $uc > 1 \|\| ord $uc > 255) {
139	$uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
140	}
141	$uc[$a2e[$i]] = $a2e[ord $uc];
142	}
143	print $out_fh <<END;
144	/* Index is $charset code point; value is its uppercase equivalent.
145	* The 'mod' in the name means that codepoints whose uppercase is above 255 or
146	* longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
147	END
148	output_table(\@uc, "PL_mod_latin1_uc");
149	}
150
151	{ # PL_fold
152	my @ascii_fold;
153	for my $i (0 .. 255) { # Initialise to identity map
154	$ascii_fold[$i] = $i;
155	}
156
157	# Overwrite the entries that aren't identity
158	for my $chr ('A' .. 'Z') {
159	$ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
160	}
161	for my $chr ('a' .. 'z') {
162	$ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
163	}
164	print $out_fh <<END;
165	/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
166	* is A-Z; all other code points map to themselves */
167	END
168	output_table(\@ascii_fold, "PL_fold");
169	}
170
171	{
172	my @latin1_fold;
173	for my $i (0 .. 255) {
174	my $char = chr $i;
175	my $lc = lc $char;
176
177	# lc and uc adequately proxy for fold-case pairs in this 0-255
178	# range
179	my $uc = uc $char;
180	$uc = $char if length $uc > 1 \|\| ord $uc > 255;
181	if ($lc ne $char) {
182	$latin1_fold[$a2e[$i]] = $a2e[ord $lc];
183	}
184	elsif ($uc ne $char) {
185	$latin1_fold[$a2e[$i]] = $a2e[ord $uc];
186	}
187	else {
188	$latin1_fold[$a2e[$i]] = $a2e[$i];
189	}
190	}
191	print $out_fh <<END;
192	/* Index is $charset code point; value is its other fold-pair equivalent
193	* (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
194	* the code point itself */
195	END
196	output_table(\@latin1_fold, "PL_fold_latin1");
197	}
198
199	print $out_fh get_conditional_compile_line_end();
200	}
201
202	print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
203
204	read_only_bottom_close_and_rename($out_fh);