perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	use v5.16.0;
	2	use strict;
	3	use warnings;
	4	require 'regen/regen_lib.pl';
	5	require 'regen/charset_translations.pl';
	6
	7	# Generates the EBCDIC translation tables that were formerly hard-coded into
	8	# utfebcdic.h
	9
	10	my $out_fh = open_new('ebcdic_tables.h', '>',
	11	{style => '*', by => $0, });
	12
	13	sub output_table ($$) {
	14	my $table_ref = shift;
	15	my $name = shift;
	16
	17	die "Requres 256 entries in table $name, got @$table_ref" if @$table_ref != 256;
	18
	19	print $out_fh "EXTCONST U8 $name\[\] = {\n";
	20
	21	for my $i (0 .. 255) {
	22	printf $out_fh "%4d", $table_ref->[$i];
	23	#printf $out_fh " 0x%02X", $table_ref->[$i];
	24	print $out_fh ",", if $i < 255;
	25	print $out_fh "\n" if $i % 16 == 15;
	26	}
	27	print $out_fh "};\n\n";
	28	}
	29
	30	print $out_fh <<END;
	31
	32	#ifndef H_EBCDIC_TABLES /* Guard against nested #includes */
	33	#define H_EBCDIC_TABLES 1
	34
	35	/* This file contains definitions for various tables used in EBCDIC handling.
	36	* More info is in utfebcdic.h */
	37	END
	38
	39	my @charsets = get_supported_code_pages();
	40	shift @charsets; # ASCII is the 0th, and we don't deal with that here.
	41	foreach my $charset (@charsets) {
	42	# we process the whole array several times, make a copy
	43	my @a2e = @{get_a2n($charset)};
	44
	45	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	46	print $out_fh "\n";
	47
	48	print $out_fh "/* Index is ASCII platform code point; value is $charset equivalent */\n";
	49	output_table(\@a2e, "PL_a2e");
	50
	51	{ # Construct the inverse
	52	my @e2a;
	53	for my $i (0 .. 255) {
	54	$e2a[$a2e[$i]] = $i;
	55	}
	56	print $out_fh "/* Index is $charset code point; value is ASCII platform equivalent */\n";
	57	output_table(\@e2a, "PL_e2a");
	58	}
	59
	60	my @i82utf = @{get_I8_2_utf($charset)};
	61	print $out_fh <<END;
	62	/* (Confusingly named) Index is $charset I8 byte; value is
	63	* $charset UTF-EBCDIC equivalent */
	64	END
	65	output_table(\@i82utf, "PL_utf2e");
	66
	67	{ #Construct the inverse
	68	my @utf2i8;
	69	for my $i (0 .. 255) {
	70	$utf2i8[$i82utf[$i]] = $i;
	71	}
	72	print $out_fh <<END;
	73	/* (Confusingly named) Index is $charset UTF-EBCDIC byte; value is
	74	* $charset I8 equivalent */
	75	END
	76	output_table(\@utf2i8, "PL_e2utf");
	77	}
	78
	79	{
	80	my @utf8skip;
	81
	82	# These are invariants or continuation bytes.
	83	for my $i (0 .. 0xBF) {
	84	$utf8skip[$i82utf[$i]] = 1;
	85	}
	86
	87	# These are start bytes; The skip is the number of consecutive highest
	88	# order 1-bits (up to 7)
	89	for my $i (0xC0 .. 255) {
	90	my $count;
	91	if (($i & 0b11111110) == 0b11111110) {
	92	$count= 7;
	93	}
	94	elsif (($i & 0b11111100) == 0b11111100) {
	95	$count= 6;
	96	}
	97	elsif (($i & 0b11111000) == 0b11111000) {
	98	$count= 5;
	99	}
	100	elsif (($i & 0b11110000) == 0b11110000) {
	101	$count= 4;
	102	}
	103	elsif (($i & 0b11100000) == 0b11100000) {
	104	$count= 3;
	105	}
	106	elsif (($i & 0b11000000) == 0b11000000) {
	107	$count= 2;
	108	}
	109	else {
	110	die "Something wrong for UTF8SKIP calculation for $i";
	111	}
	112	$utf8skip[$i82utf[$i]] = $count;
	113	}
	114
	115	print $out_fh <<END;
	116	/* Index is $charset UTF-EBCDIC byte; value is UTF8SKIP for start bytes;
	117	* 1 for continuation. Adapted from the shadow flags table in tr16. The
	118	* entries marked 9 in tr16 are continuation bytes and are marked as length 1
	119	* here so that we can recover. */
	120	END
	121	output_table(\@utf8skip, "PL_utf8skip");
	122	}
	123
	124	use feature 'unicode_strings';
	125
	126	{
	127	my @lc;
	128	for my $i (0 .. 255) {
	129	$lc[$a2e[$i]] = $a2e[ord lc chr $i];
	130	}
	131	print $out_fh "/* Index is $charset code point; value is its lowercase equivalent */\n";
	132	output_table(\@lc, "PL_latin1_lc");
	133	}
	134
	135	{
	136	my @uc;
	137	for my $i (0 .. 255) {
	138	my $uc = uc chr $i;
	139	if (length $uc > 1 \|\| ord $uc > 255) {
	140	$uc = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}";
	141	}
	142	$uc[$a2e[$i]] = $a2e[ord $uc];
	143	}
	144	print $out_fh <<END;
	145	/* Index is $charset code point; value is its uppercase equivalent.
	146	* The 'mod' in the name means that codepoints whose uppercase is above 255 or
	147	* longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */
	148	END
	149	output_table(\@uc, "PL_mod_latin1_uc");
	150	}
	151
	152	{ # PL_fold
	153	my @ascii_fold;
	154	for my $i (0 .. 255) { # Initialise to identity map
	155	$ascii_fold[$i] = $i;
	156	}
	157
	158	# Overwrite the entries that aren't identity
	159	for my $chr ('A' .. 'Z') {
	160	$ascii_fold[$a2e[ord $chr]] = $a2e[ord lc $chr];
	161	}
	162	for my $chr ('a' .. 'z') {
	163	$ascii_fold[$a2e[ord $chr]] = $a2e[ord uc $chr];
	164	}
	165	print $out_fh <<END;
	166	/* Index is $charset code point; For A-Z, value is a-z; for a-z, value
	167	* is A-Z; all other code points map to themselves */
	168	END
	169	output_table(\@ascii_fold, "PL_fold");
	170	}
	171
	172	{
	173	my @latin1_fold;
	174	for my $i (0 .. 255) {
	175	my $char = chr $i;
	176	my $lc = lc $char;
	177
	178	# lc and uc adequately proxy for fold-case pairs in this 0-255
	179	# range
	180	my $uc = uc $char;
	181	$uc = $char if length $uc > 1 \|\| ord $uc > 255;
	182	if ($lc ne $char) {
	183	$latin1_fold[$a2e[$i]] = $a2e[ord $lc];
	184	}
	185	elsif ($uc ne $char) {
	186	$latin1_fold[$a2e[$i]] = $a2e[ord $uc];
	187	}
	188	else {
	189	$latin1_fold[$a2e[$i]] = $a2e[$i];
	190	}
	191	}
	192	print $out_fh <<END;
	193	/* Index is $charset code point; value is its other fold-pair equivalent
	194	* (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is
	195	* the code point itself */
	196	END
	197	output_table(\@latin1_fold, "PL_fold_latin1");
	198	}
	199
	200	print $out_fh get_conditional_compile_line_end();
	201	}
	202
	203	print $out_fh "\n#endif /* H_EBCDIC_TABLES */\n";
	204
	205	read_only_bottom_close_and_rename($out_fh);