perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	use v5.16.0;
	2	use strict;
	3	use warnings;
	4	require './regen/regen_lib.pl';
	5	require './regen/charset_translations.pl';
	6	use Unicode::UCD;
	7	use charnames qw(:loose);
	8
	9	my $out_fh = open_new('unicode_constants.h', '>',
	10	{style => '*', by => $0,
	11	from => "Unicode data"});
	12
	13	print $out_fh <<END;
	14
	15	#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
	16	#define PERL_UNICODE_CONSTANTS_H_ 1
	17
	18	/* This file contains #defines for the version of Unicode being used and
	19	* various Unicode code points. The values the code point macros expand to
	20	* are the native Unicode code point, or all or portions of the UTF-8 encoding
	21	* for the code point. In the former case, the macro name has the suffix
	22	* "_NATIVE"; otherwise, the suffix "_UTF8".
	23	*
	24	* The macros that have the suffix "_UTF8" may have further suffixes, as
	25	* follows:
	26	* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
	27	* representation; the value will be a numeric constant.
	28	* "_TAIL" if instead it represents all but the first byte. This, and
	29	* with no additional suffix are both string constants */
	30
	31	/*
	32	=head1 Unicode Support
	33
	34	=for apidoc AmnU\|const char *\|BOM_UTF8
	35
	36	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	37	define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
	38	is compiled on. This allows code to use a mnemonic for this character that
	39	works on both ASCII and EBCDIC platforms.
	40	S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
	41	bytes.
	42
	43	=for apidoc AmnU\|const char *\|REPLACEMENT_CHARACTER_UTF8
	44
	45	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	46	define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
	47	is compiled on. This allows code to use a mnemonic for this character that
	48	works on both ASCII and EBCDIC platforms.
	49	S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
	50	bytes.
	51
	52	=cut
	53	*/
	54
	55	END
	56
	57	my $version = Unicode::UCD::UnicodeVersion();
	58	my ($major, $dot, $dotdot) = $version =~ / (.?) \. (.?) (?: \. (.*) )? $ /x;
	59	$dotdot = 0 unless defined $dotdot;
	60
	61	print $out_fh <<END;
	62	#define UNICODE_MAJOR_VERSION $major
	63	#define UNICODE_DOT_VERSION $dot
	64	#define UNICODE_DOT_DOT_VERSION $dotdot
	65
	66	END
	67
	68	# The data are at __DATA__ in this file.
	69
	70	my @data = <DATA>;
	71
	72	foreach my $charset (get_supported_code_pages()) {
	73	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	74
	75	my @a2n = @{get_a2n($charset)};
	76
	77	for ( @data ) {
	78	chomp;
	79
	80	# Convert any '#' comments to /* ... */; empty lines and comments are
	81	# output as blank lines
	82	if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
	83	my $comment_body = $1 // "";
	84	if ($comment_body ne "") {
	85	print $out_fh "/* $comment_body */\n";
	86	}
	87	else {
	88	print $out_fh "\n";
	89	}
	90	next;
	91	}
	92
	93	unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
	94	(?: [\ ]+ ( [^ ]* ) )? # optional flag
	95	(?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
	96	/x)
	97	{
	98	die "Unexpected syntax at line $.: $_\n";
	99	}
	100
	101	my $name_or_cp = $1;
	102	my $flag = $2;
	103	my $desired_name = $3;
	104
	105	my $name;
	106	my $cp;
	107	my $U_cp; # code point in Unicode (not-native) terms
	108
	109	if ($name_or_cp =~ /^U\+(.*)/) {
	110	$U_cp = hex $1;
	111	$name = charnames::viacode($name_or_cp);
	112	if (! defined $name) {
	113	next if $flag =~ /skip_if_undef/;
	114	die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
	115	$name = "";
	116	}
	117	}
	118	else {
	119	$name = $name_or_cp;
	120	die "Unknown name '$name' at line $.: $_\n" unless defined $name;
	121	$U_cp = charnames::vianame($name =~ s/_/ /gr);
	122	}
	123
	124	$cp = ($U_cp < 256)
	125	? $a2n[$U_cp]
	126	: $U_cp;
	127
	128	$name = $desired_name if $name eq "" && $desired_name;
	129	$name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
	130
	131	my $str;
	132	my $suffix;
	133	if (defined $flag && $flag eq 'native') {
	134	die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
	135	$suffix = '_NATIVE';
	136	$str = sprintf "0x%02X", $cp; # Is a numeric constant
	137	}
	138	else {
	139	$str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
	140
	141	$suffix = '_UTF8';
	142	if (! defined $flag \|\| $flag =~ /^ string (_skip_if_undef)? $/x) {
	143	$str = "\"$str\""; # Will be a string constant
	144	} elsif ($flag eq 'tail') {
	145	$str =~ s/\\x..//; # Remove the first byte
	146	$suffix .= '_TAIL';
	147	$str = "\"$str\""; # Will be a string constant
	148	}
	149	elsif ($flag eq 'first') {
	150	$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
	151	$suffix .= '_FIRST_BYTE';
	152	$str = "0x$str"; # Is a numeric constant
	153	}
	154	else {
	155	die "Unknown flag at line $.: $_\n";
	156	}
	157	}
	158	printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
	159	}
	160
	161	my $max_PRINT_A = 0;
	162	for my $i (0x20 .. 0x7E) {
	163	$max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
	164	}
	165	printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
	166
	167	print $out_fh "\n" . get_conditional_compile_line_end();
	168
	169	}
	170
	171	use Unicode::UCD 'prop_invlist';
	172
	173	my $count = 0;
	174	my @other_invlist = prop_invlist("Other");
	175	for (my $i = 0; $i < @other_invlist; $i += 2) {
	176	$count += ((defined $other_invlist[$i+1])
	177	? $other_invlist[$i+1]
	178	: 0x110000)
	179	- $other_invlist[$i];
	180	}
	181	printf $out_fh "\n/* The number of code points not matching \\pC */\n"
	182	. "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
	183	0x110000 - $count;
	184
	185	# If this release has both the CWCM and CWCF properties, find the highest code
	186	# point which changes under any case change. We can use this to short-circuit
	187	# code
	188	my @cwcm = prop_invlist('CWCM');
	189	if (@cwcm) {
	190	my @cwcf = prop_invlist('CWCF');
	191	if (@cwcf) {
	192	my $max = ($cwcm[-1] < $cwcf[-1])
	193	? $cwcf[-1]
	194	: $cwcm[-1];
	195	printf $out_fh "\n/* The highest code point that has any type of case change */\n"
	196	. "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n",
	197	$max - 1;
	198	}
	199	}
	200
	201	print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";
	202
	203	read_only_bottom_close_and_rename($out_fh);
	204
	205	# DATA FORMAT
	206	#
	207	# Note that any apidoc comments you want in the file need to be added to one
	208	# of the prints above
	209	#
	210	# A blank line is output as-is.
	211	# Comments (lines whose first non-blank is a '#') are converted to C-style,
	212	# though empty comments are converted to blank lines. Otherwise, each line
	213	# represents one #define, and begins with either a Unicode character name with
	214	# the blanks and dashes in it squeezed out or replaced by underscores; or it
	215	# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
	216	# case, the name will be looked-up to use as the name of the macro. In either
	217	# case, the macro name will have suffixes as listed above, and all blanks and
	218	# dashes will be replaced by underscores.
	219	#
	220	# Each line may optionally have one of the following flags on it, separated by
	221	# white space from the initial token.
	222	# string indicates that the output is to be of the string form
	223	# described in the comments above that are placed in the file.
	224	# string_skip_ifundef is the same as 'string', but instead of dying if the
	225	# code point doesn't exist, the line is just skipped: no output is
	226	# generated for it
	227	# first indicates that the output is to be of the FIRST_BYTE form.
	228	# tail indicates that the output is of the _TAIL form.
	229	# native indicates that the output is the code point, converted to the
	230	# platform's native character set if applicable
	231	#
	232	# If the code point has no official name, the desired name may be appended
	233	# after the flag, which will be ignored if there is an official name.
	234	#
	235	# This program is used to make it convenient to create compile time constants
	236	# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
	237	# having to figure things out.
	238
	239	__DATA__
	240	U+017F string
	241
	242	U+0300 string
	243	U+0307 string
	244
	245	U+1E9E string_skip_if_undef
	246
	247	U+FB05 string
	248	U+FB06 string
	249	U+0130 string
	250	U+0131 string
	251
	252	U+2010 string
	253	BOM first
	254	BOM tail
	255
	256	BOM string
	257
	258	U+FFFD string
	259
	260	U+10FFFF string MAX_UNICODE
	261
	262	NBSP native
	263	NBSP string
	264
	265	DEL native
	266	CR native
	267	LF native
	268	VT native
	269	ESC native
	270	U+00DF native
	271	U+00E5 native
	272	U+00C5 native
	273	U+00FF native
	274	U+00B5 native