perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!perl -w
	2	use 5.015;
	3	use strict;
	4	use warnings;
	5	use Unicode::UCD "prop_invlist";
	6	require 'regen/regen_lib.pl';
	7
	8	# This program outputs charclass_invlists.h, which contains various inversion
	9	# lists in the form of C arrays that are to be used as-is for inversion lists.
	10	# Thus, the lists it contains are essentially pre-compiled, and need only a
	11	# light-weight fast wrapper to make them usable at run-time.
	12
	13	# As such, this code knows about the internal structure of these lists, and
	14	# any change made to that has to be done here as well. A random number stored
	15	# in the headers is used to minimize the possibility of things getting
	16	# out-of-sync, or the wrong data structure being passed. Currently that
	17	# random number is:
	18	my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;
	19
	20	my $out_fh = open_new('charclass_invlists.h', '>',
	21	{style => '*', by => $0,
	22	from => "Unicode::UCD"});
	23
	24	print $out_fh "/* See the generating file for comments */\n\n";
	25
	26	sub output_invlist ($$) {
	27	my $name = shift;
	28	my $invlist = shift; # Reference to inversion list array
	29
	30	# Output the inversion list $invlist using the name $name for it.
	31	# It is output in the exact internal form for inversion lists.
	32
	33	my $zero_or_one; # Is the last element of the header 0, or 1 ?
	34
	35	# If the first element is 0, it goes in the header, instead of the body
	36	if ($invlist->[0] == 0) {
	37	shift @$invlist;
	38
	39	$zero_or_one = 0;
	40
	41	# Add a dummy 0 at the end so that the length is constant. inversion
	42	# lists are always stored with enough room so that if they change from
	43	# beginning with 0, they don't have to grow.
	44	push @$invlist, 0;
	45	}
	46	else {
	47	$zero_or_one = 1;
	48	}
	49
	50	print $out_fh "\nUV ${name}_invlist[] = {\n";
	51
	52	print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
	53	print $out_fh "\t0,\t/* Current iteration position */\n";
	54	print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
	55	print $out_fh "\t", $zero_or_one,
	56	",\t/* 0 if this is the first element of the list proper;",
	57	"\n\t\t 1 if the next element is the first */\n";
	58
	59	# The main body are the UVs passed in to this routine. Do the final
	60	# element separately
	61	for my $i (0 .. @$invlist - 1 - 1) {
	62	print $out_fh "\t$invlist->[$i],\n";
	63	}
	64
	65	# The final element does not have a trailing comma, as C can't handle it.
	66	print $out_fh "\t$invlist->[-1]\n";
	67
	68	print $out_fh "};\n";
	69	}
	70
	71	output_invlist("Latin1", [ 0, 256 ]);
	72	output_invlist("AboveLatin1", [ 256 ]);
	73
	74	# We construct lists for all the POSIX and backslash sequence character
	75	# classes in two forms:
	76	# 1) ones which match only in the ASCII range
	77	# 2) ones which match either in the Latin1 range, or the entire Unicode range
	78	#
	79	# These get compiled in, and hence affect the memory footprint of every Perl
	80	# program, even those not using Unicode. To minimize the size, currently
	81	# the Latin1 version is generated for the beyond ASCII range except for those
	82	# lists that are quite small for the entire range, such as for \s, which is 22
	83	# UVs long plus 4 UVs (currently) for the header.
	84	#
	85	# To save even more memory, the ASCII versions could be derived from the
	86	# larger ones at runtime, saving some memory (minus the expense of the machine
	87	# instructions to do so), but these are all small anyway, so their total is
	88	# about 100 UVs.
	89	#
	90	# In the list of properties below that get generated, the L1 prefix is a fake
	91	# property that means just the Latin1 range of the full property (whose name
	92	# has an X prefix instead of L1).
	93
	94	for my $prop (qw(
	95	ASCII
	96	HorizSpace
	97	VertSpace
	98	PerlSpace
	99	XPerlSpace
	100	PosixAlnum
	101	L1PosixAlnum
	102	PosixAlpha
	103	L1PosixAlpha
	104	PosixBlank
	105	XPosixBlank
	106	PosixCntrl
	107	XPosixCntrl
	108	PosixDigit
	109	PosixGraph
	110	L1PosixGraph
	111	PosixLower
	112	L1PosixLower
	113	PosixPrint
	114	L1PosixPrint
	115	PosixPunct
	116	L1PosixPunct
	117	PosixSpace
	118	XPosixSpace
	119	PosixUpper
	120	L1PosixUpper
	121	PosixWord
	122	L1PosixWord
	123	PosixXDigit
	124	XPosixXDigit
	125	)
	126	) {
	127
	128	# For the Latin1 properties, we change to use the eXtended version of the
	129	# base property, then go through the result and get rid of everything not
	130	# in Latin1 (above 255). Actually, we retain the element that crosses the
	131	# 255/256 boundary. For example, in the Word property, there is a range
	132	# of code points that start at U+00F8 and goes through U+02C1. Instead of
	133	# artifically cutting that off at 256 because 256 is the first code point
	134	# above Latin1, we let the range go to its natural ending. That gives us
	135	# extra information with no added space taken.
	136	my $lookup_prop = $prop =~ s/^L1/X/r;
	137	my @invlist = prop_invlist($lookup_prop);
	138
	139	if ($lookup_prop ne $prop) {
	140	for my $i (0 .. @invlist - 1 - 1) {
	141	if ($invlist[$i] > 255) {
	142	splice @invlist, $i+1;
	143	last;
	144	}
	145	}
	146	}
	147
	148	output_invlist($prop, \@invlist);
	149	}
	150
	151	read_only_bottom_close_and_rename($out_fh)