perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!perl -w
	2	use v5.15.8;
	3	use strict;
	4	use warnings;
	5	require 'regen/regen_lib.pl';
	6
	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
	9	# code point at the corresponding position in the table array has. The first
	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
	11	# an application to see if the code point "i" has a particular property, it
	12	# just does
	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
	21	# This program need be run only when adding new properties to it, or upon a
	22	# new Unicode release, to make sure things haven't been changed by it.
	23
	24	my @properties = qw(
	25	NONLATIN1_FOLD
	26	ALPHANUMERIC
	27	ALPHA
	28	ASCII
	29	BLANK
	30	CASED
	31	CHARNAME_CONT
	32	CNTRL
	33	DIGIT
	34	GRAPH
	35	IDFIRST
	36	LOWER
	37	NON_FINAL_FOLD
	38	PRINT
	39	PSXSPC
	40	PUNCT
	41	QUOTEMETA
	42	SPACE
	43	UPPER
	44	WORDCHAR
	45	XDIGIT
	46	VERTSPACE
	47	IS_IN_SOME_FOLD
	48	BACKSLASH_FOO_LBRACE_IS_META
	49	);
	50
	51	# Read in the case fold mappings.
	52	my %folded_closure;
	53	my @hex_non_final_folds;
	54	my @folds;
	55	use Unicode::UCD;
	56
	57	BEGIN { # Have to do this at compile time because using user-defined \p{property}
	58
	59	# Use the Unicode data file if we are on an ASCII platform (which its data
	60	# is for), and it is in the modern format (starting in Unicode 3.1.0) and
	61	# it is available. This avoids being affected by potential bugs
	62	# introduced by other layers of Perl
	63	my $file="lib/unicore/CaseFolding.txt";
	64
	65	if (ord('A') == 65
	66	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	67	&& open my $fh, "<", $file)
	68	{
	69	@folds = <$fh>;
	70	}
	71	else {
	72	my ($invlist_ref, $invmap_ref, undef, $default)
	73	= Unicode::UCD::prop_invmap('Case_Folding');
	74	for my $i (0 .. @$invlist_ref - 1 - 1) {
	75	next if $invmap_ref->[$i] == $default;
	76	my $adjust = -1;
	77	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	78	$adjust++;
	79
	80	# Single-code point maps go to a 'C' type
	81	if (! ref $invmap_ref->[$i]) {
	82	push @folds, sprintf("%04X; C; %04X\n",
	83	$j,
	84	$invmap_ref->[$i] + $adjust);
	85	}
	86	else { # Multi-code point maps go to 'F'. prop_invmap()
	87	# guarantees that no adjustment is needed for these,
	88	# as the range will contain just one element
	89	push @folds, sprintf("%04X; F; %s\n",
	90	$j,
	91	join " ", map { sprintf "%04X", $_ }
	92	@{$invmap_ref->[$i]});
	93	}
	94	}
	95	}
	96	}
	97
	98	for (@folds) {
	99	chomp;
	100
	101	# Lines look like (without the initial '#'
	102	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	103	# Get rid of comments, ignore blank or comment-only lines
	104	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	105	next unless length $line;
	106	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	107
	108	my $from = hex $hex_from;
	109
	110	# Perl only deals with C and F folds
	111	next if $fold_type ne 'C' and $fold_type ne 'F';
	112
	113	# Get each code point in the range that participates in this line's fold.
	114	# The hash has keys of each code point in the range, and values of what it
	115	# folds to and what folds to it
	116	for my $i (0 .. @folded - 1) {
	117	my $hex_fold = $folded[$i];
	118	my $fold = hex $hex_fold;
	119	push @{$folded_closure{$fold}}, $from if $fold < 256;
	120	push @{$folded_closure{$from}}, $fold if $from < 256;
	121
	122	if ($i < @folded-1
	123	&& $fold < 256
	124	&& ! grep { $_ eq $hex_fold } @hex_non_final_folds)
	125	{
	126	push @hex_non_final_folds, $hex_fold;
	127
	128	# Also add the upper case, which in the latin1 range folds to
	129	# $fold
	130	push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
	131	}
	132	}
	133	}
	134
	135	# Now having read all the lines, combine them into the full closure of each
	136	# code point in the range by adding lists together that share a common
	137	# element
	138	foreach my $folded (keys %folded_closure) {
	139	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	140	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	141	}
	142	}
	143	}
	144
	145	sub Is_Non_Latin1_Fold {
	146	my @return;
	147
	148	foreach my $folded (keys %folded_closure) {
	149	push @return, sprintf("%X", $folded), if grep { $_ > 255 }
	150	@{$folded_closure{$folded}};
	151	}
	152	return join("\n", @return) . "\n";
	153	}
	154
	155	sub Is_Non_Final_Fold {
	156	return join("\n", @hex_non_final_folds) . "\n";
	157	}
	158
	159	my @bits; # Bit map for each code point
	160
	161	# For each character, calculate which properties it matches.
	162	for my $ord (0..255) {
	163	my $char = chr($ord);
	164	utf8::upgrade($char); # Important to use Unicode semantics!
	165
	166	# Look at all the properties we care about here.
	167	for my $property (@properties) {
	168	my $name = $property;
	169
	170	# Remove the suffix to get the actual property name.
	171	# Currently the suffixes are '_L1', '_A', and none.
	172	# If is a latin1 version, no further checking is needed.
	173	if (! ($name =~ s/_L1$//)) {
	174
	175	# Here, isn't an _L1. If its _A, it's automatically false for
	176	# non-ascii. The only current ones (besides ASCII) without a
	177	# suffix are valid over the whole range.
	178	next if $name =~ s/_A$// && $ord >= 128;
	179
	180	}
	181	my $re;
	182	if ($name eq 'PUNCT') {;
	183
	184	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	185	# just \pP outside it.
	186	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	187	} elsif ($name eq 'CHARNAME_CONT') {;
	188	$re = qr/\p{_Perl_Charname_Continue}/,
	189	} elsif ($name eq 'SPACE') {;
	190	$re = qr/\p{XPerlSpace}/;
	191	} elsif ($name eq 'IDFIRST') {
	192	$re = qr/[_\p{Alpha}]/;
	193	} elsif ($name eq 'PSXSPC') {
	194	$re = qr/[\v\p{Space}]/;
	195	} elsif ($name eq 'WORDCHAR') {
	196	$re = qr/\p{XPosixWord}/;
	197	} elsif ($name eq 'ALPHANUMERIC') {
	198	# Like \w, but no underscore
	199	$re = qr/\p{Alnum}/;
	200	} elsif ($name eq 'QUOTEMETA') {
	201	$re = qr/\p{_Perl_Quotemeta}/;
	202	} elsif ($name eq 'NONLATIN1_FOLD') {
	203	$re = qr/\p{Is_Non_Latin1_Fold}/;
	204	} elsif ($name eq 'NON_FINAL_FOLD') {
	205	$re = qr/\p{Is_Non_Final_Fold}/;
	206	} elsif ($name eq 'IS_IN_SOME_FOLD') {
	207	$re = qr/\p{_Perl_Any_Folds}/;
	208	} elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
	209
	210	# This is true for FOO where FOO is the varying character in:
	211	# \a{, \b{, \c{, ...
	212	# and the sequence has non-literal meaning to Perl; so it is true
	213	# for 'x' because \x{ is special, but not 'a' because \a{ isn't.
	214	$re = qr/[gkNopPx]/;
	215	} else { # The remainder have the same name and values as Unicode
	216	$re = eval "qr/\\p{$name}/";
	217	use Carp;
	218	carp $@ if ! defined $re;
	219	}
	220	#print "$ord, $name $property, $re\n";
	221	if ($char =~ $re) { # Add this property if matches
	222	$bits[$ord] .= '\|' if $bits[$ord];
	223	$bits[$ord] .= "(1U<<_CC_$property)";
	224	}
	225	}
	226	#print __LINE__, " $ord $char $bits[$ord]\n";
	227	}
	228
	229	# Names of C0 controls
	230	my @C0 = qw (
	231	NUL
	232	SOH
	233	STX
	234	ETX
	235	EOT
	236	ENQ
	237	ACK
	238	BEL
	239	BS
	240	HT
	241	LF
	242	VT
	243	FF
	244	CR
	245	SO
	246	SI
	247	DLE
	248	DC1
	249	DC2
	250	DC3
	251	DC4
	252	NAK
	253	SYN
	254	ETB
	255	CAN
	256	EOM
	257	SUB
	258	ESC
	259	FS
	260	GS
	261	RS
	262	US
	263	);
	264
	265	# Names of C1 controls, plus the adjacent DEL
	266	my @C1 = qw(
	267	DEL
	268	PAD
	269	HOP
	270	BPH
	271	NBH
	272	IND
	273	NEL
	274	SSA
	275	ESA
	276	HTS
	277	HTJ
	278	VTS
	279	PLD
	280	PLU
	281	RI
	282	SS2
	283	SS3
	284	DCS
	285	PU1
	286	PU2
	287	STS
	288	CCH
	289	MW
	290	SPA
	291	EPA
	292	SOS
	293	SGC
	294	SCI
	295	CSI
	296	ST
	297	OSC
	298	PM
	299	APC
	300	);
	301
	302	my $out_fh = open_new('l1_char_class_tab.h', '>',
	303	{style => '*', by => $0,
	304	from => "property definitions"});
	305
	306	# Output the table using fairly short names for each char.
	307	for my $ord (0..255) {
	308	my $name;
	309	if ($ord < 32) { # A C0 control
	310	$name = $C0[$ord];
	311	} elsif ($ord > 32 && $ord < 127) { # Graphic
	312	$name = "'" . chr($ord) . "'";
	313	} elsif ($ord >= 127 && $ord <= 0x9f) {
	314	$name = $C1[$ord - 127]; # A C1 control + DEL
	315	} else { # SPACE, or, if Latin1, shorten the name */
	316	use charnames();
	317	$name = charnames::viacode($ord);
	318	$name =~ s/LATIN CAPITAL LETTER //
	319	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	320	}
	321	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
	322	}
	323
	324	read_only_bottom_close_and_rename($out_fh)