[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    ALNUMC_A
    ALNUMC_L1
    ALPHA_A
    ALPHA_L1
    BLANK_A
    BLANK_L1
    CHARNAME_CONT
    CNTRL_A
    CNTRL_L1
    DIGIT_A
    GRAPH_A
    GRAPH_L1
    IDFIRST_A
    IDFIRST_L1
    LOWER_A
    LOWER_L1
    OCTAL_A
    PRINT_A
    PRINT_L1
    PSXSPC_A
    PSXSPC_L1
    PUNCT_A
    PUNCT_L1
    SPACE_A
    SPACE_L1
    UPPER_A
    UPPER_L1
    WORDCHAR_A
    WORDCHAR_L1
    XDIGIT_A
);

# Read in the case fold mappings.
my %folded_closure;
my $file="lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (<$fh>) {
    chomp;

    # Lines look like (without the initial '#'
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    # Get rid of comments, ignore blank or comment-only lines
    my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
    next unless length $line;
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    # Perl only deals with C and F folds
    next if $fold_type ne 'C' and $fold_type ne 'F';

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only one current one without a suffix is valid
            # over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\w ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\s/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\w/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'OCTAL') {
            $re = qr/[0-7]/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions and $file"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
	24	my @properties = qw(
	25	ALNUMC_A
	26	ALNUMC_L1
	27	ALPHA_A
	28	ALPHA_L1
	29	BLANK_A
	30	BLANK_L1
	31	CHARNAME_CONT
	32	CNTRL_A
	33	CNTRL_L1
	34	DIGIT_A
	35	GRAPH_A
	36	GRAPH_L1
	37	IDFIRST_A
	38	IDFIRST_L1
	39	LOWER_A
	40	LOWER_L1
	41	OCTAL_A
	42	PRINT_A
	43	PRINT_L1
	44	PSXSPC_A
	45	PSXSPC_L1
	46	PUNCT_A
	47	PUNCT_L1
	48	SPACE_A
	49	SPACE_L1
	50	UPPER_A
	51	UPPER_L1
	52	WORDCHAR_A
	53	WORDCHAR_L1
	54	XDIGIT_A
	55	);
	56
00c072cf KW	57	# Read in the case fold mappings.
00c072cf KW	58	my %folded_closure;
cfb8fd6a	59	my $file="lib/unicore/CaseFolding.txt";
00c072cf KW	60	open my $fh, "<", $file or die "Failed to read '$file': $!";
	61	while (<$fh>) {
	62	chomp;
	63
	64	# Lines look like (without the initial '#'
	65	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9 KW	66	# Get rid of comments, ignore blank or comment-only lines
	67	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	68	next unless length $line;
00c072cf KW	69	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	70
	71	my $from = hex $hex_from;
	72
	73	# Perl only deals with C and F folds
33e4950c	74	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf KW	75
	76	# Get each code point in the range that participates in this line's fold.
	77	# The hash has keys of each code point in the range, and values of what it
	78	# folds to and what folds to it
	79	foreach my $hex_fold (@folded) {
	80	my $fold = hex $hex_fold;
	81	push @{$folded_closure{$fold}}, $from if $fold < 256;
	82	push @{$folded_closure{$from}}, $fold if $from < 256;
	83	}
	84	}
	85
	86	# Now having read all the lines, combine them into the full closure of each
	87	# code point in the range by adding lists together that share a common element
	88	foreach my $folded (keys %folded_closure) {
	89	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	90	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	91	}
	92	}
	93
9c68f0ab KW	94	my @bits; # Bit map for each code point
9c68f0ab KW	95
00c072cf KW	96	foreach my $folded (keys %folded_closure) {
	97	$bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
	98	@{$folded_closure{$folded}};
	99	}
	100
b1909af7	101	# For each character, calculate which properties it matches.
9c68f0ab KW	102	for my $ord (0..255) {
	103	my $char = chr($ord);
	104	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	105
b1909af7 KW	106	# Look at all the properties we care about here.
9c68f0ab KW	107	for my $property (@properties) {
	108	my $name = $property;
	109
b1909af7 KW	110	# Remove the suffix to get the actual property name.
b1909af7 KW	111	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	112	# If is a latin1 version, no further checking is needed.
	113	if (! ($name =~ s/_L1$//)) {
	114
b1909af7 KW	115	# Here, isn't an _L1. If its _A, it's automatically false for
	116	# non-ascii. The only one current one without a suffix is valid
	117	# over the whole range.
9c68f0ab KW	118	next if $name =~ s/_A$// && $ord >= 128;
	119
	120	}
	121	my $re;
	122	if ($name eq 'PUNCT') {;
	123
	124	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	125	# just \pP outside it.
	126	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	127	} elsif ($name eq 'CHARNAME_CONT') {;
	128	$re = qr/[-\w ():\xa0]/;
	129	} elsif ($name eq 'SPACE') {;
	130	$re = qr/\s/;
	131	} elsif ($name eq 'IDFIRST') {
	132	$re = qr/[_\p{Alpha}]/;
	133	} elsif ($name eq 'PSXSPC') {
	134	$re = qr/[\v\p{Space}]/;
	135	} elsif ($name eq 'WORDCHAR') {
	136	$re = qr/\w/;
	137	} elsif ($name eq 'ALNUMC') {
	138	# Like \w, but no underscore
aedd44b5	139	$re = qr/\p{Alnum}/;
9c68f0ab KW	140	} elsif ($name eq 'OCTAL') {
	141	$re = qr/[0-7]/;
	142	} else { # The remainder have the same name and values as Unicode
	143	$re = eval "qr/\\p{$name}/";
	144	use Carp;
	145	carp $@ if ! defined $re;
	146	}
	147	#print "$ord, $name $property, $re\n";
	148	if ($char =~ $re) { # Add this property if matches
	149	$bits[$ord] .= '\|' if $bits[$ord];
	150	$bits[$ord] .= "_CC_$property";
	151	}
	152	}
	153	#print __LINE__, " $ord $char $bits[$ord]\n";
	154	}
	155
	156	# Names of C0 controls
	157	my @C0 = qw (
	158	NUL
	159	SOH
	160	STX
	161	ETX
	162	EOT
	163	ENQ
	164	ACK
	165	BEL
	166	BS
	167	HT
	168	LF
	169	VT
	170	FF
	171	CR
	172	SO
	173	SI
	174	DLE
	175	DC1
	176	DC2
	177	DC3
	178	DC4
	179	NAK
	180	SYN
	181	ETB
	182	CAN
	183	EOM
	184	SUB
	185	ESC
	186	FS
	187	GS
	188	RS
	189	US
	190	);
	191
	192	# Names of C1 controls, plus the adjacent DEL
	193	my @C1 = qw(
	194	DEL
	195	PAD
	196	HOP
	197	BPH
	198	NBH
	199	IND
	200	NEL
	201	SSA
	202	ESA
	203	HTS
204	HTJ
205	VTS
206	PLD
207	PLU
208	RI
209	SS2
210	SS3
211	DCS
212	PU1
213	PU2
214	STS
215	CCH
216	MW
217	SPA
218	EPA
219	SOS
220	SGC
221	SCI
222	CSI
223	ST
224	OSC
225	PM
226	APC
227	);
228
cc49830d	229	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 KW	230	{style => '*', by => $0,
b1909af7 KW	231	from => "property definitions and $file"});
cfb8fd6a	232
9c68f0ab KW	233	# Output the table using fairly short names for each char.
	234	for my $ord (0..255) {
	235	my $name;
	236	if ($ord < 32) { # A C0 control
	237	$name = $C0[$ord];
	238	} elsif ($ord > 32 && $ord < 127) { # Graphic
	239	$name = "'" . chr($ord) . "'";
	240	} elsif ($ord >= 127 && $ord <= 0x9f) {
	241	$name = $C1[$ord - 127]; # A C1 control + DEL
	242	} else { # SPACE, or, if Latin1, shorten the name */
	243	use charnames();
	244	$name = charnames::viacode($ord);
	245	$name =~ s/LATIN CAPITAL LETTER //
	246	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	247	}
cfb8fd6a	248	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	249	}
9c68f0ab KW	250
cfb8fd6a	251	read_only_bottom_close_and_rename($out_fh)