[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    ALNUMC_A
    ALNUMC_L1
    ALPHA_A
    ALPHA_L1
    BLANK_A
    BLANK_L1
    CHARNAME_CONT
    CNTRL_A
    CNTRL_L1
    DIGIT_A
    GRAPH_A
    GRAPH_L1
    IDFIRST_A
    IDFIRST_L1
    LOWER_A
    LOWER_L1
    OCTAL_A
    PRINT_A
    PRINT_L1
    PSXSPC_A
    PSXSPC_L1
    PUNCT_A
    PUNCT_L1
    SPACE_A
    SPACE_L1
    UPPER_A
    UPPER_L1
    WORDCHAR_A
    WORDCHAR_L1
    XDIGIT_A
    QUOTEMETA
);

# Read in the case fold mappings.
my %folded_closure;
my $file="lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (<$fh>) {
    chomp;

    # Lines look like (without the initial '#'
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    # Get rid of comments, ignore blank or comment-only lines
    my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
    next unless length $line;
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    # Perl only deals with C and F folds
    next if $fold_type ne 'C' and $fold_type ne 'F';

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only one current one without a suffix is valid
            # over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\w ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\s/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\w/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'OCTAL') {
            $re = qr/[0-7]/;
        } elsif ($name eq 'QUOTEMETA') {
            $re = qr/\p{_Perl_Quotemeta}/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions and $file"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
	24	my @properties = qw(
	25	ALNUMC_A
	26	ALNUMC_L1
	27	ALPHA_A
	28	ALPHA_L1
	29	BLANK_A
	30	BLANK_L1
	31	CHARNAME_CONT
	32	CNTRL_A
	33	CNTRL_L1
	34	DIGIT_A
	35	GRAPH_A
	36	GRAPH_L1
	37	IDFIRST_A
	38	IDFIRST_L1
	39	LOWER_A
	40	LOWER_L1
	41	OCTAL_A
	42	PRINT_A
	43	PRINT_L1
	44	PSXSPC_A
	45	PSXSPC_L1
	46	PUNCT_A
	47	PUNCT_L1
	48	SPACE_A
	49	SPACE_L1
	50	UPPER_A
	51	UPPER_L1
	52	WORDCHAR_A
	53	WORDCHAR_L1
	54	XDIGIT_A
9a022f3a	55	QUOTEMETA
9c68f0ab KW	56	);
9c68f0ab KW	57
00c072cf KW	58	# Read in the case fold mappings.
00c072cf KW	59	my %folded_closure;
cfb8fd6a	60	my $file="lib/unicore/CaseFolding.txt";
00c072cf KW	61	open my $fh, "<", $file or die "Failed to read '$file': $!";
	62	while (<$fh>) {
	63	chomp;
	64
	65	# Lines look like (without the initial '#'
	66	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9 KW	67	# Get rid of comments, ignore blank or comment-only lines
	68	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	69	next unless length $line;
00c072cf KW	70	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	71
	72	my $from = hex $hex_from;
	73
	74	# Perl only deals with C and F folds
33e4950c	75	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf KW	76
	77	# Get each code point in the range that participates in this line's fold.
	78	# The hash has keys of each code point in the range, and values of what it
	79	# folds to and what folds to it
	80	foreach my $hex_fold (@folded) {
	81	my $fold = hex $hex_fold;
	82	push @{$folded_closure{$fold}}, $from if $fold < 256;
	83	push @{$folded_closure{$from}}, $fold if $from < 256;
	84	}
	85	}
	86
	87	# Now having read all the lines, combine them into the full closure of each
	88	# code point in the range by adding lists together that share a common element
	89	foreach my $folded (keys %folded_closure) {
	90	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	91	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	92	}
	93	}
	94
9c68f0ab KW	95	my @bits; # Bit map for each code point
9c68f0ab KW	96
00c072cf KW	97	foreach my $folded (keys %folded_closure) {
	98	$bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
	99	@{$folded_closure{$folded}};
	100	}
	101
b1909af7	102	# For each character, calculate which properties it matches.
9c68f0ab KW	103	for my $ord (0..255) {
	104	my $char = chr($ord);
	105	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	106
b1909af7 KW	107	# Look at all the properties we care about here.
9c68f0ab KW	108	for my $property (@properties) {
	109	my $name = $property;
	110
b1909af7 KW	111	# Remove the suffix to get the actual property name.
b1909af7 KW	112	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	113	# If is a latin1 version, no further checking is needed.
	114	if (! ($name =~ s/_L1$//)) {
	115
b1909af7 KW	116	# Here, isn't an _L1. If its _A, it's automatically false for
	117	# non-ascii. The only one current one without a suffix is valid
	118	# over the whole range.
9c68f0ab KW	119	next if $name =~ s/_A$// && $ord >= 128;
	120
	121	}
	122	my $re;
	123	if ($name eq 'PUNCT') {;
	124
	125	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	126	# just \pP outside it.
	127	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	128	} elsif ($name eq 'CHARNAME_CONT') {;
	129	$re = qr/[-\w ():\xa0]/;
	130	} elsif ($name eq 'SPACE') {;
	131	$re = qr/\s/;
	132	} elsif ($name eq 'IDFIRST') {
	133	$re = qr/[_\p{Alpha}]/;
	134	} elsif ($name eq 'PSXSPC') {
	135	$re = qr/[\v\p{Space}]/;
	136	} elsif ($name eq 'WORDCHAR') {
	137	$re = qr/\w/;
	138	} elsif ($name eq 'ALNUMC') {
	139	# Like \w, but no underscore
aedd44b5	140	$re = qr/\p{Alnum}/;
9c68f0ab KW	141	} elsif ($name eq 'OCTAL') {
9c68f0ab KW	142	$re = qr/[0-7]/;
9a022f3a KW	143	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	144	$re = qr/\p{_Perl_Quotemeta}/;
9c68f0ab KW	145	} else { # The remainder have the same name and values as Unicode
	146	$re = eval "qr/\\p{$name}/";
	147	use Carp;
	148	carp $@ if ! defined $re;
	149	}
	150	#print "$ord, $name $property, $re\n";
	151	if ($char =~ $re) { # Add this property if matches
	152	$bits[$ord] .= '\|' if $bits[$ord];
	153	$bits[$ord] .= "_CC_$property";
	154	}
	155	}
	156	#print __LINE__, " $ord $char $bits[$ord]\n";
	157	}
	158
	159	# Names of C0 controls
	160	my @C0 = qw (
	161	NUL
	162	SOH
	163	STX
	164	ETX
	165	EOT
	166	ENQ
	167	ACK
	168	BEL
	169	BS
	170	HT
	171	LF
	172	VT
	173	FF
	174	CR
	175	SO
	176	SI
	177	DLE
	178	DC1
	179	DC2
	180	DC3
	181	DC4
	182	NAK
	183	SYN
	184	ETB
	185	CAN
	186	EOM
	187	SUB
	188	ESC
	189	FS
	190	GS
	191	RS
	192	US
	193	);
	194
	195	# Names of C1 controls, plus the adjacent DEL
	196	my @C1 = qw(
	197	DEL
	198	PAD
	199	HOP
	200	BPH
	201	NBH
	202	IND
	203	NEL
	204	SSA
	205	ESA
	206	HTS
	207	HTJ
	208	VTS
209	PLD
210	PLU
211	RI
212	SS2
213	SS3
214	DCS
215	PU1
216	PU2
217	STS
218	CCH
219	MW
220	SPA
221	EPA
222	SOS
223	SGC
224	SCI
225	CSI
226	ST
227	OSC
228	PM
229	APC
230	);
231
cc49830d	232	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 KW	233	{style => '*', by => $0,
b1909af7 KW	234	from => "property definitions and $file"});
cfb8fd6a	235
9c68f0ab KW	236	# Output the table using fairly short names for each char.
	237	for my $ord (0..255) {
	238	my $name;
	239	if ($ord < 32) { # A C0 control
	240	$name = $C0[$ord];
	241	} elsif ($ord > 32 && $ord < 127) { # Graphic
	242	$name = "'" . chr($ord) . "'";
	243	} elsif ($ord >= 127 && $ord <= 0x9f) {
	244	$name = $C1[$ord - 127]; # A C1 control + DEL
	245	} else { # SPACE, or, if Latin1, shorten the name */
	246	use charnames();
	247	$name = charnames::viacode($ord);
	248	$name =~ s/LATIN CAPITAL LETTER //
	249	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	250	}
cfb8fd6a	251	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	252	}
9c68f0ab KW	253
cfb8fd6a	254	read_only_bottom_close_and_rename($out_fh)