[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program writes l1_char_class_tab.h, which contains the 256 lines that
# form the guts of the PL_charclass table.  Each line is a bit map of
# properties that the Unicode code point at the corresponding position in the
# table array has.  The first line corresponds to code point U+0000, NULL, the
# last line to U+00FF.  For an application to see if the code point "i" has a
# particular property, it just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, 'property' is the corresponding property, and 'suffix' is
# either 'A', meaning the property is true only if the corresponding code
# point is ASCII, or 'L1', meaning that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# one of these suffixes does not have different forms for the two ranges.

# The data in the table is pretty well set in stone, so that this program need
# be run only when adding new properties to it.

# Every character-class bit this program can emit, minus the '_CC_' prefix.
# A name ending in '_A' is only tested against ASCII code points, one ending
# in '_L1' against the whole 0..255 range; a name with neither suffix
# (currently just CHARNAME_CONT) is also tested over the whole range.  The
# order of this list is the order in which the bits are OR'ed together in each
# output line, so the '_A'/'_L1' variants are kept side by side.
my @properties = qw(
    ALNUMC_A        ALNUMC_L1
    ALPHA_A         ALPHA_L1
    BLANK_A         BLANK_L1
    CHARNAME_CONT
    CNTRL_A         CNTRL_L1
    DIGIT_A
    GRAPH_A         GRAPH_L1
    IDFIRST_A       IDFIRST_L1
    LOWER_A         LOWER_L1
    OCTAL_A
    PRINT_A         PRINT_L1
    PSXSPC_A        PSXSPC_L1
    PUNCT_A         PUNCT_L1
    SPACE_A         SPACE_L1
    UPPER_A         UPPER_L1
    WORDCHAR_A      WORDCHAR_L1
    XDIGIT_A
);

# Read in the case fold mappings.  For every code point below 256 that takes
# part in a 'C' fold, %folded_closure maps it to the list of code points it
# folds to plus those that fold to it.
my %folded_closure;
my $file="lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (my $entry = <$fh>) {
    chomp $entry;

    # Lines look like (without the initial '#')
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    # Only the part in front of the trailing comment is wanted.  split()
    # returns an empty list for an empty line, which leaves $line undefined,
    # so that case has to be checked for before comparing it.
    my ($line) = split / \s+ \# \s+ /x, $entry;
    next if ! defined $line || $line eq "" || substr($line, 0, 1) eq '#';
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    # Only the 'C' folds are used here; every other fold type (including
    # 'F') is currently skipped.
    next if $fold_type ne 'C';

    my $from = hex $hex_from;

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}
close $fh;

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element.
# (The lists may end up containing duplicates; only membership matters below.)
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

# A code point in the range whose closure reaches outside it (above 255)
# starts its bit map with the _CC_NONLATIN1_FOLD flag.
foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

use Carp;

# Patterns for the properties whose definition is not simply the Unicode
# property of the same name.  Patterns for the remaining properties are added
# to this hash the first time each one is needed, so that every pattern is
# compiled only once rather than once per code point.
my %re_for = (

    # Sadly, this is inconsistent: \pP and \pS for the ascii range,
    # just \pP outside it.
    PUNCT         => qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/,
    CHARNAME_CONT => qr/[-\w ():\xa0]/,
    SPACE         => qr/\s/,
    IDFIRST       => qr/[_\p{Alpha}]/,
    PSXSPC        => qr/[\v\p{Space}]/,
    WORDCHAR      => qr/\w/,

    # Like \w, but no underscore
    ALNUMC        => qr/\p{Alnum}/,
    OCTAL         => qr/[0-7]/,
);

# For each code point in the range, append to its entry in @bits the name of
# every property it matches, separating the names with '|'.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!
    for my $property (@properties) {
        my $name = $property;

        # The property name that corresponds to this doesn't have a suffix.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an L1.  It's either a special one or the suffix ends
            # in _A.  In the latter case, it's automatically false for
            # non-ascii.  The one current special is valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }

        # The remainder have the same name and values as Unicode
        if (! exists $re_for{$name}) {
            my $re = eval "qr/\\p{$name}/";

            # Carrying on with an undefined pattern would match against the
            # empty pattern and could set bits that don't belong, so a
            # property that can't be compiled is fatal.
            croak "Unable to compile \\p{$name} for '$property': $@"
                                                        if ! defined $re;
            $re_for{$name} = $re;
        }

        #print "$ord, $name $property, $re_for{$name}\n";
        if ($char =~ $re_for{$name}) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Short names of the C0 controls, indexed directly by code point (0..31).
my @C0 = qw(
    NUL SOH STX ETX EOT ENQ ACK BEL
    BS  HT  LF  VT  FF  CR  SO  SI
    DLE DC1 DC2 DC3 DC4 NAK SYN ETB
    CAN EOM SUB ESC FS  GS  RS  US
);

# Short names of DEL (code point 127) followed by the C1 controls
# (0x80..0x9F); the index into this array is the code point minus 127.
my @C1 = qw(
    DEL
    PAD HOP BPH NBH IND NEL SSA ESA
    HTS HTJ VTS PLD PLU RI  SS2 SS3
    DCS PU1 PU2 STS CCH MW  SPA EPA
    SOS SGC SCI CSI ST  OSC PM  APC
);

use charnames ();

# safer_open(), read_only_top() and read_only_bottom_close_and_rename() are
# not defined in this file; presumably they come from regen/regen_lib.pl and
# write to the '-new' file before renaming it into place -- confirm there.
my $out_fh = safer_open('l1_char_class_tab.h-new', 'l1_char_class_tab.h');
print $out_fh read_only_top(lang => 'C', style => '*', by => $0, from => $file);

# Output the table using fairly short names for each char.  Each line is a C
# comment giving the code point and its name, followed by its bit map.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }

    # A code point that matched no property has no entry in @bits; emit 0 for
    # it so that the line is still a valid C initializer.
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord] // 0;
}

read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab KW	6
	7	# This program outputs the 256 lines that form the guts of the PL_charclass
	8	# table. The output should be used to manually replace the table contents in
8d4ab2a1 KW	9	# l1_charclass_tab.h. Each line is a bit map of properties that the Unicode
	10	# code point at the corresponding position in the table array has. The first
	11	# line corresponds to code point U+0000, NULL, the last line to U=00FF. For
	12	# an application to see if the code point "i" has a particular property, it
	13	# just does
9c68f0ab KW	14	# 'PL_charclass[i] & BIT'
	15	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	16	# character class, and 'property' is the corresponding property, and 'suffix'
	17	# is one of '_A' to mean the property is true only if the corresponding code
	18	# point is ASCII, and '_L1' means that the range includes any Latin1
	19	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	20	# these suffixes does not have different forms for both ranges.
	21
	22	# The data in the table is pretty well set in stone, so that this program need
	23	# be run only when adding new properties to it.
	24
	25	my @properties = qw(
	26	ALNUMC_A
	27	ALNUMC_L1
	28	ALPHA_A
	29	ALPHA_L1
	30	BLANK_A
	31	BLANK_L1
	32	CHARNAME_CONT
	33	CNTRL_A
	34	CNTRL_L1
	35	DIGIT_A
	36	GRAPH_A
	37	GRAPH_L1
	38	IDFIRST_A
	39	IDFIRST_L1
	40	LOWER_A
	41	LOWER_L1
	42	OCTAL_A
	43	PRINT_A
	44	PRINT_L1
	45	PSXSPC_A
	46	PSXSPC_L1
	47	PUNCT_A
	48	PUNCT_L1
	49	SPACE_A
	50	SPACE_L1
	51	UPPER_A
	52	UPPER_L1
	53	WORDCHAR_A
	54	WORDCHAR_L1
	55	XDIGIT_A
	56	);
	57
00c072cf KW	58	# Read in the case fold mappings.
00c072cf KW	59	my %folded_closure;
cfb8fd6a	60	my $file="lib/unicore/CaseFolding.txt";
00c072cf KW	61	open my $fh, "<", $file or die "Failed to read '$file': $!";
	62	while (<$fh>) {
	63	chomp;
	64
	65	# Lines look like (without the initial '#'
	66	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	67	my ($line, $comment) = split / \s+ \# \s+ /x, $_;
	68	next if $line eq "" \|\| substr($line, 0, 1) eq '#';
	69	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	70
	71	my $from = hex $hex_from;
	72
	73	# Perl only deals with C and F folds
88c8c961	74	next if $fold_type ne 'C'; # Not currently used: && $fold_type ne 'F';
00c072cf KW	75
	76	# Get each code point in the range that participates in this line's fold.
	77	# The hash has keys of each code point in the range, and values of what it
	78	# folds to and what folds to it
	79	foreach my $hex_fold (@folded) {
	80	my $fold = hex $hex_fold;
	81	push @{$folded_closure{$fold}}, $from if $fold < 256;
	82	push @{$folded_closure{$from}}, $fold if $from < 256;
	83	}
	84	}
	85
	86	# Now having read all the lines, combine them into the full closure of each
	87	# code point in the range by adding lists together that share a common element
	88	foreach my $folded (keys %folded_closure) {
	89	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	90	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	91	}
	92	}
	93
9c68f0ab KW	94	my @bits; # Bit map for each code point
9c68f0ab KW	95
00c072cf KW	96	foreach my $folded (keys %folded_closure) {
	97	$bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
	98	@{$folded_closure{$folded}};
	99	}
	100
9c68f0ab KW	101	for my $ord (0..255) {
	102	my $char = chr($ord);
	103	utf8::upgrade($char); # Important to use Unicode semantics!
	104	for my $property (@properties) {
	105	my $name = $property;
	106
	107	# The property name that corresponds to this doesn't have a suffix.
	108	# If is a latin1 version, no further checking is needed.
	109	if (! ($name =~ s/_L1$//)) {
	110
	111	# Here, isn't an L1. It's either a special one or the suffix ends
	112	# in _A. In the latter case, it's automatically false for
	113	# non-ascii. The one current special is valid over the whole range.
	114	next if $name =~ s/_A$// && $ord >= 128;
	115
	116	}
	117	my $re;
	118	if ($name eq 'PUNCT') {;
	119
	120	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	121	# just \pP outside it.
	122	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	123	} elsif ($name eq 'CHARNAME_CONT') {;
	124	$re = qr/[-\w ():\xa0]/;
	125	} elsif ($name eq 'SPACE') {;
	126	$re = qr/\s/;
	127	} elsif ($name eq 'IDFIRST') {
	128	$re = qr/[_\p{Alpha}]/;
	129	} elsif ($name eq 'PSXSPC') {
	130	$re = qr/[\v\p{Space}]/;
	131	} elsif ($name eq 'WORDCHAR') {
	132	$re = qr/\w/;
	133	} elsif ($name eq 'ALNUMC') {
	134	# Like \w, but no underscore
aedd44b5	135	$re = qr/\p{Alnum}/;
9c68f0ab KW	136	} elsif ($name eq 'OCTAL') {
	137	$re = qr/[0-7]/;
	138	} else { # The remainder have the same name and values as Unicode
	139	$re = eval "qr/\\p{$name}/";
	140	use Carp;
	141	carp $@ if ! defined $re;
	142	}
	143	#print "$ord, $name $property, $re\n";
	144	if ($char =~ $re) { # Add this property if matches
	145	$bits[$ord] .= '\|' if $bits[$ord];
	146	$bits[$ord] .= "_CC_$property";
	147	}
	148	}
	149	#print __LINE__, " $ord $char $bits[$ord]\n";
	150	}
	151
	152	# Names of C0 controls
	153	my @C0 = qw (
	154	NUL
	155	SOH
	156	STX
	157	ETX
	158	EOT
	159	ENQ
	160	ACK
	161	BEL
	162	BS
	163	HT
	164	LF
	165	VT
	166	FF
	167	CR
	168	SO
	169	SI
	170	DLE
	171	DC1
	172	DC2
	173	DC3
	174	DC4
	175	NAK
	176	SYN
	177	ETB
	178	CAN
	179	EOM
	180	SUB
	181	ESC
	182	FS
	183	GS
	184	RS
	185	US
	186	);
	187
	188	# Names of C1 controls, plus the adjacent DEL
	189	my @C1 = qw(
	190	DEL
	191	PAD
	192	HOP
	193	BPH
	194	NBH
	195	IND
	196	NEL
	197	SSA
	198	ESA
	199	HTS
200	HTJ
201	VTS
202	PLD
203	PLU
204	RI
205	SS2
206	SS3
207	DCS
208	PU1
209	PU2
210	STS
211	CCH
212	MW
213	SPA
214	EPA
215	SOS
216	SGC
217	SCI
218	CSI
219	ST
220	OSC
221	PM
222	APC
223	);
224
cfb8fd6a NC	225	my $out_fh = safer_open('l1_char_class_tab.h-new', 'l1_char_class_tab.h');
	226	print $out_fh read_only_top(lang => 'C', style => '*', by => $0, from => $file);
	227
9c68f0ab KW	228	# Output the table using fairly short names for each char.
	229	for my $ord (0..255) {
	230	my $name;
	231	if ($ord < 32) { # A C0 control
	232	$name = $C0[$ord];
	233	} elsif ($ord > 32 && $ord < 127) { # Graphic
	234	$name = "'" . chr($ord) . "'";
	235	} elsif ($ord >= 127 && $ord <= 0x9f) {
	236	$name = $C1[$ord - 127]; # A C1 control + DEL
	237	} else { # SPACE, or, if Latin1, shorten the name */
	238	use charnames();
	239	$name = charnames::viacode($ord);
	240	$name =~ s/LATIN CAPITAL LETTER //
	241	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	242	}
cfb8fd6a	243	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	244	}
9c68f0ab KW	245
cfb8fd6a	246	read_only_bottom_close_and_rename($out_fh)