[perl5.git] / Porting / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;
use Config;

# This program outputs the 256 lines that form the guts of the PL_charclass
# table.  The output should be used to manually replace the table contents in
# l1_charclass_tab.h.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U=00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# The data in the table is pretty well set in stone, so that this program need
# be run only when adding new properties to it.

my @properties = qw(
    ALNUMC_A
    ALNUMC_L1
    ALPHA_A
    ALPHA_L1
    BLANK_A
    BLANK_L1
    CHARNAME_CONT
    CNTRL_A
    CNTRL_L1
    DIGIT_A
    GRAPH_A
    GRAPH_L1
    IDFIRST_A
    IDFIRST_L1
    LOWER_A
    LOWER_L1
    OCTAL_A
    PRINT_A
    PRINT_L1
    PSXSPC_A
    PSXSPC_L1
    PUNCT_A
    PUNCT_L1
    SPACE_A
    SPACE_L1
    UPPER_A
    UPPER_L1
    WORDCHAR_A
    WORDCHAR_L1
    XDIGIT_A
);

# Read in the case fold mappings.
my %folded_closure;
my $file="$Config{privlib}/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (<$fh>) {
    chomp;

    # Lines look like (without the initial '#'
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    my ($line, $comment) = split / \s+ \# \s+ /x, $_;
    next if $line eq "" || substr($line, 0, 1) eq '#';
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    # Perl only deals with C and F folds
    next if $fold_type ne 'C' and $fold_type ne 'F';
    next if $fold_type ne 'C';  # And for now, just single-char folds. XXX

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!
    for my $property (@properties) {
        my $name = $property;

        # The property name that corresponds to this doesn't have a suffix.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an L1.  It's either a special one or the suffix ends
            # in _A.  In the latter case, it's automatically false for
            # non-ascii.  The one current special is valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\w ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\s/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\w/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'OCTAL') {
            $re = qr/[0-7]/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
00c072cf	5	use Config;
9c68f0ab KW	6
	7	# This program outputs the 256 lines that form the guts of the PL_charclass
	8	# table. The output should be used to manually replace the table contents in
8d4ab2a1 KW	9	# l1_charclass_tab.h. Each line is a bit map of properties that the Unicode
	10	# code point at the corresponding position in the table array has. The first
	11	# line corresponds to code point U+0000, NULL, the last line to U=00FF. For
	12	# an application to see if the code point "i" has a particular property, it
	13	# just does
9c68f0ab KW	14	# 'PL_charclass[i] & BIT'
	15	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	16	# character class, and 'property' is the corresponding property, and 'suffix'
	17	# is one of '_A' to mean the property is true only if the corresponding code
	18	# point is ASCII, and '_L1' means that the range includes any Latin1
	19	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	20	# these suffixes does not have different forms for both ranges.
	21
	22	# The data in the table is pretty well set in stone, so that this program need
	23	# be run only when adding new properties to it.
	24
	25	my @properties = qw(
	26	ALNUMC_A
	27	ALNUMC_L1
	28	ALPHA_A
	29	ALPHA_L1
	30	BLANK_A
	31	BLANK_L1
	32	CHARNAME_CONT
	33	CNTRL_A
	34	CNTRL_L1
	35	DIGIT_A
	36	GRAPH_A
	37	GRAPH_L1
	38	IDFIRST_A
	39	IDFIRST_L1
	40	LOWER_A
	41	LOWER_L1
	42	OCTAL_A
	43	PRINT_A
	44	PRINT_L1
	45	PSXSPC_A
	46	PSXSPC_L1
	47	PUNCT_A
	48	PUNCT_L1
	49	SPACE_A
	50	SPACE_L1
	51	UPPER_A
	52	UPPER_L1
	53	WORDCHAR_A
	54	WORDCHAR_L1
	55	XDIGIT_A
	56	);
	57
00c072cf KW	58	# Read in the case fold mappings.
	59	my %folded_closure;
	60	my $file="$Config{privlib}/unicore/CaseFolding.txt";
	61	open my $fh, "<", $file or die "Failed to read '$file': $!";
	62	while (<$fh>) {
	63	chomp;
	64
	65	# Lines look like (without the initial '#'
	66	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	67	my ($line, $comment) = split / \s+ \# \s+ /x, $_;
	68	next if $line eq "" \|\| substr($line, 0, 1) eq '#';
	69	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	70
	71	my $from = hex $hex_from;
	72
	73	# Perl only deals with C and F folds
	74	next if $fold_type ne 'C' and $fold_type ne 'F';
	75	next if $fold_type ne 'C'; # And for now, just single-char folds. XXX
	76
	77	# Get each code point in the range that participates in this line's fold.
	78	# The hash has keys of each code point in the range, and values of what it
	79	# folds to and what folds to it
	80	foreach my $hex_fold (@folded) {
	81	my $fold = hex $hex_fold;
	82	push @{$folded_closure{$fold}}, $from if $fold < 256;
	83	push @{$folded_closure{$from}}, $fold if $from < 256;
	84	}
	85	}
	86
	87	# Now having read all the lines, combine them into the full closure of each
	88	# code point in the range by adding lists together that share a common element
	89	foreach my $folded (keys %folded_closure) {
	90	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	91	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	92	}
	93	}
	94
9c68f0ab KW	95	my @bits; # Bit map for each code point
9c68f0ab KW	96
00c072cf KW	97	foreach my $folded (keys %folded_closure) {
	98	$bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
	99	@{$folded_closure{$folded}};
	100	}
	101
9c68f0ab KW	102	for my $ord (0..255) {
	103	my $char = chr($ord);
	104	utf8::upgrade($char); # Important to use Unicode semantics!
	105	for my $property (@properties) {
	106	my $name = $property;
	107
	108	# The property name that corresponds to this doesn't have a suffix.
	109	# If is a latin1 version, no further checking is needed.
	110	if (! ($name =~ s/_L1$//)) {
	111
	112	# Here, isn't an L1. It's either a special one or the suffix ends
	113	# in _A. In the latter case, it's automatically false for
	114	# non-ascii. The one current special is valid over the whole range.
	115	next if $name =~ s/_A$// && $ord >= 128;
	116
	117	}
	118	my $re;
	119	if ($name eq 'PUNCT') {;
	120
	121	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	122	# just \pP outside it.
	123	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	124	} elsif ($name eq 'CHARNAME_CONT') {;
	125	$re = qr/[-\w ():\xa0]/;
	126	} elsif ($name eq 'SPACE') {;
	127	$re = qr/\s/;
	128	} elsif ($name eq 'IDFIRST') {
	129	$re = qr/[_\p{Alpha}]/;
	130	} elsif ($name eq 'PSXSPC') {
	131	$re = qr/[\v\p{Space}]/;
	132	} elsif ($name eq 'WORDCHAR') {
	133	$re = qr/\w/;
	134	} elsif ($name eq 'ALNUMC') {
	135	# Like \w, but no underscore
aedd44b5	136	$re = qr/\p{Alnum}/;
9c68f0ab KW	137	} elsif ($name eq 'OCTAL') {
	138	$re = qr/[0-7]/;
	139	} else { # The remainder have the same name and values as Unicode
	140	$re = eval "qr/\\p{$name}/";
	141	use Carp;
	142	carp $@ if ! defined $re;
	143	}
	144	#print "$ord, $name $property, $re\n";
	145	if ($char =~ $re) { # Add this property if matches
	146	$bits[$ord] .= '\|' if $bits[$ord];
	147	$bits[$ord] .= "_CC_$property";
	148	}
	149	}
	150	#print __LINE__, " $ord $char $bits[$ord]\n";
	151	}
	152
	153	# Names of C0 controls
	154	my @C0 = qw (
	155	NUL
	156	SOH
	157	STX
	158	ETX
	159	EOT
	160	ENQ
	161	ACK
	162	BEL
	163	BS
	164	HT
	165	LF
	166	VT
	167	FF
	168	CR
	169	SO
	170	SI
	171	DLE
	172	DC1
	173	DC2
	174	DC3
	175	DC4
	176	NAK
	177	SYN
	178	ETB
	179	CAN
	180	EOM
	181	SUB
	182	ESC
	183	FS
	184	GS
	185	RS
	186	US
	187	);
	188
	189	# Names of C1 controls, plus the adjacent DEL
	190	my @C1 = qw(
	191	DEL
	192	PAD
	193	HOP
	194	BPH
	195	NBH
	196	IND
	197	NEL
	198	SSA
	199	ESA
	200	HTS
201	HTJ
202	VTS
203	PLD
204	PLU
205	RI
206	SS2
207	SS3
208	DCS
209	PU1
210	PU2
211	STS
212	CCH
213	MW
214	SPA
215	EPA
216	SOS
217	SGC
218	SCI
219	CSI
220	ST
221	OSC
222	PM
223	APC
224	);
225
226	# Output the table using fairly short names for each char.
227	for my $ord (0..255) {
228	my $name;
229	if ($ord < 32) { # A C0 control
230	$name = $C0[$ord];
231	} elsif ($ord > 32 && $ord < 127) { # Graphic
232	$name = "'" . chr($ord) . "'";
233	} elsif ($ord >= 127 && $ord <= 0x9f) {
234	$name = $C1[$ord - 127]; # A C1 control + DEL
235	} else { # SPACE, or, if Latin1, shorten the name */
236	use charnames();
237	$name = charnames::viacode($ord);
238	$name =~ s/LATIN CAPITAL LETTER //
239	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
240	}
241	printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
242	}
243