[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use v5.15.8;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    ALNUMC
    ALPHA
    ASCII
    BLANK
    CHARNAME_CONT
    CNTRL
    DIGIT
    GRAPH
    IDFIRST
    LOWER
    NON_FINAL_FOLD
    PRINT
    PSXSPC
    PUNCT
    QUOTEMETA
    SPACE
    UPPER
    WORDCHAR
    XDIGIT
);

# Read in the case fold mappings.
my %folded_closure;
my $file="lib/unicore/CaseFolding.txt";
my @folds;
use Unicode::UCD;

# Use the Unicode data file if we are on an ASCII platform (which its data is
# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
# available.  This avoids being affected by potential bugs introduced by other
# layers of Perl
if (ord('A') == 65
    && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
    && open my $fh, "<", $file)
{
    @folds = <$fh>;
}
else {
    my ($invlist_ref, $invmap_ref, undef, $default)
                                    = Unicode::UCD::prop_invmap('Case_Folding');
    for my $i (0 .. @$invlist_ref - 1 - 1) {
        next if $invmap_ref->[$i] == $default;
        my $adjust = -1;
        for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
            $adjust++;

            # Single-code point maps go to a 'C' type
            if (! ref $invmap_ref->[$i]) {
                push @folds, sprintf("%04X; C; %04X\n",
                                     $j,
                                     $invmap_ref->[$i] + $adjust);
            }
            else {  # Multi-code point maps go to 'F'.  prop_invmap()
                    # guarantees that no adjustment is needed for these,
                    # as the range will contain just one element
                push @folds, sprintf("%04X; F; %s\n",
                                    $j,
                                    join " ", map { sprintf "%04X", $_ }
                                                    @{$invmap_ref->[$i]});
            }
        }
    }
}

for (@folds) {
    chomp;

    # Lines look like (without the initial '#'
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    # Get rid of comments, ignore blank or comment-only lines
    my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
    next unless length $line;
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    # Perl only deals with C and F folds
    next if $fold_type ne 'C' and $fold_type ne 'F';

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "(1U<<_CC_NONLATIN1_FOLD)" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only one current one (besides ASCII) without a
            # suffix is valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\p{XPosixWord} ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\p{XPerlSpace}/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\p{XPosixWord}/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'QUOTEMETA') {
            $re = qr/\p{_Perl_Quotemeta}/;
        } elsif ($name eq 'NON_FINAL_FOLD') {
            $re = qr/\p{_Perl_Non_Final_Folds}/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re  # Add this property if matches
            || ($name eq 'NON_FINAL_FOLD'
                # Also include chars that fold to the non-final
                && CORE::fc($char) =~ $re))
        {
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "(1U<<_CC_$property)";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab	1	#!perl -w
b72a36d4	2	use v5.15.8;
9c68f0ab KW	3	use strict;
9c68f0ab KW	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
9c68f0ab KW	24	my @properties = qw(
f4cdb42c KW	25	ALNUMC
f4cdb42c KW	26	ALPHA
e48bdfbc	27	ASCII
f4cdb42c	28	BLANK
9c68f0ab	29	CHARNAME_CONT
f4cdb42c KW	30	CNTRL
	31	DIGIT
	32	GRAPH
	33	IDFIRST
	34	LOWER
b72a36d4	35	NON_FINAL_FOLD
f4cdb42c KW	36	PRINT
	37	PSXSPC
	38	PUNCT
9a022f3a	39	QUOTEMETA
f4cdb42c KW	40	SPACE
	41	UPPER
	42	WORDCHAR
	43	XDIGIT
9c68f0ab KW	44	);
9c68f0ab KW	45
00c072cf KW	46	# Read in the case fold mappings.
00c072cf KW	47	my %folded_closure;
cfb8fd6a	48	my $file="lib/unicore/CaseFolding.txt";
dbe1ba6b KW	49	my @folds;
	50	use Unicode::UCD;
	51
	52	# Use the Unicode data file if we are on an ASCII platform (which its data is
	53	# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
	54	# available. This avoids being affected by potential bugs introduced by other
	55	# layers of Perl
	56	if (ord('A') == 65
	57	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	58	&& open my $fh, "<", $file)
	59	{
	60	@folds = <$fh>;
	61	}
	62	else {
	63	my ($invlist_ref, $invmap_ref, undef, $default)
	64	= Unicode::UCD::prop_invmap('Case_Folding');
	65	for my $i (0 .. @$invlist_ref - 1 - 1) {
	66	next if $invmap_ref->[$i] == $default;
	67	my $adjust = -1;
	68	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	69	$adjust++;
	70
	71	# Single-code point maps go to a 'C' type
	72	if (! ref $invmap_ref->[$i]) {
	73	push @folds, sprintf("%04X; C; %04X\n",
	74	$j,
	75	$invmap_ref->[$i] + $adjust);
	76	}
	77	else { # Multi-code point maps go to 'F'. prop_invmap()
	78	# guarantees that no adjustment is needed for these,
	79	# as the range will contain just one element
	80	push @folds, sprintf("%04X; F; %s\n",
	81	$j,
	82	join " ", map { sprintf "%04X", $_ }
	83	@{$invmap_ref->[$i]});
	84	}
	85	}
	86	}
	87	}
	88
	89	for (@folds) {
00c072cf KW	90	chomp;
	91
	92	# Lines look like (without the initial '#'
	93	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9 KW	94	# Get rid of comments, ignore blank or comment-only lines
	95	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	96	next unless length $line;
00c072cf KW	97	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	98
	99	my $from = hex $hex_from;
	100
	101	# Perl only deals with C and F folds
33e4950c	102	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf KW	103
	104	# Get each code point in the range that participates in this line's fold.
	105	# The hash has keys of each code point in the range, and values of what it
	106	# folds to and what folds to it
	107	foreach my $hex_fold (@folded) {
	108	my $fold = hex $hex_fold;
	109	push @{$folded_closure{$fold}}, $from if $fold < 256;
	110	push @{$folded_closure{$from}}, $fold if $from < 256;
	111	}
	112	}
	113
	114	# Now having read all the lines, combine them into the full closure of each
	115	# code point in the range by adding lists together that share a common element
	116	foreach my $folded (keys %folded_closure) {
	117	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	118	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	119	}
	120	}
	121
9c68f0ab KW	122	my @bits; # Bit map for each code point
9c68f0ab KW	123
00c072cf	124	foreach my $folded (keys %folded_closure) {
265c1f46	125	$bits[$folded] = "(1U<<_CC_NONLATIN1_FOLD)" if grep { $_ > 255 }
00c072cf KW	126	@{$folded_closure{$folded}};
	127	}
	128
b1909af7	129	# For each character, calculate which properties it matches.
9c68f0ab KW	130	for my $ord (0..255) {
	131	my $char = chr($ord);
	132	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	133
b1909af7 KW	134	# Look at all the properties we care about here.
9c68f0ab KW	135	for my $property (@properties) {
	136	my $name = $property;
	137
b1909af7 KW	138	# Remove the suffix to get the actual property name.
b1909af7 KW	139	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	140	# If is a latin1 version, no further checking is needed.
	141	if (! ($name =~ s/_L1$//)) {
	142
b1909af7	143	# Here, isn't an _L1. If its _A, it's automatically false for
e48bdfbc KW	144	# non-ascii. The only one current one (besides ASCII) without a
e48bdfbc KW	145	# suffix is valid over the whole range.
9c68f0ab KW	146	next if $name =~ s/_A$// && $ord >= 128;
	147
	148	}
	149	my $re;
	150	if ($name eq 'PUNCT') {;
	151
	152	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	153	# just \pP outside it.
	154	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	155	} elsif ($name eq 'CHARNAME_CONT') {;
c6e8e4a9	156	$re = qr/[-\p{XPosixWord} ():\xa0]/;
9c68f0ab	157	} elsif ($name eq 'SPACE') {;
c6e8e4a9	158	$re = qr/\p{XPerlSpace}/;
9c68f0ab KW	159	} elsif ($name eq 'IDFIRST') {
	160	$re = qr/[_\p{Alpha}]/;
	161	} elsif ($name eq 'PSXSPC') {
	162	$re = qr/[\v\p{Space}]/;
	163	} elsif ($name eq 'WORDCHAR') {
c6e8e4a9	164	$re = qr/\p{XPosixWord}/;
9c68f0ab KW	165	} elsif ($name eq 'ALNUMC') {
9c68f0ab KW	166	# Like \w, but no underscore
aedd44b5	167	$re = qr/\p{Alnum}/;
9a022f3a KW	168	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	169	$re = qr/\p{_Perl_Quotemeta}/;
b72a36d4 KW	170	} elsif ($name eq 'NON_FINAL_FOLD') {
b72a36d4 KW	171	$re = qr/\p{_Perl_Non_Final_Folds}/;
9c68f0ab KW	172	} else { # The remainder have the same name and values as Unicode
	173	$re = eval "qr/\\p{$name}/";
	174	use Carp;
	175	carp $@ if ! defined $re;
	176	}
	177	#print "$ord, $name $property, $re\n";
b72a36d4 KW	178	if ($char =~ $re # Add this property if matches
	179	\|\| ($name eq 'NON_FINAL_FOLD'
	180	# Also include chars that fold to the non-final
	181	&& CORE::fc($char) =~ $re))
	182	{
9c68f0ab	183	$bits[$ord] .= '\|' if $bits[$ord];
265c1f46	184	$bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab KW	185	}
	186	}
	187	#print __LINE__, " $ord $char $bits[$ord]\n";
	188	}
	189
	190	# Names of C0 controls
	191	my @C0 = qw (
	192	NUL
	193	SOH
	194	STX
	195	ETX
	196	EOT
	197	ENQ
	198	ACK
	199	BEL
	200	BS
	201	HT
	202	LF
	203	VT
	204	FF
	205	CR
	206	SO
	207	SI
	208	DLE
	209	DC1
	210	DC2
	211	DC3
	212	DC4
	213	NAK
	214	SYN
	215	ETB
	216	CAN
	217	EOM
	218	SUB
	219	ESC
	220	FS
	221	GS
	222	RS
	223	US
	224	);
	225
	226	# Names of C1 controls, plus the adjacent DEL
	227	my @C1 = qw(
	228	DEL
	229	PAD
	230	HOP
	231	BPH
	232	NBH
	233	IND
	234	NEL
	235	SSA
	236	ESA
	237	HTS
	238	HTJ
	239	VTS
	240	PLD
	241	PLU
	242	RI
	243	SS2
	244	SS3
	245	DCS
	246	PU1
	247	PU2
	248	STS
249	CCH
250	MW
251	SPA
252	EPA
253	SOS
254	SGC
255	SCI
256	CSI
257	ST
258	OSC
259	PM
260	APC
261	);
262
cc49830d	263	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7	264	{style => '*', by => $0,
dbe1ba6b	265	from => "property definitions"});
cfb8fd6a	266
9c68f0ab KW	267	# Output the table using fairly short names for each char.
	268	for my $ord (0..255) {
	269	my $name;
	270	if ($ord < 32) { # A C0 control
	271	$name = $C0[$ord];
	272	} elsif ($ord > 32 && $ord < 127) { # Graphic
	273	$name = "'" . chr($ord) . "'";
	274	} elsif ($ord >= 127 && $ord <= 0x9f) {
	275	$name = $C1[$ord - 127]; # A C1 control + DEL
	276	} else { # SPACE, or, if Latin1, shorten the name */
	277	use charnames();
	278	$name = charnames::viacode($ord);
	279	$name =~ s/LATIN CAPITAL LETTER //
	280	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	281	}
cfb8fd6a	282	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	283	}
9c68f0ab KW	284
cfb8fd6a	285	read_only_bottom_close_and_rename($out_fh)