[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use v5.15.8;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_char_class_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

# The properties to tabulate.  The order here is the order in which the
# matching '(1U<<_CC_...)' terms are emitted for each code point, so keep it
# stable.  A name may end in '_A' (true only for ASCII code points) or '_L1'
# (valid for all of Latin1); a name without a suffix is tested over the whole
# 0..255 range.
my @properties = qw(
    NONLATIN1_FOLD   ALPHANUMERIC  ALPHA            ASCII
    BLANK            CASED         CHARNAME_CONT    CNTRL
    DIGIT            GRAPH         IDFIRST          LOWER
    NON_FINAL_FOLD   PRINT         PSXSPC           PUNCT
    QUOTEMETA        SPACE         UPPER            WORDCHAR
    XDIGIT           VERTSPACE     IS_IN_SOME_FOLD  BACKSLASH_FOO_LBRACE_IS_META
);

# Case fold data, gathered at compile time by the BEGIN block below.
my %folded_closure;         # Latin1 code point => the code points it folds
                            # to, plus the ones that fold to it
my @hex_non_final_folds;    # Hex strings of the Latin1 code points found in a
                            # non-final position of a multi-character fold,
                            # each followed by its upper case
my @folds;                  # Fold definitions, as CaseFolding.txt style lines
use Unicode::UCD;

BEGIN { # This must happen at compile time, because the results are consumed
        # through user-defined \p{property} subs

    # Prefer the raw Unicode data file, provided that we are on an ASCII
    # platform (which is what its data is for), that it is in the modern
    # format (Unicode 3.1.0 and later), and that it can be opened.  Reading
    # it directly keeps us from being affected by potential bugs in the
    # other layers of Perl.
    my $casefolding_path = "lib/unicore/CaseFolding.txt";

    if (ord('A') == 65
        && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
        && open my $fh, "<", $casefolding_path)
    {
        @folds = <$fh>;
    }
    else {

        # Otherwise synthesize equivalent lines from the inversion map that
        # Unicode::UCD gives for the property.
        my ($range_starts, $range_maps, undef, $no_fold)
                                    = Unicode::UCD::prop_invmap('Case_Folding');

        # The final range is left out: it has no following element to bound
        # it.
        RANGE:
        for my $range (0 .. $#{$range_starts} - 1) {
            my $map = $range_maps->[$range];
            next RANGE if $map == $no_fold;

            my $first = $range_starts->[$range];
            my $last  = $range_starts->[$range + 1] - 1;
            for my $code_point ($first .. $last) {
                if (ref $map) {

                    # A multi-code point map becomes an 'F' line.
                    # prop_invmap() guarantees such a range holds just one
                    # element, so nothing needs adjusting.
                    push @folds, sprintf("%04X; F; %s\n",
                                        $code_point,
                                        join " ", map { sprintf "%04X", $_ }
                                                        @{$map});
                }
                else {

                    # A single-code point map becomes a 'C' line; the map is
                    # that of the range's first element, so add the distance
                    # from there.
                    push @folds, sprintf("%04X; C; %04X\n",
                                        $code_point,
                                        $map + ($code_point - $first));
                }
            }
        }
    }

    for my $entry (@folds) {
        chomp $entry;

        # A data line looks like this (minus the leading '#'):
        #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
        # Strip any comment, and skip lines that are then empty.
        my $data = $entry =~ s/ (?: \s* \# .* )? $ //rx;
        next unless length $data;
        my ($hex_from, $type, @targets) = split /[\s;]+/, $data;

        my $from = hex $hex_from;

        # Only the C and F fold types matter to Perl
        next unless ($type eq 'C' || $type eq 'F');

        # Record both directions of this fold for every participant that is
        # in the Latin1 range.
        for my $pos (0 .. $#targets) {
            my $hex_fold = $targets[$pos];
            my $fold = hex $hex_fold;
            push @{$folded_closure{$fold}}, $from if $fold < 256;
            push @{$folded_closure{$from}}, $fold if $from < 256;

            # What follows applies only to a Latin1 code point in a non-final
            # position of the fold that hasn't been recorded yet.
            next if $pos == $#targets;
            next if $fold >= 256;
            next if grep { $_ eq $hex_fold } @hex_non_final_folds;

            # Record it, along with its upper case, which in the latin1
            # range folds to $fold
            push @hex_non_final_folds,
                 $hex_fold,
                 sprintf("%04X", ord uc chr $fold);
        }
    }

    # With every line read, merge the lists that share a common element, so
    # that each Latin1 code point's list also holds what its fold partners
    # hold.
    for my $member (keys %folded_closure) {
        my @latin1_partners = grep { $_ < 256 } @{$folded_closure{$member}};
        for my $partner (@latin1_partners) {
            push @{$folded_closure{$partner}}, @{$folded_closure{$member}};
        }
    }
}

# User-defined property, matched as \p{Is_Non_Latin1_Fold}.  Returns one hex
# code point per line: every key of %folded_closure whose list contains at
# least one code point above 255.
sub Is_Non_Latin1_Fold {
    my @hex_code_points =
        map  { sprintf("%X", $_) }
        grep {
            my $code_point = $_;
            grep { $_ > 255 } @{$folded_closure{$code_point}};
        }
        keys %folded_closure;

    return join("\n", @hex_code_points) . "\n";
}

# User-defined property, matched as \p{Is_Non_Final_Fold}.  Returns the
# entries of @hex_non_final_folds, one hex code point per line.
sub Is_Non_Final_Fold {
    my $listing = join "\n", @hex_non_final_folds;
    return "$listing\n";
}

my @bits;   # Bit map for each code point

use Carp;

# Patterns for the properties that are not simply the Unicode property of
# the same name.
my %special_re_for = (

    # Sadly, this is inconsistent: \pP and \pS for the ascii range, just \pP
    # outside it.
    PUNCT          => qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/,

    CHARNAME_CONT  => qr/\p{_Perl_Charname_Continue}/,
    SPACE          => qr/\p{XPerlSpace}/,
    IDFIRST        => qr/[_\p{Alpha}]/,
    PSXSPC         => qr/[\v\p{Space}]/,
    WORDCHAR       => qr/\p{XPosixWord}/,

    # Like \w, but no underscore
    ALPHANUMERIC   => qr/\p{Alnum}/,

    QUOTEMETA      => qr/\p{_Perl_Quotemeta}/,

    # These two are the user-defined properties in this file
    NONLATIN1_FOLD => qr/\p{Is_Non_Latin1_Fold}/,
    NON_FINAL_FOLD => qr/\p{Is_Non_Final_Fold}/,

    IS_IN_SOME_FOLD => qr/\p{_Perl_Any_Folds}/,

    # This is true for FOO where FOO is the varying character in:
    # \a{, \b{, \c{, ...
    # and the sequence has non-literal meaning to Perl; so it is true
    # for 'x' because \x{ is special, but not 'a' because \a{ isn't.
    BACKSLASH_FOO_LBRACE_IS_META => qr/[gkNopPx]/,
);

# Work out, once per property, the pattern to test with and whether the
# property is restricted to ASCII.  Each element is
#   [ property name as given, ASCII-only flag, compiled pattern ]
my @checks;
for my $property (@properties) {
    my $name = $property;

    # Remove the suffix to get the actual property name.
    # Currently the suffixes are '_L1', '_A', and none.
    # If is a latin1 version, no further checking is needed.  Otherwise, an
    # '_A' property is automatically false for non-ascii.  The only current
    # ones (besides ASCII) without a suffix are valid over the whole range.
    my $ascii_only = 0;
    unless ($name =~ s/_L1$//) {
        $ascii_only = 1 if $name =~ s/_A$//;
    }

    my $re = $special_re_for{$name};
    if (! defined $re) {

        # The remainder have the same name and values as Unicode.  The name
        # is only known at run time, hence the string eval.
        $re = eval "qr/\\p{$name}/";

        # Stop here: carrying on with an undefined pattern would not test
        # the property at all, and the generated table would be wrong.
        croak "Can't compile \\p{$name} for property '$property': $@"
                                                            if ! defined $re;
    }

    push @checks, [ $property, $ascii_only, $re ];
}

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    my @flags;
    for my $check (@checks) {
        my ($property, $ascii_only, $re) = @$check;
        next if $ascii_only && $ord >= 128;

        #print "$ord, $property, $re\n";
        push @flags, "(1U<<_CC_$property)" if $char =~ $re;
    }

    # A code point that matched nothing still needs a valid C expression
    $bits[$ord] = @flags ? join('|', @flags) : '0';
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Short names of the C0 controls, indexed by code point (U+0000 .. U+001F),
# eight to a row
my @C0 = qw(
    NUL SOH STX ETX EOT ENQ ACK BEL
    BS  HT  LF  VT  FF  CR  SO  SI
    DLE DC1 DC2 DC3 DC4 NAK SYN ETB
    CAN EOM SUB ESC FS  GS  RS  US
);

# Short names of DEL (U+007F) followed by the C1 controls (U+0080 .. U+009F);
# the index is the code point minus 127
my @C1 = qw(
    DEL
    PAD HOP BPH NBH IND NEL SSA ESA
    HTS HTJ VTS PLD PLU RI  SS2 SS3
    DCS PU1 PU2 STS CCH MW  SPA EPA
    SOS SGC SCI CSI ST  OSC PM  APC
);

my $out_fh = open_new('l1_char_class_tab.h', '>',
                      {
                        style => '*',
                        by    => $0,
                        from  => "property definitions",
                      });

use charnames();

# Write one table row per code point: a comment giving the code point and a
# fairly short name for it, then its property bits.
foreach my $code_point (0..255) {
    my $label;
    if ($code_point < 32) {

        # A C0 control
        $label = $C0[$code_point];
    }
    elsif ($code_point >= 127 && $code_point <= 0x9f) {

        # DEL or a C1 control
        $label = $C1[$code_point - 127];
    }
    elsif ($code_point > 32 && $code_point < 127) {

        # An ASCII graphic is shown as itself, in single quotes
        $label = "'" . chr($code_point) . "'";
    }
    else {

        # SPACE, or an upper Latin1 character: use the Unicode name, with
        # the boilerplate of letter names removed (and small letters shown
        # in lower case).
        $label = charnames::viacode($code_point);
        unless ($label =~ s/LATIN CAPITAL LETTER //) {
            $label =~ s/LATIN SMALL LETTER (.*)/\L$1/;
        }
    }

    printf {$out_fh} "/* U+%02X %s */ %s,\n",
                     $code_point, $label, $bits[$code_point];
}

read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
9c68f0ab	1	#!perl -w
b72a36d4	2	use v5.15.8;
9c68f0ab KW	3	use strict;
9c68f0ab KW	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
9c68f0ab KW	24	my @properties = qw(
62841d05	25	NONLATIN1_FOLD
15861f94	26	ALPHANUMERIC
f4cdb42c	27	ALPHA
e48bdfbc	28	ASCII
f4cdb42c	29	BLANK
b0d691b2	30	CASED
9c68f0ab	31	CHARNAME_CONT
f4cdb42c KW	32	CNTRL
	33	DIGIT
	34	GRAPH
	35	IDFIRST
	36	LOWER
b72a36d4	37	NON_FINAL_FOLD
f4cdb42c KW	38	PRINT
	39	PSXSPC
	40	PUNCT
9a022f3a	41	QUOTEMETA
f4cdb42c KW	42	SPACE
	43	UPPER
	44	WORDCHAR
	45	XDIGIT
a0947d7b	46	VERTSPACE
f7993745	47	IS_IN_SOME_FOLD
4d68ffa0	48	BACKSLASH_FOO_LBRACE_IS_META
9c68f0ab KW	49	);
9c68f0ab KW	50
00c072cf KW	51	# Read in the case fold mappings.
00c072cf KW	52	my %folded_closure;
62841d05	53	my @hex_non_final_folds;
dbe1ba6b KW	54	my @folds;
	55	use Unicode::UCD;
	56
62841d05 KW	57	BEGIN { # Have to do this at compile time because using user-defined \p{property}
62841d05 KW	58
4ef0bd69 KW	59	# Use the Unicode data file if we are on an ASCII platform (which its data
	60	# is for), and it is in the modern format (starting in Unicode 3.1.0) and
	61	# it is available. This avoids being affected by potential bugs
	62	# introduced by other layers of Perl
	63	my $file="lib/unicore/CaseFolding.txt";
	64
	65	if (ord('A') == 65
	66	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	67	&& open my $fh, "<", $file)
	68	{
	69	@folds = <$fh>;
	70	}
	71	else {
	72	my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b	73	= Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69 KW	74	for my $i (0 .. @$invlist_ref - 1 - 1) {
	75	next if $invmap_ref->[$i] == $default;
	76	my $adjust = -1;
	77	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	78	$adjust++;
	79
	80	# Single-code point maps go to a 'C' type
	81	if (! ref $invmap_ref->[$i]) {
	82	push @folds, sprintf("%04X; C; %04X\n",
	83	$j,
	84	$invmap_ref->[$i] + $adjust);
	85	}
	86	else { # Multi-code point maps go to 'F'. prop_invmap()
	87	# guarantees that no adjustment is needed for these,
	88	# as the range will contain just one element
	89	push @folds, sprintf("%04X; F; %s\n",
	90	$j,
	91	join " ", map { sprintf "%04X", $_ }
	92	@{$invmap_ref->[$i]});
	93	}
dbe1ba6b KW	94	}
	95	}
	96	}
dbe1ba6b	97
4ef0bd69 KW	98	for (@folds) {
4ef0bd69 KW	99	chomp;
00c072cf	100
4ef0bd69 KW	101	# Lines look like (without the initial '#'
	102	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	103	# Get rid of comments, ignore blank or comment-only lines
	104	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	105	next unless length $line;
	106	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf	107
4ef0bd69	108	my $from = hex $hex_from;
00c072cf	109
4ef0bd69 KW	110	# Perl only deals with C and F folds
4ef0bd69 KW	111	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf	112
4ef0bd69 KW	113	# Get each code point in the range that participates in this line's fold.
	114	# The hash has keys of each code point in the range, and values of what it
	115	# folds to and what folds to it
	116	for my $i (0 .. @folded - 1) {
	117	my $hex_fold = $folded[$i];
	118	my $fold = hex $hex_fold;
	119	push @{$folded_closure{$fold}}, $from if $fold < 256;
	120	push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05	121
07725c18 KW	122	if ($i < @folded-1
	123	&& $fold < 256
	124	&& ! grep { $_ eq $hex_fold } @hex_non_final_folds)
	125	{
	126	push @hex_non_final_folds, $hex_fold;
	127
	128	# Also add the upper case, which in the latin1 range folds to
	129	# $fold
	130	push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
	131	}
4ef0bd69	132	}
00c072cf	133	}
00c072cf	134
4ef0bd69 KW	135	# Now having read all the lines, combine them into the full closure of each
	136	# code point in the range by adding lists together that share a common
	137	# element
	138	foreach my $folded (keys %folded_closure) {
	139	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	140	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	141	}
00c072cf KW	142	}
	143	}
	144
62841d05 KW	145	sub Is_Non_Latin1_Fold {
	146	my @return;
	147
	148	foreach my $folded (keys %folded_closure) {
	149	push @return, sprintf("%X", $folded), if grep { $_ > 255 }
	150	@{$folded_closure{$folded}};
	151	}
	152	return join("\n", @return) . "\n";
00c072cf KW	153	}
00c072cf KW	154
62841d05 KW	155	sub Is_Non_Final_Fold {
	156	return join("\n", @hex_non_final_folds) . "\n";
	157	}
	158
	159	my @bits; # Bit map for each code point
	160
b1909af7	161	# For each character, calculate which properties it matches.
9c68f0ab KW	162	for my $ord (0..255) {
	163	my $char = chr($ord);
	164	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	165
b1909af7 KW	166	# Look at all the properties we care about here.
9c68f0ab KW	167	for my $property (@properties) {
	168	my $name = $property;
	169
b1909af7 KW	170	# Remove the suffix to get the actual property name.
b1909af7 KW	171	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	172	# If is a latin1 version, no further checking is needed.
	173	if (! ($name =~ s/_L1$//)) {
	174
b1909af7	175	# Here, isn't an _L1. If its _A, it's automatically false for
a0947d7b KW	176	# non-ascii. The only current ones (besides ASCII) without a
a0947d7b KW	177	# suffix are valid over the whole range.
9c68f0ab KW	178	next if $name =~ s/_A$// && $ord >= 128;
	179
	180	}
	181	my $re;
	182	if ($name eq 'PUNCT') {;
	183
	184	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	185	# just \pP outside it.
	186	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	187	} elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e	188	$re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab	189	} elsif ($name eq 'SPACE') {;
c6e8e4a9	190	$re = qr/\p{XPerlSpace}/;
9c68f0ab KW	191	} elsif ($name eq 'IDFIRST') {
	192	$re = qr/[_\p{Alpha}]/;
	193	} elsif ($name eq 'PSXSPC') {
	194	$re = qr/[\v\p{Space}]/;
	195	} elsif ($name eq 'WORDCHAR') {
c6e8e4a9	196	$re = qr/\p{XPosixWord}/;
15861f94	197	} elsif ($name eq 'ALPHANUMERIC') {
9c68f0ab	198	# Like \w, but no underscore
aedd44b5	199	$re = qr/\p{Alnum}/;
9a022f3a KW	200	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	201	$re = qr/\p{_Perl_Quotemeta}/;
62841d05 KW	202	} elsif ($name eq 'NONLATIN1_FOLD') {
62841d05 KW	203	$re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4	204	} elsif ($name eq 'NON_FINAL_FOLD') {
62841d05	205	$re = qr/\p{Is_Non_Final_Fold}/;
f7993745 KW	206	} elsif ($name eq 'IS_IN_SOME_FOLD') {
f7993745 KW	207	$re = qr/\p{_Perl_Any_Folds}/;
4d68ffa0 KW	208	} elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
	209
	210	# This is true for FOO where FOO is the varying character in:
	211	# \a{, \b{, \c{, ...
	212	# and the sequence has non-literal meaning to Perl; so it is true
	213	# for 'x' because \x{ is special, but not 'a' because \a{ isn't.
	214	$re = qr/[gkNopPx]/;
9c68f0ab KW	215	} else { # The remainder have the same name and values as Unicode
	216	$re = eval "qr/\\p{$name}/";
	217	use Carp;
	218	carp $@ if ! defined $re;
	219	}
	220	#print "$ord, $name $property, $re\n";
07725c18	221	if ($char =~ $re) { # Add this property if matches
9c68f0ab	222	$bits[$ord] .= '\|' if $bits[$ord];
265c1f46	223	$bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab KW	224	}
	225	}
	226	#print __LINE__, " $ord $char $bits[$ord]\n";
	227	}
	228
	229	# Names of C0 controls
	230	my @C0 = qw (
	231	NUL
	232	SOH
	233	STX
	234	ETX
	235	EOT
	236	ENQ
	237	ACK
	238	BEL
	239	BS
	240	HT
	241	LF
	242	VT
	243	FF
	244	CR
	245	SO
	246	SI
	247	DLE
	248	DC1
	249	DC2
	250	DC3
	251	DC4
	252	NAK
	253	SYN
	254	ETB
	255	CAN
	256	EOM
	257	SUB
	258	ESC
	259	FS
	260	GS
	261	RS
	262	US
	263	);
	264
	265	# Names of C1 controls, plus the adjacent DEL
	266	my @C1 = qw(
	267	DEL
	268	PAD
	269	HOP
	270	BPH
	271	NBH
	272	IND
	273	NEL
	274	SSA
	275	ESA
	276	HTS
	277	HTJ
	278	VTS
	279	PLD
	280	PLU
	281	RI
	282	SS2
	283	SS3
	284	DCS
	285	PU1
	286	PU2
	287	STS
288	CCH
289	MW
290	SPA
291	EPA
292	SOS
293	SGC
294	SCI
295	CSI
296	ST
297	OSC
298	PM
299	APC
300	);
301
cc49830d	302	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7	303	{style => '*', by => $0,
dbe1ba6b	304	from => "property definitions"});
cfb8fd6a	305
9c68f0ab KW	306	# Output the table using fairly short names for each char.
	307	for my $ord (0..255) {
	308	my $name;
	309	if ($ord < 32) { # A C0 control
	310	$name = $C0[$ord];
	311	} elsif ($ord > 32 && $ord < 127) { # Graphic
	312	$name = "'" . chr($ord) . "'";
	313	} elsif ($ord >= 127 && $ord <= 0x9f) {
	314	$name = $C1[$ord - 127]; # A C1 control + DEL
	315	} else { # SPACE, or, if Latin1, shorten the name */
	316	use charnames();
	317	$name = charnames::viacode($ord);
	318	$name =~ s/LATIN CAPITAL LETTER //
	319	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	320	}
cfb8fd6a	321	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	322	}
9c68f0ab KW	323
cfb8fd6a	324	read_only_bottom_close_and_rename($out_fh)