[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use v5.15.8;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    NONLATIN1_FOLD
    ALNUMC
    ALPHA
    ASCII
    BLANK
    CHARNAME_CONT
    CNTRL
    DIGIT
    GRAPH
    IDFIRST
    LOWER
    NON_FINAL_FOLD
    PRINT
    PSXSPC
    PUNCT
    QUOTEMETA
    SPACE
    UPPER
    WORDCHAR
    XDIGIT
    VERTSPACE
    IS_IN_SOME_FOLD
);

# Read in the case fold mappings.
my %folded_closure;
my @hex_non_final_folds;
my @folds;
use Unicode::UCD;

BEGIN { # Have to do this at compile time because using user-defined \p{property}

    # Use the Unicode data file if we are on an ASCII platform (which its data
    # is for), and it is in the modern format (starting in Unicode 3.1.0) and
    # it is available.  This avoids being affected by potential bugs
    # introduced by other layers of Perl
    my $file="lib/unicore/CaseFolding.txt";

    if (ord('A') == 65
        && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
        && open my $fh, "<", $file)
    {
        @folds = <$fh>;
    }
    else {
        my ($invlist_ref, $invmap_ref, undef, $default)
                                    = Unicode::UCD::prop_invmap('Case_Folding');
        for my $i (0 .. @$invlist_ref - 1 - 1) {
            next if $invmap_ref->[$i] == $default;
            my $adjust = -1;
            for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
                $adjust++;

                # Single-code point maps go to a 'C' type
                if (! ref $invmap_ref->[$i]) {
                    push @folds, sprintf("%04X; C; %04X\n",
                                        $j,
                                        $invmap_ref->[$i] + $adjust);
                }
                else {  # Multi-code point maps go to 'F'.  prop_invmap()
                        # guarantees that no adjustment is needed for these,
                        # as the range will contain just one element
                    push @folds, sprintf("%04X; F; %s\n",
                                        $j,
                                        join " ", map { sprintf "%04X", $_ }
                                                        @{$invmap_ref->[$i]});
                }
            }
        }
    }

    for (@folds) {
        chomp;

        # Lines look like (without the initial '#'
        #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
        # Get rid of comments, ignore blank or comment-only lines
        my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
        next unless length $line;
        my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

        my $from = hex $hex_from;

        # Perl only deals with C and F folds
        next if $fold_type ne 'C' and $fold_type ne 'F';

        # Get each code point in the range that participates in this line's fold.
        # The hash has keys of each code point in the range, and values of what it
        # folds to and what folds to it
        for my $i (0 .. @folded - 1) {
            my $hex_fold = $folded[$i];
            my $fold = hex $hex_fold;
            push @{$folded_closure{$fold}}, $from if $fold < 256;
            push @{$folded_closure{$from}}, $fold if $from < 256;

            if ($i < @folded-1
                && $fold < 256
                && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
            {
                push @hex_non_final_folds, $hex_fold;

                # Also add the upper case, which in the latin1 range folds to
                # $fold
                push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
            }
        }
    }

    # Now having read all the lines, combine them into the full closure of each
    # code point in the range by adding lists together that share a common
    # element
    foreach my $folded (keys %folded_closure) {
        foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
            push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
        }
    }
}

sub Is_Non_Latin1_Fold {
    my @return;

    foreach my $folded (keys %folded_closure) {
        push @return, sprintf("%X", $folded), if grep { $_ > 255 }
                                                     @{$folded_closure{$folded}};
    }
    return join("\n", @return) . "\n";
}

sub Is_Non_Final_Fold {
    return join("\n", @hex_non_final_folds) . "\n";
}

my @bits;   # Bit map for each code point

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only current ones (besides ASCII) without a
            # suffix are valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/\p{_Perl_Charname_Continue}/,
        } elsif ($name eq 'SPACE') {;
            $re = qr/\p{XPerlSpace}/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\p{XPosixWord}/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'QUOTEMETA') {
            $re = qr/\p{_Perl_Quotemeta}/;
        } elsif ($name eq 'NONLATIN1_FOLD') {
            $re = qr/\p{Is_Non_Latin1_Fold}/;
        } elsif ($name eq 'NON_FINAL_FOLD') {
            $re = qr/\p{Is_Non_Final_Fold}/;
        } elsif ($name eq 'IS_IN_SOME_FOLD') {
            $re = qr/\p{_Perl_Any_Folds}/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "(1U<<_CC_$property)";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab	1	#!perl -w
b72a36d4	2	use v5.15.8;
9c68f0ab KW	3	use strict;
9c68f0ab KW	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
9c68f0ab KW	24	my @properties = qw(
62841d05	25	NONLATIN1_FOLD
f4cdb42c KW	26	ALNUMC
f4cdb42c KW	27	ALPHA
e48bdfbc	28	ASCII
f4cdb42c	29	BLANK
9c68f0ab	30	CHARNAME_CONT
f4cdb42c KW	31	CNTRL
	32	DIGIT
	33	GRAPH
	34	IDFIRST
	35	LOWER
b72a36d4	36	NON_FINAL_FOLD
f4cdb42c KW	37	PRINT
	38	PSXSPC
	39	PUNCT
9a022f3a	40	QUOTEMETA
f4cdb42c KW	41	SPACE
	42	UPPER
	43	WORDCHAR
	44	XDIGIT
a0947d7b	45	VERTSPACE
f7993745	46	IS_IN_SOME_FOLD
9c68f0ab KW	47	);
9c68f0ab KW	48
00c072cf KW	49	# Read in the case fold mappings.
00c072cf KW	50	my %folded_closure;
62841d05	51	my @hex_non_final_folds;
dbe1ba6b KW	52	my @folds;
	53	use Unicode::UCD;
	54
62841d05 KW	55	BEGIN { # Have to do this at compile time because using user-defined \p{property}
62841d05 KW	56
4ef0bd69 KW	57	# Use the Unicode data file if we are on an ASCII platform (which its data
	58	# is for), and it is in the modern format (starting in Unicode 3.1.0) and
	59	# it is available. This avoids being affected by potential bugs
	60	# introduced by other layers of Perl
	61	my $file="lib/unicore/CaseFolding.txt";
	62
	63	if (ord('A') == 65
	64	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	65	&& open my $fh, "<", $file)
	66	{
	67	@folds = <$fh>;
	68	}
	69	else {
	70	my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b	71	= Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69 KW	72	for my $i (0 .. @$invlist_ref - 1 - 1) {
	73	next if $invmap_ref->[$i] == $default;
	74	my $adjust = -1;
	75	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	76	$adjust++;
	77
	78	# Single-code point maps go to a 'C' type
	79	if (! ref $invmap_ref->[$i]) {
	80	push @folds, sprintf("%04X; C; %04X\n",
	81	$j,
	82	$invmap_ref->[$i] + $adjust);
	83	}
	84	else { # Multi-code point maps go to 'F'. prop_invmap()
	85	# guarantees that no adjustment is needed for these,
	86	# as the range will contain just one element
	87	push @folds, sprintf("%04X; F; %s\n",
	88	$j,
	89	join " ", map { sprintf "%04X", $_ }
	90	@{$invmap_ref->[$i]});
	91	}
dbe1ba6b KW	92	}
	93	}
	94	}
dbe1ba6b	95
4ef0bd69 KW	96	for (@folds) {
4ef0bd69 KW	97	chomp;
00c072cf	98
4ef0bd69 KW	99	# Lines look like (without the initial '#'
	100	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	101	# Get rid of comments, ignore blank or comment-only lines
	102	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	103	next unless length $line;
	104	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf	105
4ef0bd69	106	my $from = hex $hex_from;
00c072cf	107
4ef0bd69 KW	108	# Perl only deals with C and F folds
4ef0bd69 KW	109	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf	110
4ef0bd69 KW	111	# Get each code point in the range that participates in this line's fold.
	112	# The hash has keys of each code point in the range, and values of what it
	113	# folds to and what folds to it
	114	for my $i (0 .. @folded - 1) {
	115	my $hex_fold = $folded[$i];
	116	my $fold = hex $hex_fold;
	117	push @{$folded_closure{$fold}}, $from if $fold < 256;
	118	push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05	119
07725c18 KW	120	if ($i < @folded-1
	121	&& $fold < 256
	122	&& ! grep { $_ eq $hex_fold } @hex_non_final_folds)
	123	{
	124	push @hex_non_final_folds, $hex_fold;
	125
	126	# Also add the upper case, which in the latin1 range folds to
	127	# $fold
	128	push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
	129	}
4ef0bd69	130	}
00c072cf	131	}
00c072cf	132
4ef0bd69 KW	133	# Now having read all the lines, combine them into the full closure of each
	134	# code point in the range by adding lists together that share a common
	135	# element
	136	foreach my $folded (keys %folded_closure) {
	137	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	138	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	139	}
00c072cf KW	140	}
	141	}
	142
62841d05 KW	143	sub Is_Non_Latin1_Fold {
	144	my @return;
	145
	146	foreach my $folded (keys %folded_closure) {
	147	push @return, sprintf("%X", $folded), if grep { $_ > 255 }
	148	@{$folded_closure{$folded}};
	149	}
	150	return join("\n", @return) . "\n";
00c072cf KW	151	}
00c072cf KW	152
62841d05 KW	153	sub Is_Non_Final_Fold {
	154	return join("\n", @hex_non_final_folds) . "\n";
	155	}
	156
	157	my @bits; # Bit map for each code point
	158
b1909af7	159	# For each character, calculate which properties it matches.
9c68f0ab KW	160	for my $ord (0..255) {
	161	my $char = chr($ord);
	162	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	163
b1909af7 KW	164	# Look at all the properties we care about here.
9c68f0ab KW	165	for my $property (@properties) {
	166	my $name = $property;
	167
b1909af7 KW	168	# Remove the suffix to get the actual property name.
b1909af7 KW	169	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	170	# If is a latin1 version, no further checking is needed.
	171	if (! ($name =~ s/_L1$//)) {
	172
b1909af7	173	# Here, isn't an _L1. If its _A, it's automatically false for
a0947d7b KW	174	# non-ascii. The only current ones (besides ASCII) without a
a0947d7b KW	175	# suffix are valid over the whole range.
9c68f0ab KW	176	next if $name =~ s/_A$// && $ord >= 128;
	177
	178	}
	179	my $re;
	180	if ($name eq 'PUNCT') {;
	181
	182	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	183	# just \pP outside it.
	184	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	185	} elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e	186	$re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab	187	} elsif ($name eq 'SPACE') {;
c6e8e4a9	188	$re = qr/\p{XPerlSpace}/;
9c68f0ab KW	189	} elsif ($name eq 'IDFIRST') {
	190	$re = qr/[_\p{Alpha}]/;
	191	} elsif ($name eq 'PSXSPC') {
	192	$re = qr/[\v\p{Space}]/;
	193	} elsif ($name eq 'WORDCHAR') {
c6e8e4a9	194	$re = qr/\p{XPosixWord}/;
9c68f0ab KW	195	} elsif ($name eq 'ALNUMC') {
9c68f0ab KW	196	# Like \w, but no underscore
aedd44b5	197	$re = qr/\p{Alnum}/;
9a022f3a KW	198	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	199	$re = qr/\p{_Perl_Quotemeta}/;
62841d05 KW	200	} elsif ($name eq 'NONLATIN1_FOLD') {
62841d05 KW	201	$re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4	202	} elsif ($name eq 'NON_FINAL_FOLD') {
62841d05	203	$re = qr/\p{Is_Non_Final_Fold}/;
f7993745 KW	204	} elsif ($name eq 'IS_IN_SOME_FOLD') {
f7993745 KW	205	$re = qr/\p{_Perl_Any_Folds}/;
9c68f0ab KW	206	} else { # The remainder have the same name and values as Unicode
	207	$re = eval "qr/\\p{$name}/";
	208	use Carp;
	209	carp $@ if ! defined $re;
	210	}
	211	#print "$ord, $name $property, $re\n";
07725c18	212	if ($char =~ $re) { # Add this property if matches
9c68f0ab	213	$bits[$ord] .= '\|' if $bits[$ord];
265c1f46	214	$bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab KW	215	}
	216	}
	217	#print __LINE__, " $ord $char $bits[$ord]\n";
	218	}
	219
	220	# Names of C0 controls
	221	my @C0 = qw (
	222	NUL
	223	SOH
	224	STX
	225	ETX
	226	EOT
	227	ENQ
	228	ACK
	229	BEL
	230	BS
	231	HT
	232	LF
	233	VT
	234	FF
	235	CR
	236	SO
	237	SI
	238	DLE
	239	DC1
	240	DC2
	241	DC3
	242	DC4
	243	NAK
	244	SYN
	245	ETB
	246	CAN
	247	EOM
	248	SUB
	249	ESC
	250	FS
	251	GS
	252	RS
	253	US
	254	);
	255
	256	# Names of C1 controls, plus the adjacent DEL
	257	my @C1 = qw(
	258	DEL
	259	PAD
	260	HOP
	261	BPH
	262	NBH
	263	IND
	264	NEL
	265	SSA
	266	ESA
	267	HTS
	268	HTJ
	269	VTS
	270	PLD
	271	PLU
	272	RI
	273	SS2
	274	SS3
	275	DCS
	276	PU1
	277	PU2
	278	STS
279	CCH
280	MW
281	SPA
282	EPA
283	SOS
284	SGC
285	SCI
286	CSI
287	ST
288	OSC
289	PM
290	APC
291	);
292
cc49830d	293	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7	294	{style => '*', by => $0,
dbe1ba6b	295	from => "property definitions"});
cfb8fd6a	296
9c68f0ab KW	297	# Output the table using fairly short names for each char.
	298	for my $ord (0..255) {
	299	my $name;
	300	if ($ord < 32) { # A C0 control
	301	$name = $C0[$ord];
	302	} elsif ($ord > 32 && $ord < 127) { # Graphic
	303	$name = "'" . chr($ord) . "'";
	304	} elsif ($ord >= 127 && $ord <= 0x9f) {
	305	$name = $C1[$ord - 127]; # A C1 control + DEL
	306	} else { # SPACE, or, if Latin1, shorten the name */
	307	use charnames();
	308	$name = charnames::viacode($ord);
	309	$name =~ s/LATIN CAPITAL LETTER //
	310	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	311	}
cfb8fd6a	312	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	313	}
9c68f0ab KW	314
cfb8fd6a	315	read_only_bottom_close_and_rename($out_fh)