[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use v5.15.8;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    NONLATIN1_FOLD
    ALNUMC
    ALPHA
    ASCII
    BLANK
    CHARNAME_CONT
    CNTRL
    DIGIT
    GRAPH
    IDFIRST
    LOWER
    NON_FINAL_FOLD
    PRINT
    PSXSPC
    PUNCT
    QUOTEMETA
    SPACE
    UPPER
    WORDCHAR
    XDIGIT
    IS_IN_SOME_FOLD
);

# Read in the case fold mappings.
my %folded_closure;
my @hex_non_final_folds;
my @folds;
use Unicode::UCD;

BEGIN { # Have to do this at compile time because using user-defined \p{property}

    # Use the Unicode data file if we are on an ASCII platform (which its data
    # is for), and it is in the modern format (starting in Unicode 3.1.0) and
    # it is available.  This avoids being affected by potential bugs
    # introduced by other layers of Perl
    my $file="lib/unicore/CaseFolding.txt";

    if (ord('A') == 65
        && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
        && open my $fh, "<", $file)
    {
        @folds = <$fh>;
    }
    else {
        my ($invlist_ref, $invmap_ref, undef, $default)
                                    = Unicode::UCD::prop_invmap('Case_Folding');
        for my $i (0 .. @$invlist_ref - 1 - 1) {
            next if $invmap_ref->[$i] == $default;
            my $adjust = -1;
            for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
                $adjust++;

                # Single-code point maps go to a 'C' type
                if (! ref $invmap_ref->[$i]) {
                    push @folds, sprintf("%04X; C; %04X\n",
                                        $j,
                                        $invmap_ref->[$i] + $adjust);
                }
                else {  # Multi-code point maps go to 'F'.  prop_invmap()
                        # guarantees that no adjustment is needed for these,
                        # as the range will contain just one element
                    push @folds, sprintf("%04X; F; %s\n",
                                        $j,
                                        join " ", map { sprintf "%04X", $_ }
                                                        @{$invmap_ref->[$i]});
                }
            }
        }
    }

    for (@folds) {
        chomp;

        # Lines look like (without the initial '#'
        #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
        # Get rid of comments, ignore blank or comment-only lines
        my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
        next unless length $line;
        my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

        my $from = hex $hex_from;

        # Perl only deals with C and F folds
        next if $fold_type ne 'C' and $fold_type ne 'F';

        # Get each code point in the range that participates in this line's fold.
        # The hash has keys of each code point in the range, and values of what it
        # folds to and what folds to it
        for my $i (0 .. @folded - 1) {
            my $hex_fold = $folded[$i];
            my $fold = hex $hex_fold;
            push @{$folded_closure{$fold}}, $from if $fold < 256;
            push @{$folded_closure{$from}}, $fold if $from < 256;

            if ($i < @folded-1
                && $fold < 256
                && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
            {
                push @hex_non_final_folds, $hex_fold;

                # Also add the upper case, which in the latin1 range folds to
                # $fold
                push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
            }
        }
    }

    # Now having read all the lines, combine them into the full closure of each
    # code point in the range by adding lists together that share a common
    # element
    foreach my $folded (keys %folded_closure) {
        foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
            push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
        }
    }
}

sub Is_Non_Latin1_Fold {
    my @return;

    foreach my $folded (keys %folded_closure) {
        push @return, sprintf("%X", $folded), if grep { $_ > 255 }
                                                     @{$folded_closure{$folded}};
    }
    return join("\n", @return) . "\n";
}

sub Is_Non_Final_Fold {
    return join("\n", @hex_non_final_folds) . "\n";
}

my @bits;   # Bit map for each code point

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only one current one (besides ASCII) without a
            # suffix is valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/\p{_Perl_Charname_Continue}/,
        } elsif ($name eq 'SPACE') {;
            $re = qr/\p{XPerlSpace}/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\p{XPosixWord}/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'QUOTEMETA') {
            $re = qr/\p{_Perl_Quotemeta}/;
        } elsif ($name eq 'NONLATIN1_FOLD') {
            $re = qr/\p{Is_Non_Latin1_Fold}/;
        } elsif ($name eq 'NON_FINAL_FOLD') {
            $re = qr/\p{Is_Non_Final_Fold}/;
        } elsif ($name eq 'IS_IN_SOME_FOLD') {
            $re = qr/\p{_Perl_Any_Folds}/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "(1U<<_CC_$property)";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab	1	#!perl -w
b72a36d4	2	use v5.15.8;
9c68f0ab KW	3	use strict;
9c68f0ab KW	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
9c68f0ab KW	24	my @properties = qw(
62841d05	25	NONLATIN1_FOLD
f4cdb42c KW	26	ALNUMC
f4cdb42c KW	27	ALPHA
e48bdfbc	28	ASCII
f4cdb42c	29	BLANK
9c68f0ab	30	CHARNAME_CONT
f4cdb42c KW	31	CNTRL
	32	DIGIT
	33	GRAPH
	34	IDFIRST
	35	LOWER
b72a36d4	36	NON_FINAL_FOLD
f4cdb42c KW	37	PRINT
	38	PSXSPC
	39	PUNCT
9a022f3a	40	QUOTEMETA
f4cdb42c KW	41	SPACE
	42	UPPER
	43	WORDCHAR
	44	XDIGIT
f7993745	45	IS_IN_SOME_FOLD
9c68f0ab KW	46	);
9c68f0ab KW	47
00c072cf KW	48	# Read in the case fold mappings.
00c072cf KW	49	my %folded_closure;
62841d05	50	my @hex_non_final_folds;
dbe1ba6b KW	51	my @folds;
	52	use Unicode::UCD;
	53
62841d05 KW	54	BEGIN { # Have to do this at compile time because using user-defined \p{property}
62841d05 KW	55
4ef0bd69 KW	56	# Use the Unicode data file if we are on an ASCII platform (which its data
	57	# is for), and it is in the modern format (starting in Unicode 3.1.0) and
	58	# it is available. This avoids being affected by potential bugs
	59	# introduced by other layers of Perl
	60	my $file="lib/unicore/CaseFolding.txt";
	61
	62	if (ord('A') == 65
	63	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	64	&& open my $fh, "<", $file)
	65	{
	66	@folds = <$fh>;
	67	}
	68	else {
	69	my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b	70	= Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69 KW	71	for my $i (0 .. @$invlist_ref - 1 - 1) {
	72	next if $invmap_ref->[$i] == $default;
	73	my $adjust = -1;
	74	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	75	$adjust++;
	76
	77	# Single-code point maps go to a 'C' type
	78	if (! ref $invmap_ref->[$i]) {
	79	push @folds, sprintf("%04X; C; %04X\n",
	80	$j,
	81	$invmap_ref->[$i] + $adjust);
	82	}
	83	else { # Multi-code point maps go to 'F'. prop_invmap()
	84	# guarantees that no adjustment is needed for these,
	85	# as the range will contain just one element
	86	push @folds, sprintf("%04X; F; %s\n",
	87	$j,
	88	join " ", map { sprintf "%04X", $_ }
	89	@{$invmap_ref->[$i]});
	90	}
dbe1ba6b KW	91	}
	92	}
	93	}
dbe1ba6b	94
4ef0bd69 KW	95	for (@folds) {
4ef0bd69 KW	96	chomp;
00c072cf	97
4ef0bd69 KW	98	# Lines look like (without the initial '#'
	99	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	100	# Get rid of comments, ignore blank or comment-only lines
	101	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	102	next unless length $line;
	103	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf	104
4ef0bd69	105	my $from = hex $hex_from;
00c072cf	106
4ef0bd69 KW	107	# Perl only deals with C and F folds
4ef0bd69 KW	108	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf	109
4ef0bd69 KW	110	# Get each code point in the range that participates in this line's fold.
	111	# The hash has keys of each code point in the range, and values of what it
	112	# folds to and what folds to it
	113	for my $i (0 .. @folded - 1) {
	114	my $hex_fold = $folded[$i];
	115	my $fold = hex $hex_fold;
	116	push @{$folded_closure{$fold}}, $from if $fold < 256;
	117	push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05	118
07725c18 KW	119	if ($i < @folded-1
	120	&& $fold < 256
	121	&& ! grep { $_ eq $hex_fold } @hex_non_final_folds)
	122	{
	123	push @hex_non_final_folds, $hex_fold;
	124
	125	# Also add the upper case, which in the latin1 range folds to
	126	# $fold
	127	push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
	128	}
4ef0bd69	129	}
00c072cf	130	}
00c072cf	131
4ef0bd69 KW	132	# Now having read all the lines, combine them into the full closure of each
	133	# code point in the range by adding lists together that share a common
	134	# element
	135	foreach my $folded (keys %folded_closure) {
	136	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	137	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	138	}
00c072cf KW	139	}
	140	}
	141
62841d05 KW	142	sub Is_Non_Latin1_Fold {
	143	my @return;
	144
	145	foreach my $folded (keys %folded_closure) {
	146	push @return, sprintf("%X", $folded), if grep { $_ > 255 }
	147	@{$folded_closure{$folded}};
	148	}
	149	return join("\n", @return) . "\n";
00c072cf KW	150	}
00c072cf KW	151
62841d05 KW	152	sub Is_Non_Final_Fold {
	153	return join("\n", @hex_non_final_folds) . "\n";
	154	}
	155
	156	my @bits; # Bit map for each code point
	157
b1909af7	158	# For each character, calculate which properties it matches.
9c68f0ab KW	159	for my $ord (0..255) {
	160	my $char = chr($ord);
	161	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	162
b1909af7 KW	163	# Look at all the properties we care about here.
9c68f0ab KW	164	for my $property (@properties) {
	165	my $name = $property;
	166
b1909af7 KW	167	# Remove the suffix to get the actual property name.
b1909af7 KW	168	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	169	# If is a latin1 version, no further checking is needed.
	170	if (! ($name =~ s/_L1$//)) {
	171
b1909af7	172	# Here, isn't an _L1. If its _A, it's automatically false for
e48bdfbc KW	173	# non-ascii. The only one current one (besides ASCII) without a
e48bdfbc KW	174	# suffix is valid over the whole range.
9c68f0ab KW	175	next if $name =~ s/_A$// && $ord >= 128;
	176
	177	}
	178	my $re;
	179	if ($name eq 'PUNCT') {;
	180
	181	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	182	# just \pP outside it.
	183	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	184	} elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e	185	$re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab	186	} elsif ($name eq 'SPACE') {;
c6e8e4a9	187	$re = qr/\p{XPerlSpace}/;
9c68f0ab KW	188	} elsif ($name eq 'IDFIRST') {
	189	$re = qr/[_\p{Alpha}]/;
	190	} elsif ($name eq 'PSXSPC') {
	191	$re = qr/[\v\p{Space}]/;
	192	} elsif ($name eq 'WORDCHAR') {
c6e8e4a9	193	$re = qr/\p{XPosixWord}/;
9c68f0ab KW	194	} elsif ($name eq 'ALNUMC') {
9c68f0ab KW	195	# Like \w, but no underscore
aedd44b5	196	$re = qr/\p{Alnum}/;
9a022f3a KW	197	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	198	$re = qr/\p{_Perl_Quotemeta}/;
62841d05 KW	199	} elsif ($name eq 'NONLATIN1_FOLD') {
62841d05 KW	200	$re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4	201	} elsif ($name eq 'NON_FINAL_FOLD') {
62841d05	202	$re = qr/\p{Is_Non_Final_Fold}/;
f7993745 KW	203	} elsif ($name eq 'IS_IN_SOME_FOLD') {
f7993745 KW	204	$re = qr/\p{_Perl_Any_Folds}/;
9c68f0ab KW	205	} else { # The remainder have the same name and values as Unicode
	206	$re = eval "qr/\\p{$name}/";
	207	use Carp;
	208	carp $@ if ! defined $re;
	209	}
	210	#print "$ord, $name $property, $re\n";
07725c18	211	if ($char =~ $re) { # Add this property if matches
9c68f0ab	212	$bits[$ord] .= '\|' if $bits[$ord];
265c1f46	213	$bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab KW	214	}
	215	}
	216	#print __LINE__, " $ord $char $bits[$ord]\n";
	217	}
	218
	219	# Names of C0 controls
	220	my @C0 = qw (
	221	NUL
	222	SOH
	223	STX
	224	ETX
	225	EOT
	226	ENQ
	227	ACK
	228	BEL
	229	BS
	230	HT
	231	LF
	232	VT
	233	FF
	234	CR
	235	SO
	236	SI
	237	DLE
	238	DC1
	239	DC2
	240	DC3
	241	DC4
	242	NAK
	243	SYN
	244	ETB
	245	CAN
	246	EOM
	247	SUB
	248	ESC
	249	FS
	250	GS
	251	RS
	252	US
	253	);
	254
	255	# Names of C1 controls, plus the adjacent DEL
	256	my @C1 = qw(
	257	DEL
	258	PAD
	259	HOP
	260	BPH
	261	NBH
	262	IND
	263	NEL
	264	SSA
	265	ESA
	266	HTS
	267	HTJ
	268	VTS
	269	PLD
	270	PLU
	271	RI
	272	SS2
	273	SS3
	274	DCS
	275	PU1
	276	PU2
	277	STS
278	CCH
279	MW
280	SPA
281	EPA
282	SOS
283	SGC
284	SCI
285	CSI
286	ST
287	OSC
288	PM
289	APC
290	);
291
cc49830d	292	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7	293	{style => '*', by => $0,
dbe1ba6b	294	from => "property definitions"});
cfb8fd6a	295
9c68f0ab KW	296	# Output the table using fairly short names for each char.
	297	for my $ord (0..255) {
	298	my $name;
	299	if ($ord < 32) { # A C0 control
	300	$name = $C0[$ord];
	301	} elsif ($ord > 32 && $ord < 127) { # Graphic
	302	$name = "'" . chr($ord) . "'";
	303	} elsif ($ord >= 127 && $ord <= 0x9f) {
	304	$name = $C1[$ord - 127]; # A C1 control + DEL
	305	} else { # SPACE, or, if Latin1, shorten the name */
	306	use charnames();
	307	$name = charnames::viacode($ord);
	308	$name =~ s/LATIN CAPITAL LETTER //
	309	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	310	}
cfb8fd6a	311	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	312	}
9c68f0ab KW	313
cfb8fd6a	314	read_only_bottom_close_and_rename($out_fh)