[perl5.git] / regen / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;
require 'regen/regen_lib.pl';

# This program outputs l1_charclass_tab.h, which defines the guts of the
# PL_charclass table.  Each line is a bit map of properties that the Unicode
# code point at the corresponding position in the table array has.  The first
# line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
# an application to see if the code point "i" has a particular property, it
# just does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# This program need be run only when adding new properties to it, or upon a
# new Unicode release, to make sure things haven't been changed by it.

my @properties = qw(
    ALNUMC_A
    ALNUMC_L1
    ALPHA_A
    ALPHA_L1
    BLANK_A
    BLANK_L1
    CHARNAME_CONT
    CNTRL_A
    CNTRL_L1
    DIGIT_A
    GRAPH_A
    GRAPH_L1
    IDFIRST_A
    IDFIRST_L1
    LOWER_A
    LOWER_L1
    PRINT_A
    PRINT_L1
    PSXSPC_A
    PSXSPC_L1
    PUNCT_A
    PUNCT_L1
    SPACE_A
    SPACE_L1
    UPPER_A
    UPPER_L1
    WORDCHAR_A
    WORDCHAR_L1
    XDIGIT_A
    QUOTEMETA
);

# Read in the case fold mappings.
my %folded_closure;
my $file="lib/unicore/CaseFolding.txt";
my @folds;
use Unicode::UCD;

# Use the Unicode data file if we are on an ASCII platform (which its data is
# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
# available.  This avoids being affected by potential bugs introduced by other
# layers of Perl
if (ord('A') == 65
    && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
    && open my $fh, "<", $file)
{
    @folds = <$fh>;
}
else {
    my ($invlist_ref, $invmap_ref, undef, $default)
                                    = Unicode::UCD::prop_invmap('Case_Folding');
    for my $i (0 .. @$invlist_ref - 1 - 1) {
        next if $invmap_ref->[$i] == $default;
        my $adjust = -1;
        for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
            $adjust++;

            # Single-code point maps go to a 'C' type
            if (! ref $invmap_ref->[$i]) {
                push @folds, sprintf("%04X; C; %04X\n",
                                     $j,
                                     $invmap_ref->[$i] + $adjust);
            }
            else {  # Multi-code point maps go to 'F'.  prop_invmap()
                    # guarantees that no adjustment is needed for these,
                    # as the range will contain just one element
                push @folds, sprintf("%04X; F; %s\n",
                                    $j,
                                    join " ", map { sprintf "%04X", $_ }
                                                    @{$invmap_ref->[$i]});
            }
        }
    }
}

for (@folds) {
    chomp;

    # Lines look like (without the initial '#'
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    # Get rid of comments, ignore blank or comment-only lines
    my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
    next unless length $line;
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    # Perl only deals with C and F folds
    next if $fold_type ne 'C' and $fold_type ne 'F';

    # Get each code point in the range that participates in this line's fold.
    # The hash has keys of each code point in the range, and values of what it
    # folds to and what folds to it
    foreach my $hex_fold (@folded) {
        my $fold = hex $hex_fold;
        push @{$folded_closure{$fold}}, $from if $fold < 256;
        push @{$folded_closure{$from}}, $fold if $from < 256;
    }
}

# Now having read all the lines, combine them into the full closure of each
# code point in the range by adding lists together that share a common element
foreach my $folded (keys %folded_closure) {
    foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
        push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
    }
}

my @bits;   # Bit map for each code point

foreach my $folded (keys %folded_closure) {
    $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
                                                @{$folded_closure{$folded}};
}

# For each character, calculate which properties it matches.
for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!

    # Look at all the properties we care about here.
    for my $property (@properties) {
        my $name = $property;

        # Remove the suffix to get the actual property name.
        # Currently the suffixes are '_L1', '_A', and none.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an _L1.  If its _A, it's automatically false for
            # non-ascii.  The only one current one without a suffix is valid
            # over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\p{XPosixWord} ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\p{XPerlSpace}/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\p{XPosixWord}/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/\p{Alnum}/;
        } elsif ($name eq 'OCTAL') {
            $re = qr/[0-7]/;
        } elsif ($name eq 'QUOTEMETA') {
            $re = qr/\p{_Perl_Quotemeta}/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

my $out_fh = open_new('l1_char_class_tab.h', '>',
		      {style => '*', by => $0,
                      from => "property definitions"});

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
cfb8fd6a	5	require 'regen/regen_lib.pl';
9c68f0ab	6
b1909af7 KW	7	# This program outputs l1_charclass_tab.h, which defines the guts of the
b1909af7 KW	8	# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1	9	# code point at the corresponding position in the table array has. The first
b1909af7	10	# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1 KW	11	# an application to see if the code point "i" has a particular property, it
8d4ab2a1 KW	12	# just does
9c68f0ab KW	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
b1909af7 KW	21	# This program need be run only when adding new properties to it, or upon a
b1909af7 KW	22	# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab KW	23
	24	my @properties = qw(
	25	ALNUMC_A
	26	ALNUMC_L1
	27	ALPHA_A
	28	ALPHA_L1
	29	BLANK_A
	30	BLANK_L1
	31	CHARNAME_CONT
	32	CNTRL_A
	33	CNTRL_L1
	34	DIGIT_A
	35	GRAPH_A
	36	GRAPH_L1
	37	IDFIRST_A
	38	IDFIRST_L1
	39	LOWER_A
	40	LOWER_L1
9c68f0ab KW	41	PRINT_A
	42	PRINT_L1
	43	PSXSPC_A
	44	PSXSPC_L1
	45	PUNCT_A
	46	PUNCT_L1
	47	SPACE_A
	48	SPACE_L1
	49	UPPER_A
	50	UPPER_L1
	51	WORDCHAR_A
	52	WORDCHAR_L1
	53	XDIGIT_A
9a022f3a	54	QUOTEMETA
9c68f0ab KW	55	);
9c68f0ab KW	56
00c072cf KW	57	# Read in the case fold mappings.
00c072cf KW	58	my %folded_closure;
cfb8fd6a	59	my $file="lib/unicore/CaseFolding.txt";
dbe1ba6b KW	60	my @folds;
	61	use Unicode::UCD;
	62
	63	# Use the Unicode data file if we are on an ASCII platform (which its data is
	64	# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
	65	# available. This avoids being affected by potential bugs introduced by other
	66	# layers of Perl
	67	if (ord('A') == 65
	68	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
	69	&& open my $fh, "<", $file)
	70	{
	71	@folds = <$fh>;
	72	}
	73	else {
	74	my ($invlist_ref, $invmap_ref, undef, $default)
	75	= Unicode::UCD::prop_invmap('Case_Folding');
	76	for my $i (0 .. @$invlist_ref - 1 - 1) {
	77	next if $invmap_ref->[$i] == $default;
	78	my $adjust = -1;
	79	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	80	$adjust++;
	81
	82	# Single-code point maps go to a 'C' type
	83	if (! ref $invmap_ref->[$i]) {
	84	push @folds, sprintf("%04X; C; %04X\n",
	85	$j,
	86	$invmap_ref->[$i] + $adjust);
	87	}
	88	else { # Multi-code point maps go to 'F'. prop_invmap()
	89	# guarantees that no adjustment is needed for these,
	90	# as the range will contain just one element
	91	push @folds, sprintf("%04X; F; %s\n",
	92	$j,
	93	join " ", map { sprintf "%04X", $_ }
	94	@{$invmap_ref->[$i]});
	95	}
	96	}
	97	}
	98	}
	99
	100	for (@folds) {
00c072cf KW	101	chomp;
	102
	103	# Lines look like (without the initial '#'
	104	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9 KW	105	# Get rid of comments, ignore blank or comment-only lines
	106	my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
	107	next unless length $line;
00c072cf KW	108	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	109
	110	my $from = hex $hex_from;
	111
	112	# Perl only deals with C and F folds
33e4950c	113	next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf KW	114
	115	# Get each code point in the range that participates in this line's fold.
	116	# The hash has keys of each code point in the range, and values of what it
	117	# folds to and what folds to it
	118	foreach my $hex_fold (@folded) {
	119	my $fold = hex $hex_fold;
	120	push @{$folded_closure{$fold}}, $from if $fold < 256;
	121	push @{$folded_closure{$from}}, $fold if $from < 256;
	122	}
	123	}
	124
	125	# Now having read all the lines, combine them into the full closure of each
	126	# code point in the range by adding lists together that share a common element
	127	foreach my $folded (keys %folded_closure) {
	128	foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
	129	push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
	130	}
	131	}
	132
9c68f0ab KW	133	my @bits; # Bit map for each code point
9c68f0ab KW	134
00c072cf KW	135	foreach my $folded (keys %folded_closure) {
	136	$bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
	137	@{$folded_closure{$folded}};
	138	}
	139
b1909af7	140	# For each character, calculate which properties it matches.
9c68f0ab KW	141	for my $ord (0..255) {
	142	my $char = chr($ord);
	143	utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7 KW	144
b1909af7 KW	145	# Look at all the properties we care about here.
9c68f0ab KW	146	for my $property (@properties) {
	147	my $name = $property;
	148
b1909af7 KW	149	# Remove the suffix to get the actual property name.
b1909af7 KW	150	# Currently the suffixes are '_L1', '_A', and none.
9c68f0ab KW	151	# If is a latin1 version, no further checking is needed.
	152	if (! ($name =~ s/_L1$//)) {
	153
b1909af7 KW	154	# Here, isn't an _L1. If its _A, it's automatically false for
	155	# non-ascii. The only one current one without a suffix is valid
	156	# over the whole range.
9c68f0ab KW	157	next if $name =~ s/_A$// && $ord >= 128;
	158
	159	}
	160	my $re;
	161	if ($name eq 'PUNCT') {;
	162
	163	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
	164	# just \pP outside it.
	165	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
	166	} elsif ($name eq 'CHARNAME_CONT') {;
c6e8e4a9	167	$re = qr/[-\p{XPosixWord} ():\xa0]/;
9c68f0ab	168	} elsif ($name eq 'SPACE') {;
c6e8e4a9	169	$re = qr/\p{XPerlSpace}/;
9c68f0ab KW	170	} elsif ($name eq 'IDFIRST') {
	171	$re = qr/[_\p{Alpha}]/;
	172	} elsif ($name eq 'PSXSPC') {
	173	$re = qr/[\v\p{Space}]/;
	174	} elsif ($name eq 'WORDCHAR') {
c6e8e4a9	175	$re = qr/\p{XPosixWord}/;
9c68f0ab KW	176	} elsif ($name eq 'ALNUMC') {
9c68f0ab KW	177	# Like \w, but no underscore
aedd44b5	178	$re = qr/\p{Alnum}/;
9c68f0ab KW	179	} elsif ($name eq 'OCTAL') {
9c68f0ab KW	180	$re = qr/[0-7]/;
9a022f3a KW	181	} elsif ($name eq 'QUOTEMETA') {
9a022f3a KW	182	$re = qr/\p{_Perl_Quotemeta}/;
9c68f0ab KW	183	} else { # The remainder have the same name and values as Unicode
	184	$re = eval "qr/\\p{$name}/";
	185	use Carp;
	186	carp $@ if ! defined $re;
	187	}
	188	#print "$ord, $name $property, $re\n";
	189	if ($char =~ $re) { # Add this property if matches
	190	$bits[$ord] .= '\|' if $bits[$ord];
	191	$bits[$ord] .= "_CC_$property";
	192	}
	193	}
	194	#print __LINE__, " $ord $char $bits[$ord]\n";
	195	}
	196
	197	# Names of C0 controls
	198	my @C0 = qw (
	199	NUL
	200	SOH
	201	STX
	202	ETX
	203	EOT
	204	ENQ
	205	ACK
	206	BEL
	207	BS
	208	HT
	209	LF
	210	VT
	211	FF
	212	CR
	213	SO
	214	SI
	215	DLE
	216	DC1
	217	DC2
	218	DC3
	219	DC4
	220	NAK
	221	SYN
	222	ETB
	223	CAN
	224	EOM
	225	SUB
	226	ESC
	227	FS
	228	GS
	229	RS
	230	US
	231	);
	232
	233	# Names of C1 controls, plus the adjacent DEL
	234	my @C1 = qw(
	235	DEL
	236	PAD
	237	HOP
	238	BPH
	239	NBH
	240	IND
	241	NEL
	242	SSA
	243	ESA
	244	HTS
	245	HTJ
	246	VTS
247	PLD
248	PLU
249	RI
250	SS2
251	SS3
252	DCS
253	PU1
254	PU2
255	STS
256	CCH
257	MW
258	SPA
259	EPA
260	SOS
261	SGC
262	SCI
263	CSI
264	ST
265	OSC
266	PM
267	APC
268	);
269
cc49830d	270	my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7	271	{style => '*', by => $0,
dbe1ba6b	272	from => "property definitions"});
cfb8fd6a	273
9c68f0ab KW	274	# Output the table using fairly short names for each char.
	275	for my $ord (0..255) {
	276	my $name;
	277	if ($ord < 32) { # A C0 control
	278	$name = $C0[$ord];
	279	} elsif ($ord > 32 && $ord < 127) { # Graphic
	280	$name = "'" . chr($ord) . "'";
	281	} elsif ($ord >= 127 && $ord <= 0x9f) {
	282	$name = $C1[$ord - 127]; # A C1 control + DEL
	283	} else { # SPACE, or, if Latin1, shorten the name */
	284	use charnames();
	285	$name = charnames::viacode($ord);
	286	$name =~ s/LATIN CAPITAL LETTER //
	287	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
	288	}
cfb8fd6a	289	printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab KW	290	}
9c68f0ab KW	291
cfb8fd6a	292	read_only_bottom_close_and_rename($out_fh)