[perl5.git] / regen / mk_invlists.pl

#!perl -w
use 5.015;
use strict;
use warnings;
use Unicode::UCD qw(prop_invlist prop_invmap);
require 'regen/regen_lib.pl';

# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
# light-weight fast wrapper to make them usable at run-time.

# As such, this code knows about the internal structure of these lists, and
# any change made to that has to be done here as well.  A random number stored
# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed.  Currently that
# random number is:
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

my $out_fh = open_new('charclass_invlists.h', '>',
		      {style => '*', by => $0,
                      from => "Unicode::UCD"});

print $out_fh "/* See the generating file for comments */\n\n";

my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );

sub output_invlist ($$) {
    my $name = shift;
    my $invlist = shift;     # Reference to inversion list array

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY'
                                             && @$invlist;

    # Output the inversion list $invlist using the name $name for it.
    # It is output in the exact internal form for inversion lists.

    # Is the last element of the header 0, or 1 ?
    my $zero_or_one = 0;
    if ($invlist->[0] != 0) {
        unshift @$invlist, 0;
        $zero_or_one = 1;
    }
    my $count = @$invlist;

    print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
    print $out_fh "\nstatic const UV ${name}_invlist[] = {\n";

    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t   1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine.  Do the final
    # element separately
    for my $i (0 .. @$invlist - 1 - 1) {
        print $out_fh "\t$invlist->[$i],\n";
    }

    # The final element does not have a trailing comma, as C can't handle it.
    print $out_fh "\t$invlist->[-1]\n";

    print $out_fh "};\n";
    print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};

}

sub mk_invlist_from_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points

    my $list_ref = shift;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        if ($invlist[-1] == $list_ref->[$i]) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
        }
    }
    return @invlist;
}

# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al';
my @has_multi_char_fold;
my @is_non_final_fold;

for my $i (0 .. @$folds_ref - 1) {
    next unless ref $folds_ref->[$i];   # Skip single-char folds
    push @has_multi_char_fold, $cp_ref->[$i];

    # Add to the non-finals list each code point that is in a non-final
    # position
    for my $j (0 .. @{$folds_ref->[$i]} - 2) {
        push @is_non_final_fold, $folds_ref->[$i][$j]
                unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
    }
}

sub _Perl_Multi_Char_Folds {
    @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
    return mk_invlist_from_cp_list(\@has_multi_char_fold);
}

sub _Perl_Non_Final_Folds {
    @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
    return mk_invlist_from_cp_list(\@is_non_final_fold);
}

sub UpperLatin1 {
    my @upper_latin1;
    for my $i (0 .. 255) {  # Complicated because of EBCDIC
        push @upper_latin1, $i if chr($i) =~ /[[:^ascii:]]/;
    }
    return mk_invlist_from_cp_list(\@upper_latin1);
}

output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);

# We construct lists for all the POSIX and backslash sequence character
# classes in two forms:
#   1) ones which match only in the ASCII range
#   2) ones which match either in the Latin1 range, or the entire Unicode range
#
# These get compiled in, and hence affect the memory footprint of every Perl
# program, even those not using Unicode.  To minimize the size, currently
# the Latin1 version is generated for the beyond ASCII range except for those
# lists that are quite small for the entire range, such as for \s, which is 22
# UVs long plus 4 UVs (currently) for the header.
#
# To save even more memory, the ASCII versions could be derived from the
# larger ones at runtime, saving some memory (minus the expense of the machine
# instructions to do so), but these are all small anyway, so their total is
# about 100 UVs.
#
# In the list of properties below that get generated, the L1 prefix is a fake
# property that means just the Latin1 range of the full property (whose name
# has an X prefix instead of L1).
#
# An initial & means to use the subroutine from this file instead of an
# official inversion list.

for my $prop (qw(
                ASCII
                L1Cased
		VertSpace
                PerlSpace
                    XPerlSpace
                PosixAlnum
                    L1PosixAlnum
                PosixAlpha
                    L1PosixAlpha
                PosixBlank
                    XPosixBlank
                PosixCntrl
                    XPosixCntrl
                PosixDigit
                PosixGraph
                    L1PosixGraph
                PosixLower
                    L1PosixLower
                PosixPrint
                    L1PosixPrint
                PosixPunct
                    L1PosixPunct
                PosixSpace
                    XPosixSpace
                PosixUpper
                    L1PosixUpper
                PosixWord
                    L1PosixWord
                PosixXDigit
                    XPosixXDigit
                &NonL1_Perl_Non_Final_Folds
                &_Perl_Multi_Char_Folds
                &UpperLatin1
    )
) {

    # For the Latin1 properties, we change to use the eXtended version of the
    # base property, then go through the result and get rid of everything not
    # in Latin1 (above 255).  Actually, we retain the element for the range
    # that crosses the 255/256 boundary if it is one that matches the
    # property.  For example, in the Word property, there is a range of code
    # points that start at U+00F8 and goes through U+02C1.  Instead of
    # artificially cutting that off at 256 because 256 is the first code point
    # above Latin1, we let the range go to its natural ending.  That gives us
    # extra information with no added space taken.  But if the range that
    # crosses the boundary is one that doesn't match the property, we don't
    # start a new range above 255, as that could be construed as going to
    # infinity.  For example, the Upper property doesn't include the character
    # at 255, but does include the one at 256.  We don't include the 256 one.
    my $prop_name = $prop;
    my $is_local_sub = $prop_name =~ s/^&//;
    my $lookup_prop = $prop_name;
    my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
    my $nonl1_only = 0;
    $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

    my @invlist;
    if ($is_local_sub) {
        @invlist = eval $lookup_prop;
    }
    else {
        @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
    }
    die "Could not find inversion list for '$lookup_prop'" unless @invlist;

    if ($l1_only) {
        for my $i (0 .. @invlist - 1 - 1) {
            if ($invlist[$i] > 255) {

                # In an inversion list, even-numbered elements give the code
                # points that begin ranges that match the property;
                # odd-numbered give ones that begin ranges that don't match.
                # If $i is odd, we are at the first code point above 255 that
                # doesn't match, which means the range it is ending does
                # match, and crosses the 255/256 boundary.  We want to include
                # this ending point, so increment $i, so the splice below
                # includes it.  Conversely, if $i is even, it is the first
                # code point above 255 that matches, which means there was no
                # matching range that crossed the boundary, and we don't want
                # to include this code point, so splice before it.
                $i++ if $i % 2 != 0;

                # Remove everything past this.
                splice @invlist, $i;
                last;
            }
        }
    }
    elsif ($nonl1_only) {
        my $found_nonl1 = 0;
        for my $i (0 .. @invlist - 1 - 1) {
            next if $invlist[$i] < 256;

            # Here, we have the first element in the array that indicates an
            # element above Latin1.  Get rid of all previous ones.
            splice @invlist, 0, $i;

            # If this one's index is not divisible by 2, it means that this
            # element is inverting away from being in the list, which means
            # all code points from 256 to this one are in this list.
            unshift @invlist, 256 if $i % 2 != 0;
            $found_nonl1 = 1;
            last;
        }
        die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
    }

    output_invlist($prop_name, \@invlist);
}

read_only_bottom_close_and_rename($out_fh)
Commit	Line	Data
9d9177be KW	1	#!perl -w
	2	use 5.015;
	3	use strict;
	4	use warnings;
a02047bf	5	use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be KW	6	require 'regen/regen_lib.pl';
	7
	8	# This program outputs charclass_invlists.h, which contains various inversion
	9	# lists in the form of C arrays that are to be used as-is for inversion lists.
	10	# Thus, the lists it contains are essentially pre-compiled, and need only a
	11	# light-weight fast wrapper to make them usable at run-time.
	12
	13	# As such, this code knows about the internal structure of these lists, and
	14	# any change made to that has to be done here as well. A random number stored
	15	# in the headers is used to minimize the possibility of things getting
	16	# out-of-sync, or the wrong data structure being passed. Currently that
	17	# random number is:
0a07b44b	18	my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be KW	19
	20	my $out_fh = open_new('charclass_invlists.h', '>',
	21	{style => '*', by => $0,
	22	from => "Unicode::UCD"});
	23
	24	print $out_fh "/* See the generating file for comments */\n\n";
	25
015bb97c CB	26	my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
015bb97c CB	27
9d9177be KW	28	sub output_invlist ($$) {
	29	my $name = shift;
	30	my $invlist = shift; # Reference to inversion list array
	31
76d3994c KW	32	die "No inversion list for $name" unless defined $invlist
	33	&& ref $invlist eq 'ARRAY'
	34	&& @$invlist;
	35
9d9177be KW	36	# Output the inversion list $invlist using the name $name for it.
	37	# It is output in the exact internal form for inversion lists.
	38
a0316a6c KW	39	# Is the last element of the header 0, or 1 ?
a0316a6c KW	40	my $zero_or_one = 0;
a0316a6c KW	41	if ($invlist->[0] != 0) {
a0316a6c KW	42	unshift @$invlist, 0;
9d9177be KW	43	$zero_or_one = 1;
9d9177be KW	44	}
0a07b44b	45	my $count = @$invlist;
9d9177be	46
015bb97c	47	print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
af495420	48	print $out_fh "\nstatic const UV ${name}_invlist[] = {\n";
9d9177be	49
a0316a6c	50	print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be KW	51	print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
9d9177be KW	52	print $out_fh "\t", $zero_or_one,
a0316a6c KW	53	",\t/* 0 if the list starts at 0;",
a0316a6c KW	54	"\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be KW	55
	56	# The main body are the UVs passed in to this routine. Do the final
	57	# element separately
	58	for my $i (0 .. @$invlist - 1 - 1) {
	59	print $out_fh "\t$invlist->[$i],\n";
	60	}
	61
	62	# The final element does not have a trailing comma, as C can't handle it.
	63	print $out_fh "\t$invlist->[-1]\n";
	64
	65	print $out_fh "};\n";
015bb97c CB	66	print $out_fh "\n#endif\n" unless exists $include_in_ext_re{$name};
015bb97c CB	67
9d9177be KW	68	}
9d9177be KW	69
a02047bf KW	70	sub mk_invlist_from_cp_list {
	71
	72	# Returns an inversion list constructed from the sorted input array of
	73	# code points
	74
	75	my $list_ref = shift;
	76
	77	# Initialize to just the first element
	78	my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
	79
	80	# For each succeeding element, if it extends the previous range, adjust
	81	# up, otherwise add it.
	82	for my $i (1 .. @$list_ref - 1) {
	83	if ($invlist[-1] == $list_ref->[$i]) {
	84	$invlist[-1]++;
	85	}
	86	else {
	87	push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
	88	}
	89	}
	90	return @invlist;
	91	}
	92
	93	# Read in the Case Folding rules, and construct arrays of code points for the
	94	# properties we need.
	95	my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
	96	die "Could not find inversion map for Case_Folding" unless defined $format;
	97	die "Incorrect format '$format' for Case_Folding inversion map"
	98	unless $format eq 'al';
	99	my @has_multi_char_fold;
	100	my @is_non_final_fold;
	101
	102	for my $i (0 .. @$folds_ref - 1) {
	103	next unless ref $folds_ref->[$i]; # Skip single-char folds
	104	push @has_multi_char_fold, $cp_ref->[$i];
	105
b6a6e956	106	# Add to the non-finals list each code point that is in a non-final
a02047bf KW	107	# position
	108	for my $j (0 .. @{$folds_ref->[$i]} - 2) {
	109	push @is_non_final_fold, $folds_ref->[$i][$j]
	110	unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
	111	}
	112	}
	113
	114	sub _Perl_Multi_Char_Folds {
	115	@has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
	116	return mk_invlist_from_cp_list(\@has_multi_char_fold);
	117	}
	118
	119	sub _Perl_Non_Final_Folds {
	120	@is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
	121	return mk_invlist_from_cp_list(\@is_non_final_fold);
	122	}
	123
892d8259 KW	124	sub UpperLatin1 {
	125	my @upper_latin1;
	126	for my $i (0 .. 255) { # Complicated because of EBCDIC
	127	push @upper_latin1, $i if chr($i) =~ /[[:^ascii:]]/;
	128	}
	129	return mk_invlist_from_cp_list(\@upper_latin1);
	130	}
	131
9d9177be KW	132	output_invlist("Latin1", [ 0, 256 ]);
	133	output_invlist("AboveLatin1", [ 256 ]);
	134
3f427fd9 KW	135	# We construct lists for all the POSIX and backslash sequence character
	136	# classes in two forms:
	137	# 1) ones which match only in the ASCII range
	138	# 2) ones which match either in the Latin1 range, or the entire Unicode range
	139	#
	140	# These get compiled in, and hence affect the memory footprint of every Perl
	141	# program, even those not using Unicode. To minimize the size, currently
	142	# the Latin1 version is generated for the beyond ASCII range except for those
	143	# lists that are quite small for the entire range, such as for \s, which is 22
	144	# UVs long plus 4 UVs (currently) for the header.
	145	#
	146	# To save even more memory, the ASCII versions could be derived from the
	147	# larger ones at runtime, saving some memory (minus the expense of the machine
	148	# instructions to do so), but these are all small anyway, so their total is
	149	# about 100 UVs.
	150	#
	151	# In the list of properties below that get generated, the L1 prefix is a fake
	152	# property that means just the Latin1 range of the full property (whose name
	153	# has an X prefix instead of L1).
a02047bf KW	154	#
	155	# An initial & means to use the subroutine from this file instead of an
	156	# official inversion list.
3f427fd9	157
9d9177be KW	158	for my $prop (qw(
9d9177be KW	159	ASCII
dab0c3e7	160	L1Cased
3f427fd9 KW	161	VertSpace
	162	PerlSpace
	163	XPerlSpace
	164	PosixAlnum
	165	L1PosixAlnum
	166	PosixAlpha
	167	L1PosixAlpha
	168	PosixBlank
	169	XPosixBlank
	170	PosixCntrl
	171	XPosixCntrl
	172	PosixDigit
	173	PosixGraph
	174	L1PosixGraph
	175	PosixLower
	176	L1PosixLower
	177	PosixPrint
	178	L1PosixPrint
	179	PosixPunct
	180	L1PosixPunct
	181	PosixSpace
	182	XPosixSpace
	183	PosixUpper
	184	L1PosixUpper
	185	PosixWord
	186	L1PosixWord
	187	PosixXDigit
	188	XPosixXDigit
a02047bf KW	189	&NonL1_Perl_Non_Final_Folds
a02047bf KW	190	&_Perl_Multi_Char_Folds
892d8259	191	&UpperLatin1
9d9177be KW	192	)
	193	) {
	194
3f427fd9 KW	195	# For the Latin1 properties, we change to use the eXtended version of the
3f427fd9 KW	196	# base property, then go through the result and get rid of everything not
b4069bca KW	197	# in Latin1 (above 255). Actually, we retain the element for the range
	198	# that crosses the 255/256 boundary if it is one that matches the
	199	# property. For example, in the Word property, there is a range of code
	200	# points that start at U+00F8 and goes through U+02C1. Instead of
b6a6e956	201	# artificially cutting that off at 256 because 256 is the first code point
3f427fd9	202	# above Latin1, we let the range go to its natural ending. That gives us
b4069bca KW	203	# extra information with no added space taken. But if the range that
	204	# crosses the boundary is one that doesn't match the property, we don't
	205	# start a new range above 255, as that could be construed as going to
	206	# infinity. For example, the Upper property doesn't include the character
	207	# at 255, but does include the one at 256. We don't include the 256 one.
a02047bf KW	208	my $prop_name = $prop;
	209	my $is_local_sub = $prop_name =~ s/^&//;
	210	my $lookup_prop = $prop_name;
c4854dea KW	211	my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
	212	my $nonl1_only = 0;
	213	$nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
a02047bf KW	214
	215	my @invlist;
	216	if ($is_local_sub) {
	217	@invlist = eval $lookup_prop;
	218	}
	219	else {
	220	@invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
	221	}
ad89228c	222	die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9	223
c4854dea	224	if ($l1_only) {
3f427fd9 KW	225	for my $i (0 .. @invlist - 1 - 1) {
3f427fd9 KW	226	if ($invlist[$i] > 255) {
b4069bca KW	227
	228	# In an inversion list, even-numbered elements give the code
	229	# points that begin ranges that match the property;
	230	# odd-numbered give ones that begin ranges that don't match.
	231	# If $i is odd, we are at the first code point above 255 that
	232	# doesn't match, which means the range it is ending does
	233	# match, and crosses the 255/256 boundary. We want to include
	234	# this ending point, so increment $i, so the splice below
	235	# includes it. Conversely, if $i is even, it is the first
	236	# code point above 255 that matches, which means there was no
	237	# matching range that crossed the boundary, and we don't want
	238	# to include this code point, so splice before it.
	239	$i++ if $i % 2 != 0;
	240
	241	# Remove everything past this.
	242	splice @invlist, $i;
3f427fd9 KW	243	last;
	244	}
	245	}
	246	}
c4854dea KW	247	elsif ($nonl1_only) {
	248	my $found_nonl1 = 0;
	249	for my $i (0 .. @invlist - 1 - 1) {
	250	next if $invlist[$i] < 256;
	251
	252	# Here, we have the first element in the array that indicates an
	253	# element above Latin1. Get rid of all previous ones.
	254	splice @invlist, 0, $i;
	255
	256	# If this one's index is not divisible by 2, it means that this
	257	# element is inverting away from being in the list, which means
	258	# all code points from 256 to this one are in this list.
	259	unshift @invlist, 256 if $i % 2 != 0;
	260	$found_nonl1 = 1;
	261	last;
	262	}
	263	die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
	264	}
3f427fd9	265
a02047bf	266	output_invlist($prop_name, \@invlist);
9d9177be KW	267	}
	268
	269	read_only_bottom_close_and_rename($out_fh)