[perl5.git] / regen / mk_invlists.pl

#!perl -w
use 5.015;
use strict;
use warnings;
use Unicode::UCD qw(prop_invlist prop_invmap);
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';

# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
# light-weight fast wrapper to make them usable at run-time.

# As such, this code knows about the internal structure of these lists, and
# any change made to that has to be done here as well.  A random number stored
# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed.  Currently that
# random number is:
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

# Names of the inversion lists that are emitted outside of any
# '#ifndef PERL_IN_XSUB_RE' guard; every other list is wrapped in one by
# output_invlist().
my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );

# True while an '#ifndef PERL_IN_XSUB_RE' section has been opened in the
# output and not yet closed.
my $is_in_ifndef_ext_re = 0;

# Options handed to open_new() for the header it writes at the top of the
# generated file.
my %header_opts = (style => '*', by => $0, from => "Unicode::UCD");
my $out_fh = open_new('charclass_invlists.h', '>', \%header_opts);

print {$out_fh} "/* See the generating file for comments */\n\n";

sub end_ifndef_ext_re {

    # Close the '#ifndef PERL_IN_XSUB_RE' section of the generated file if one
    # is currently open, and record that it is closed.  Does nothing when no
    # such section is open.
    return unless $is_in_ifndef_ext_re;

    print {$out_fh} "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
    $is_in_ifndef_ext_re = 0;
}

sub output_invlist ($$;$) {

    # Output the inversion list $invlist using the name $name for it.
    # It is output in the exact internal form for inversion lists: a C array
    # 'static const UV ${name}_invlist[]' whose first three elements are a
    # header (element count, version/structure-type number, and the
    # zero-or-one flag) followed by the list body.
    #
    # Parameters:
    #   $name     base name of the generated C array
    #   $invlist  reference to a non-empty inversion list array; it is NOT
    #             modified
    #   $charset  optional name of the character set, used only in a comment
    #
    # Dies if $invlist is not a reference to a non-empty array.
    #
    # As a side effect, opens or closes the '#ifndef PERL_IN_XSUB_RE' section
    # of the output as needed, depending on whether $name is listed in
    # %include_in_ext_re.

    my $name = shift;
    my $invlist = shift;     # Reference to inversion list array
    my $charset = shift // "";  # name of character set for comment

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY'
                                             && @$invlist;

    # Work on a private copy, so that prepending the leading 0 below never
    # changes the array the caller handed us.
    my @elements = @$invlist;

    # Is the last element of the header 0, or 1 ?  The stored body always
    # begins with 0; the flag records whether that 0 was in the list passed
    # in (0) or had to be added here (1).
    my $zero_or_one = 0;
    if ($elements[0] != 0) {
        unshift @elements, 0;
        $zero_or_one = 1;
    }
    my $count = @elements;

    if ($is_in_ifndef_ext_re) {

        # A guarded section is open; lists named in %include_in_ext_re must
        # be emitted outside of it.
        end_ifndef_ext_re() if exists $include_in_ext_re{$name};
    }
    elsif (! exists $include_in_ext_re{$name}) {

        # No guarded section is open, and this list belongs inside one.
        print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n";
        $is_in_ifndef_ext_re = 1;
    }

    print $out_fh "\nstatic const UV ${name}_invlist[] = {";
    print $out_fh " /* for $charset */" if length $charset;
    print $out_fh "\n";

    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t   1 if it starts at the element beyond 0 */\n";

    # The main body is the UVs passed in to this routine.  The final element
    # is done separately, as it does not get a trailing comma.
    my $final = pop @elements;
    print $out_fh "\t$_,\n" for @elements;
    print $out_fh "\t$final\n";

    print $out_fh "};\n";
}

sub mk_invlist_from_cp_list {

    # Returns an inversion list constructed from the input array of code
    # points, which must be sorted in ascending numeric order.
    #
    # Parameter: a reference to the array of code points.
    # Returns:   a list of alternating range starts and one-past-range-ends;
    #            the empty list if the input is empty.
    #
    # A code point that repeats the previous one is ignored.  Dies if the
    # input turns out not to be sorted, since the result would otherwise not
    # be a valid inversion list.

    my $list_ref = shift;

    my @invlist;
    for my $cp (@$list_ref) {
        if (! @invlist || $cp > $invlist[-1]) {

            # First element, or a gap after the previous range: start a new
            # single-element range.
            push @invlist, $cp, $cp + 1;
        }
        elsif ($cp == $invlist[-1]) {

            # Adjacent to the previous range: extend that range by one.
            $invlist[-1]++;
        }
        elsif ($cp != $invlist[-1] - 1) {

            # Anything lower than the most recent code point means the input
            # was not sorted.
            die "Code point list is not sorted: $cp follows "
              . ($invlist[-1] - 1);
        }

        # Otherwise $cp duplicates the most recent code point, which the
        # current range already covers.
    }
    return @invlist;
}

# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al';

# Code points whose fold is a sequence of more than one character
my @has_multi_char_fold;

# Code points that appear anywhere but last in some multi-character fold, each
# listed once, in order of first appearance
my @is_non_final_fold;

# The code points already placed in @is_non_final_fold, so that membership
# can be checked without rescanning that array for every candidate
my %seen_non_final_fold;

for my $i (0 .. $#{$folds_ref}) {
    my $fold = $folds_ref->[$i];
    next unless ref $fold;   # Skip single-char folds
    push @has_multi_char_fold, $cp_ref->[$i];

    # Add to the non-finals list each code point that is in a non-final
    # position
    for my $j (0 .. $#{$fold} - 1) {
        my $cp = $fold->[$j];
        push @is_non_final_fold, $cp unless $seen_non_final_fold{$cp}++;
    }
}

sub _Perl_Non_Final_Folds {

    # Puts the file-level @is_non_final_fold array into ascending numeric
    # order (in place), and returns the inversion list built from it.
    my @ascending = sort { $a <=> $b } @is_non_final_fold;
    @is_non_final_fold = @ascending;

    return mk_invlist_from_cp_list(\@is_non_final_fold);
}

sub UpperLatin1 {

    # Returns the inversion list built from the code points 128 through 255
    # inclusive.
    my @upper_half = (128 .. 255);
    return mk_invlist_from_cp_list(\@upper_half);
}

# Two lists that are given literally here: the code points 0 through 255, and
# everything from 256 upwards.
output_invlist('Latin1',      [ 0, 256 ]);
output_invlist('AboveLatin1', [ 256 ]);

# Close the guarded section opened while emitting the lists above.
end_ifndef_ext_re();

# We construct lists for all the POSIX and backslash sequence character
# classes in two forms:
#   1) ones which match only in the ASCII range
#   2) ones which match either in the Latin1 range, or the entire Unicode range
#
# These get compiled in, and hence affect the memory footprint of every Perl
# program, even those not using Unicode.  To minimize the size, currently
# the Latin1 version is generated for the beyond ASCII range except for those
# lists that are quite small for the entire range, such as for \s, which is 22
# UVs long plus 4 UVs (currently) for the header.
#
# To save even more memory, the ASCII versions could be derived from the
# larger ones at runtime (at the expense of the machine instructions needed
# to do so), but these are all small anyway, so the total saving would only
# be about 100 UVs.
#
# In the list of properties below that get generated, the L1 prefix is a fake
# property that means just the Latin1 range of the full property (whose name
# has an X prefix instead of L1).
#
# An initial & means to use the subroutine from this file instead of an
# official inversion list.

# For each supported character set, emit one inversion list per property
# below, bracketed by that character set's conditional-compilation lines.
for my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # Translation table applied below to every code point under 256
    # NOTE(review): presumably maps ASCII/Latin1 code points to their
    # equivalents in $charset -- confirm against charset_translations.pl
    my @a2n = get_a2n($charset);
    for my $prop (qw(
                    ASCII
                    Cased
                    VertSpace
                    XPerlSpace
                    XPosixAlnum
                    XPosixAlpha
                    XPosixBlank
                    XPosixCntrl
                    XPosixDigit
                    XPosixGraph
                    XPosixLower
                    XPosixPrint
                    XPosixPunct
                    XPosixSpace
                    XPosixUpper
                    XPosixWord
                    XPosixXDigit
                    _Perl_Any_Folds
                    &NonL1_Perl_Non_Final_Folds
                    _Perl_Folds_To_Multi_Char
                    &UpperLatin1
                    _Perl_IDStart
                    _Perl_IDCont
        )
    ) {

        # For the Latin1 properties, we change to use the eXtended version of the
        # base property, then go through the result and get rid of everything not
        # in Latin1 (above 255).  Actually, we retain the element for the range
        # that crosses the 255/256 boundary if it is one that matches the
        # property.  For example, in the Word property, there is a range of code
        # points that start at U+00F8 and goes through U+02C1.  Instead of
        # artificially cutting that off at 256 because 256 is the first code point
        # above Latin1, we let the range go to its natural ending.  That gives us
        # extra information with no added space taken.  But if the range that
        # crosses the boundary is one that doesn't match the property, we don't
        # start a new range above 255, as that could be construed as going to
        # infinity.  For example, the Upper property doesn't include the character
        # at 255, but does include the one at 256.  We don't include the 256 one.

        # $prop_name is the name used for the generated C array; $lookup_prop
        # is what actually gets looked up, after the '&', 'L1' and 'NonL1'
        # markers have been stripped or translated.
        my $prop_name = $prop;
        my $is_local_sub = $prop_name =~ s/^&//;
        my $lookup_prop = $prop_name;
        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
                       or $lookup_prop =~ s/^L1//);
        my $nonl1_only = 0;
        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

        my @invlist;
        if ($is_local_sub) {

            # Find the generating subroutine by name.  This avoids a string
            # eval, and lets any error raised inside the subroutine be
            # reported as is instead of being silently discarded.
            my $generator = __PACKAGE__->can($lookup_prop)
                or die "Could not find subroutine '$lookup_prop' for '$prop'";
            @invlist = $generator->();
        }
        else {
            @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
        }
        die "Could not find inversion list for '$lookup_prop'" unless @invlist;

        # Expand the inversion list into the full list of code points it
        # matches, translating those below 256 through @a2n, and then build
        # a new inversion list from the re-sorted result.
        my @full_list;
        for (my $i = 0; $i < @invlist; $i += 2) {
            my $upper = ($i + 1) < @invlist
                        ? $invlist[$i+1] - 1      # In range
                        : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
                                                # to stop much much earlier;
                                                # going this high may expose
                                                # perl deficiencies with very
                                                # large numbers.
            for my $j ($invlist[$i] .. $upper) {
                push @full_list, ($j < 256 ? $a2n[$j] : $j);
            }
        }
        @full_list = sort { $a <=> $b } @full_list;
        @invlist = mk_invlist_from_cp_list(\@full_list);

        if ($l1_only) {
            for my $i (0 .. $#invlist) {
                next unless $invlist[$i] > 255;

                # In an inversion list, even-numbered elements give the code
                # points that begin ranges that match the property;
                # odd-numbered give ones that begin ranges that don't match.
                # If $i is odd, we are at the first code point above 255 that
                # doesn't match, which means the range it is ending does
                # match, and crosses the 255/256 boundary.  We want to include
                # this ending point, so cut just after it.  Conversely, if $i
                # is even, it is the first code point above 255 that matches,
                # which means there was no matching range that crossed the
                # boundary, and we don't want to include this code point, so
                # cut before it.
                my $cut = $i % 2 != 0 ? $i + 1 : $i;

                # Remove everything past this.
                splice @invlist, $cut;
                last;
            }
        }
        elsif ($nonl1_only) {
            my $found_nonl1 = 0;

            # Every element must be examined, including the final one: when a
            # matching range straddles the 255/256 boundary and is the last
            # range in the list, its ending element is the only one that is
            # at or above 256.
            for my $i (0 .. $#invlist) {
                next if $invlist[$i] < 256;

                # Here, we have the first element in the array that indicates an
                # element above Latin1.  Get rid of all previous ones.
                splice @invlist, 0, $i;

                # If this one's index is not divisible by 2, it means that this
                # element is inverting away from being in the list, which means
                # all code points from 256 to this one are in this list.
                unshift @invlist, 256 if $i % 2 != 0;
                $found_nonl1 = 1;
                last;
            }
            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
        }

        output_invlist($prop_name, \@invlist, $charset);
    }
    end_ifndef_ext_re();
    print $out_fh "\n" . get_conditional_compile_line_end();
}

read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
9d9177be KW	1	#!perl -w
	2	use 5.015;
	3	use strict;
	4	use warnings;
a02047bf	5	use Unicode::UCD qw(prop_invlist prop_invmap);
9d9177be	6	require 'regen/regen_lib.pl';
0c4ecf42	7	require 'regen/charset_translations.pl';
9d9177be KW	8
	9	# This program outputs charclass_invlists.h, which contains various inversion
	10	# lists in the form of C arrays that are to be used as-is for inversion lists.
	11	# Thus, the lists it contains are essentially pre-compiled, and need only a
	12	# light-weight fast wrapper to make them usable at run-time.
	13
	14	# As such, this code knows about the internal structure of these lists, and
	15	# any change made to that has to be done here as well. A random number stored
	16	# in the headers is used to minimize the possibility of things getting
	17	# out-of-sync, or the wrong data structure being passed. Currently that
	18	# random number is:
0a07b44b	19	my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
9d9177be KW	20
	21	my $out_fh = open_new('charclass_invlists.h', '>',
	22	{style => '*', by => $0,
	23	from => "Unicode::UCD"});
	24
43b443dd KW	25	my $is_in_ifndef_ext_re = 0;
43b443dd KW	26
9d9177be KW	27	print $out_fh "/* See the generating file for comments */\n\n";
9d9177be KW	28
015bb97c CB	29	my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );
015bb97c CB	30
43b443dd KW	31	sub end_ifndef_ext_re {
	32	if ($is_in_ifndef_ext_re) {
	33	print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
	34	$is_in_ifndef_ext_re = 0;
	35	}
	36	}
	37
0c4ecf42	38	sub output_invlist ($$;$) {
9d9177be KW	39	my $name = shift;
9d9177be KW	40	my $invlist = shift; # Reference to inversion list array
0c4ecf42	41	my $charset = shift // ""; # name of character set for comment
9d9177be	42
76d3994c KW	43	die "No inversion list for $name" unless defined $invlist
	44	&& ref $invlist eq 'ARRAY'
	45	&& @$invlist;
	46
9d9177be KW	47	# Output the inversion list $invlist using the name $name for it.
	48	# It is output in the exact internal form for inversion lists.
	49
a0316a6c KW	50	# Is the last element of the header 0, or 1 ?
a0316a6c KW	51	my $zero_or_one = 0;
a0316a6c KW	52	if ($invlist->[0] != 0) {
a0316a6c KW	53	unshift @$invlist, 0;
9d9177be KW	54	$zero_or_one = 1;
9d9177be KW	55	}
0a07b44b	56	my $count = @$invlist;
9d9177be	57
43b443dd KW	58	if ($is_in_ifndef_ext_re) {
	59	if (exists $include_in_ext_re{$name}) {
	60	end_ifndef_ext_re;
	61	}
	62	}
	63	elsif (! exists $include_in_ext_re{$name}) {
	64	print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
	65	$is_in_ifndef_ext_re = 1;
	66	}
	67
0c4ecf42 KW	68	print $out_fh "\nstatic const UV ${name}_invlist[] = {";
	69	print $out_fh " /* for $charset */" if $charset;
	70	print $out_fh "\n";
9d9177be	71
a0316a6c	72	print $out_fh "\t$count,\t/* Number of elements */\n";
9d9177be KW	73	print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
9d9177be KW	74	print $out_fh "\t", $zero_or_one,
a0316a6c KW	75	",\t/* 0 if the list starts at 0;",
a0316a6c KW	76	"\n\t\t 1 if it starts at the element beyond 0 */\n";
9d9177be KW	77
	78	# The main body are the UVs passed in to this routine. Do the final
	79	# element separately
	80	for my $i (0 .. @$invlist - 1 - 1) {
	81	print $out_fh "\t$invlist->[$i],\n";
	82	}
	83
	84	# The final element does not have a trailing comma, as C can't handle it.
	85	print $out_fh "\t$invlist->[-1]\n";
	86
	87	print $out_fh "};\n";
	88	}
	89
a02047bf KW	90	sub mk_invlist_from_cp_list {
	91
	92	# Returns an inversion list constructed from the sorted input array of
	93	# code points
	94
	95	my $list_ref = shift;
	96
	97	# Initialize to just the first element
	98	my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);
	99
	100	# For each succeeding element, if it extends the previous range, adjust
	101	# up, otherwise add it.
	102	for my $i (1 .. @$list_ref - 1) {
	103	if ($invlist[-1] == $list_ref->[$i]) {
	104	$invlist[-1]++;
	105	}
	106	else {
	107	push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
	108	}
	109	}
	110	return @invlist;
	111	}
	112
	113	# Read in the Case Folding rules, and construct arrays of code points for the
	114	# properties we need.
	115	my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
	116	die "Could not find inversion map for Case_Folding" unless defined $format;
	117	die "Incorrect format '$format' for Case_Folding inversion map"
	118	unless $format eq 'al';
	119	my @has_multi_char_fold;
	120	my @is_non_final_fold;
	121
	122	for my $i (0 .. @$folds_ref - 1) {
	123	next unless ref $folds_ref->[$i]; # Skip single-char folds
	124	push @has_multi_char_fold, $cp_ref->[$i];
	125
b6a6e956	126	# Add to the non-finals list each code point that is in a non-final
a02047bf KW	127	# position
	128	for my $j (0 .. @{$folds_ref->[$i]} - 2) {
	129	push @is_non_final_fold, $folds_ref->[$i][$j]
	130	unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
	131	}
	132	}
	133
a02047bf KW	134	sub _Perl_Non_Final_Folds {
	135	@is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
	136	return mk_invlist_from_cp_list(\@is_non_final_fold);
	137	}
	138
892d8259	139	sub UpperLatin1 {
0c4ecf42	140	return mk_invlist_from_cp_list([ 128 .. 255 ]);
892d8259 KW	141	}
892d8259 KW	142
9d9177be KW	143	output_invlist("Latin1", [ 0, 256 ]);
	144	output_invlist("AboveLatin1", [ 256 ]);
	145
43b443dd KW	146	end_ifndef_ext_re;
43b443dd KW	147
3f427fd9 KW	148	# We construct lists for all the POSIX and backslash sequence character
	149	# classes in two forms:
	150	# 1) ones which match only in the ASCII range
	151	# 2) ones which match either in the Latin1 range, or the entire Unicode range
	152	#
	153	# These get compiled in, and hence affect the memory footprint of every Perl
	154	# program, even those not using Unicode. To minimize the size, currently
	155	# the Latin1 version is generated for the beyond ASCII range except for those
	156	# lists that are quite small for the entire range, such as for \s, which is 22
	157	# UVs long plus 4 UVs (currently) for the header.
	158	#
	159	# To save even more memory, the ASCII versions could be derived from the
	160	# larger ones at runtime, saving some memory (minus the expense of the machine
	161	# instructions to do so), but these are all small anyway, so their total is
	162	# about 100 UVs.
	163	#
	164	# In the list of properties below that get generated, the L1 prefix is a fake
	165	# property that means just the Latin1 range of the full property (whose name
	166	# has an X prefix instead of L1).
a02047bf KW	167	#
	168	# An initial & means to use the subroutine from this file instead of an
	169	# official inversion list.
3f427fd9	170
0c4ecf42 KW	171	for my $charset (get_supported_code_pages()) {
	172	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	173
	174	my @a2n = get_a2n($charset);
0f5e3c71 KW	175	for my $prop (qw(
	176	ASCII
	177	Cased
	178	VertSpace
	179	XPerlSpace
	180	XPosixAlnum
	181	XPosixAlpha
	182	XPosixBlank
	183	XPosixCntrl
	184	XPosixDigit
	185	XPosixGraph
	186	XPosixLower
	187	XPosixPrint
	188	XPosixPunct
	189	XPosixSpace
	190	XPosixUpper
	191	XPosixWord
	192	XPosixXDigit
	193	_Perl_Any_Folds
	194	&NonL1_Perl_Non_Final_Folds
	195	_Perl_Folds_To_Multi_Char
	196	&UpperLatin1
	197	_Perl_IDStart
	198	_Perl_IDCont
	199	)
	200	) {
	201
	202	# For the Latin1 properties, we change to use the eXtended version of the
	203	# base property, then go through the result and get rid of everything not
	204	# in Latin1 (above 255). Actually, we retain the element for the range
	205	# that crosses the 255/256 boundary if it is one that matches the
	206	# property. For example, in the Word property, there is a range of code
	207	# points that start at U+00F8 and goes through U+02C1. Instead of
	208	# artificially cutting that off at 256 because 256 is the first code point
	209	# above Latin1, we let the range go to its natural ending. That gives us
	210	# extra information with no added space taken. But if the range that
	211	# crosses the boundary is one that doesn't match the property, we don't
	212	# start a new range above 255, as that could be construed as going to
	213	# infinity. For example, the Upper property doesn't include the character
	214	# at 255, but does include the one at 256. We don't include the 256 one.
	215	my $prop_name = $prop;
	216	my $is_local_sub = $prop_name =~ s/^&//;
	217	my $lookup_prop = $prop_name;
	218	my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
	219	or $lookup_prop =~ s/^L1//);
	220	my $nonl1_only = 0;
	221	$nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
	222
	223	my @invlist;
	224	if ($is_local_sub) {
	225	@invlist = eval $lookup_prop;
	226	}
	227	else {
	228	@invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
	229	}
	230	die "Could not find inversion list for '$lookup_prop'" unless @invlist;
	231	my @full_list;
	232	for (my $i = 0; $i < @invlist; $i += 2) {
	233	my $upper = ($i + 1) < @invlist
	234	? $invlist[$i+1] - 1 # In range
	235	: $Unicode::UCD::MAX_CP; # To infinity. You may want
	236	# to stop much much earlier;
	237	# going this high may expose
	238	# perl deficiencies with very
239	# large numbers.
240	for my $j ($invlist[$i] .. $upper) {
241	if ($j < 256) {
242	push @full_list, $a2n[$j];
243	}
244	else {
245	push @full_list, $j;
246	}
0c4ecf42	247	}
0f5e3c71 KW	248	}
	249	@full_list = sort { $a <=> $b } @full_list;
	250	@invlist = mk_invlist_from_cp_list(\@full_list);
	251
	252	if ($l1_only) {
	253	for my $i (0 .. @invlist - 1 - 1) {
	254	if ($invlist[$i] > 255) {
	255
	256	# In an inversion list, even-numbered elements give the code
	257	# points that begin ranges that match the property;
	258	# odd-numbered give ones that begin ranges that don't match.
	259	# If $i is odd, we are at the first code point above 255 that
	260	# doesn't match, which means the range it is ending does
	261	# match, and crosses the 255/256 boundary. We want to include
	262	# this ending point, so increment $i, so the splice below
	263	# includes it. Conversely, if $i is even, it is the first
	264	# code point above 255 that matches, which means there was no
	265	# matching range that crossed the boundary, and we don't want
	266	# to include this code point, so splice before it.
	267	$i++ if $i % 2 != 0;
	268
	269	# Remove everything past this.
	270	splice @invlist, $i;
	271	last;
	272	}
0c4ecf42 KW	273	}
0c4ecf42 KW	274	}
0f5e3c71 KW	275	elsif ($nonl1_only) {
	276	my $found_nonl1 = 0;
	277	for my $i (0 .. @invlist - 1 - 1) {
	278	next if $invlist[$i] < 256;
	279
	280	# Here, we have the first element in the array that indicates an
	281	# element above Latin1. Get rid of all previous ones.
	282	splice @invlist, 0, $i;
	283
	284	# If this one's index is not divisible by 2, it means that this
	285	# element is inverting away from being in the list, which means
	286	# all code points from 256 to this one are in this list.
	287	unshift @invlist, 256 if $i % 2 != 0;
	288	$found_nonl1 = 1;
3f427fd9 KW	289	last;
3f427fd9 KW	290	}
0f5e3c71	291	die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
3f427fd9	292	}
3f427fd9	293
0f5e3c71 KW	294	output_invlist($prop_name, \@invlist, $charset);
0f5e3c71 KW	295	}
43b443dd	296	end_ifndef_ext_re;
0c4ecf42	297	print $out_fh "\n" . get_conditional_compile_line_end();
9d9177be KW	298	}
	299
	300	read_only_bottom_close_and_rename($out_fh)