[perl5.git] / regen / mk_invlists.pl

#!perl -w
use 5.015;
use strict;
use warnings;
use Unicode::UCD "prop_invlist";
require 'regen/regen_lib.pl';

# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
# light-weight fast wrapper to make them usable at run-time.

# As such, this code knows about the internal structure of these lists, and
# any change made to that has to be done here as well.  A random number stored
# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed.  Currently that
# random number is:
my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;

# Open the generated header for writing.  open_new() comes from
# regen/regen_lib.pl; the options hash is passed straight through to it
# (presumably to build the "generated by" banner -- see that file to confirm).
my $out_fh = open_new(
    'charclass_invlists.h',
    '>',
    {
        style => '*',
        by    => $0,
        from  => "Unicode::UCD",
    },
);

# The generated file carries no explanations of its own; send readers here.
print {$out_fh} "/* See the generating file for comments */\n\n";

# Write one inversion list to the generated header as a C array of UVs, in
# the exact internal layout used for inversion lists.
#
# Parameters:
#   $name    - base name for the C array, which is declared as
#              "UV ${name}_invlist[]".
#   $invlist - reference to a non-empty array containing the inversion list.
#              The referenced array is not modified.
#
# Dies if $invlist is undefined, is not an ARRAY reference, or is empty.
#
# Output goes to the file-level handle $out_fh, and the header uses the
# file-level $VERSION_DATA_STRUCTURE_TYPE.
sub output_invlist ($$) {
    my $name = shift;
    my $invlist = shift;     # Reference to inversion list array

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY'
                                             && @$invlist;

    # Work on a private copy: the header/body adjustment below would
    # otherwise silently rewrite the caller's array.
    my @body = @$invlist;

    my $zero_or_one;    # Is the last element of the header 0, or 1 ?

    # If the first element is 0, it goes in the header, instead of the body
    if ($body[0] == 0) {
        shift @body;

        $zero_or_one = 0;

        # Add a dummy 0 at the end so that the length is constant.  Inversion
        # lists are always stored with enough room so that if they change from
        # beginning with 0, they don't have to grow.
        push @body, 0;
    }
    else {
        $zero_or_one = 1;
    }

    print {$out_fh} "\nUV ${name}_invlist[] = {\n";

    # The four header words.
    print {$out_fh} "\t", scalar @body, ",\t/* Number of elements */\n";
    print {$out_fh} "\t0,\t/* Current iteration position */\n";
    print {$out_fh} "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print {$out_fh} "\t", $zero_or_one,
                    ",\t/* 0 if this is the first element of the list proper;",
                    "\n\t\t   1 if the next element is the first */\n";

    # The main body is the UVs themselves, one per line.  Joining with ",\n"
    # leaves the final element without a trailing comma, as C can't handle it.
    print {$out_fh} join(",\n", map { "\t$_" } @body), "\n";

    print {$out_fh} "};\n";
}

# Two hand-written lists that need no Unicode property lookup: "Latin1"
# matches from code point 0 up to (not including) 256, and "AboveLatin1"
# matches from 256 onwards.
for my $fixed ([ "Latin1", [ 0, 256 ] ], [ "AboveLatin1", [ 256 ] ]) {
    my ($list_name, $ranges) = @$fixed;
    output_invlist($list_name, $ranges);
}

# We construct lists for all the POSIX and backslash sequence character
# classes in two forms:
#   1) ones which match only in the ASCII range
#   2) ones which match either in the Latin1 range, or the entire Unicode range
#
# These get compiled in, and hence affect the memory footprint of every Perl
# program, even those not using Unicode.  To minimize the size, currently
# the Latin1 version is generated for the beyond ASCII range except for those
# lists that are quite small for the entire range, such as for \s, which is 22
# UVs long plus 4 UVs (currently) for the header.
#
# To save even more memory, the ASCII versions could be derived from the
# larger ones at runtime, saving some memory (minus the expense of the machine
# instructions to do so), but these are all small anyway, so their total is
# about 100 UVs.
#
# In the list of properties below that get generated, the L1 prefix is a fake
# property that means just the Latin1 range of the full property (whose name
# has an X prefix instead of L1).

# Generate one C array per property named below.  A name is looked up with
# prop_invlist() after any fake "L1"/"NonL1" prefix has been translated away;
# the resulting list is then trimmed to the requested range and written out
# under the original (prefixed) name.
for my $prop (qw(
                ASCII
                L1Cased
                VertSpace
                PerlSpace
                    XPerlSpace
                PosixAlnum
                    L1PosixAlnum
                PosixAlpha
                    L1PosixAlpha
                PosixBlank
                    XPosixBlank
                PosixCntrl
                    XPosixCntrl
                PosixDigit
                PosixGraph
                    L1PosixGraph
                PosixLower
                    L1PosixLower
                PosixPrint
                    L1PosixPrint
                PosixPunct
                    L1PosixPunct
                PosixSpace
                    XPosixSpace
                PosixUpper
                    L1PosixUpper
                PosixWord
                    L1PosixWord
                PosixXDigit
                    XPosixXDigit
                NonL1_Perl_Non_Final_Folds
    )
) {

    # For the Latin1 properties, we change to use the eXtended version of the
    # base property, then go through the result and get rid of everything not
    # in Latin1 (above 255).  Actually, we retain the element for the range
    # that crosses the 255/256 boundary if it is one that matches the
    # property.  For example, in the Word property, there is a range of code
    # points that starts at U+00F8 and goes through U+02C1.  Instead of
    # artificially cutting that off at 256 because 256 is the first code point
    # above Latin1, we let the range go to its natural ending.  That gives us
    # extra information with no added space taken.  But if the range that
    # crosses the boundary is one that doesn't match the property, we don't
    # start a new range above 255, as that could be construed as going to
    # infinity.  For example, the Upper property doesn't include the character
    # at 255, but does include the one at 256.  We don't include the 256 one.
    my $lookup_prop = $prop;

    # "L1PosixFoo" is looked up as "XPosixFoo"; any other "L1Foo" as "Foo".
    my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);

    # "NonL1Foo" is looked up as "Foo", keeping only what lies above Latin1.
    my $nonl1_only = 0;
    $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

    my @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
    die "Could not find inversion list for '$lookup_prop'" unless @invlist;

    if ($l1_only) {

        # Every element must be examined, including the final one: a last
        # element above 255 that begins a matching range would otherwise be
        # left in place and describe a range running from there to infinity.
        for my $i (0 .. $#invlist) {
            next if $invlist[$i] <= 255;

            # In an inversion list, even-numbered elements give the code
            # points that begin ranges that match the property;
            # odd-numbered give ones that begin ranges that don't match.
            # If $i is odd, we are at the first code point above 255 that
            # doesn't match, which means the range it is ending does
            # match, and crosses the 255/256 boundary.  We want to include
            # this ending point, so keep one element more.  Conversely, if
            # $i is even, it is the first code point above 255 that
            # matches, which means there was no matching range that crossed
            # the boundary, and we don't want to include this code point, so
            # cut before it.
            my $keep_count = ($i % 2 != 0) ? $i + 1 : $i;

            # Remove everything past this (nothing to do if that is
            # already the whole list).
            splice @invlist, $keep_count if $keep_count < @invlist;
            last;
        }
    }
    elsif ($nonl1_only) {
        my $found_nonl1 = 0;

        # Scan through the final element as well; otherwise a list whose
        # only element above Latin1 is its last one would wrongly be
        # reported as having no non-Latin1 code points.
        for my $i (0 .. $#invlist) {
            next if $invlist[$i] < 256;

            # Here, we have the first element in the array that indicates an
            # element above Latin1.  Get rid of all previous ones.
            splice @invlist, 0, $i;

            # If this one's index is not divisible by 2, it means that this
            # element is inverting away from being in the list, which means
            # all code points from 256 to this one are in this list.
            unshift @invlist, 256 if $i % 2 != 0;
            $found_nonl1 = 1;
            last;
        }
        die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
    }

    # The array is named after the requested property, prefix included.
    output_invlist($prop, \@invlist);
}

# Hand the finished file back to the regen library (defined in
# regen/regen_lib.pl) to be finalized.
read_only_bottom_close_and_rename($out_fh);
Commit	Line	Data
9d9177be KW	1	#!perl -w
	2	use 5.015;
	3	use strict;
	4	use warnings;
	5	use Unicode::UCD "prop_invlist";
	6	require 'regen/regen_lib.pl';
	7
	8	# This program outputs charclass_invlists.h, which contains various inversion
	9	# lists in the form of C arrays that are to be used as-is for inversion lists.
	10	# Thus, the lists it contains are essentially pre-compiled, and need only a
	11	# light-weight fast wrapper to make them usable at run-time.
	12
	13	# As such, this code knows about the internal structure of these lists, and
	14	# any change made to that has to be done here as well. A random number stored
	15	# in the headers is used to minimize the possibility of things getting
	16	# out-of-sync, or the wrong data structure being passed. Currently that
	17	# random number is:
	18	my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;
	19
	20	my $out_fh = open_new('charclass_invlists.h', '>',
	21	{style => '*', by => $0,
	22	from => "Unicode::UCD"});
	23
	24	print $out_fh "/* See the generating file for comments */\n\n";
	25
	26	sub output_invlist ($$) {
	27	my $name = shift;
	28	my $invlist = shift; # Reference to inversion list array
	29
76d3994c KW	30	die "No inversion list for $name" unless defined $invlist
	31	&& ref $invlist eq 'ARRAY'
	32	&& @$invlist;
	33
9d9177be KW	34	# Output the inversion list $invlist using the name $name for it.
	35	# It is output in the exact internal form for inversion lists.
	36
	37	my $zero_or_one; # Is the last element of the header 0, or 1 ?
	38
	39	# If the first element is 0, it goes in the header, instead of the body
	40	if ($invlist->[0] == 0) {
	41	shift @$invlist;
	42
	43	$zero_or_one = 0;
	44
	45	# Add a dummy 0 at the end so that the length is constant. inversion
	46	# lists are always stored with enough room so that if they change from
	47	# beginning with 0, they don't have to grow.
	48	push @$invlist, 0;
	49	}
	50	else {
	51	$zero_or_one = 1;
	52	}
	53
	54	print $out_fh "\nUV ${name}_invlist[] = {\n";
	55
	56	print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
	57	print $out_fh "\t0,\t/* Current iteration position */\n";
	58	print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
	59	print $out_fh "\t", $zero_or_one,
	60	",\t/* 0 if this is the first element of the list proper;",
	61	"\n\t\t 1 if the next element is the first */\n";
	62
	63	# The main body are the UVs passed in to this routine. Do the final
	64	# element separately
	65	for my $i (0 .. @$invlist - 1 - 1) {
	66	print $out_fh "\t$invlist->[$i],\n";
	67	}
	68
	69	# The final element does not have a trailing comma, as C can't handle it.
	70	print $out_fh "\t$invlist->[-1]\n";
	71
	72	print $out_fh "};\n";
	73	}
	74
	75	output_invlist("Latin1", [ 0, 256 ]);
	76	output_invlist("AboveLatin1", [ 256 ]);
	77
3f427fd9 KW	78	# We construct lists for all the POSIX and backslash sequence character
	79	# classes in two forms:
	80	# 1) ones which match only in the ASCII range
	81	# 2) ones which match either in the Latin1 range, or the entire Unicode range
	82	#
	83	# These get compiled in, and hence affect the memory footprint of every Perl
	84	# program, even those not using Unicode. To minimize the size, currently
	85	# the Latin1 version is generated for the beyond ASCII range except for those
	86	# lists that are quite small for the entire range, such as for \s, which is 22
	87	# UVs long plus 4 UVs (currently) for the header.
	88	#
	89	# To save even more memory, the ASCII versions could be derived from the
	90	# larger ones at runtime, saving some memory (minus the expense of the machine
	91	# instructions to do so), but these are all small anyway, so their total is
	92	# about 100 UVs.
	93	#
	94	# In the list of properties below that get generated, the L1 prefix is a fake
	95	# property that means just the Latin1 range of the full property (whose name
	96	# has an X prefix instead of L1).
	97
9d9177be KW	98	for my $prop (qw(
9d9177be KW	99	ASCII
dab0c3e7	100	L1Cased
3f427fd9 KW	101	VertSpace
	102	PerlSpace
	103	XPerlSpace
	104	PosixAlnum
	105	L1PosixAlnum
	106	PosixAlpha
	107	L1PosixAlpha
	108	PosixBlank
	109	XPosixBlank
	110	PosixCntrl
	111	XPosixCntrl
	112	PosixDigit
	113	PosixGraph
	114	L1PosixGraph
	115	PosixLower
	116	L1PosixLower
	117	PosixPrint
	118	L1PosixPrint
	119	PosixPunct
	120	L1PosixPunct
	121	PosixSpace
	122	XPosixSpace
	123	PosixUpper
	124	L1PosixUpper
	125	PosixWord
	126	L1PosixWord
	127	PosixXDigit
	128	XPosixXDigit
b72a36d4	129	NonL1_Perl_Non_Final_Folds
9d9177be KW	130	)
	131	) {
	132
3f427fd9 KW	133	# For the Latin1 properties, we change to use the eXtended version of the
3f427fd9 KW	134	# base property, then go through the result and get rid of everything not
b4069bca KW	135	# in Latin1 (above 255). Actually, we retain the element for the range
	136	# that crosses the 255/256 boundary if it is one that matches the
	137	# property. For example, in the Word property, there is a range of code
	138	# points that start at U+00F8 and goes through U+02C1. Instead of
3f427fd9 KW	139	# artifically cutting that off at 256 because 256 is the first code point
3f427fd9 KW	140	# above Latin1, we let the range go to its natural ending. That gives us
b4069bca KW	141	# extra information with no added space taken. But if the range that
	142	# crosses the boundary is one that doesn't match the property, we don't
	143	# start a new range above 255, as that could be construed as going to
	144	# infinity. For example, the Upper property doesn't include the character
	145	# at 255, but does include the one at 256. We don't include the 256 one.
dab0c3e7	146	my $lookup_prop = $prop;
c4854dea KW	147	my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//);
	148	my $nonl1_only = 0;
	149	$nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
	150	my @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
ad89228c	151	die "Could not find inversion list for '$lookup_prop'" unless @invlist;
3f427fd9	152
c4854dea	153	if ($l1_only) {
3f427fd9 KW	154	for my $i (0 .. @invlist - 1 - 1) {
3f427fd9 KW	155	if ($invlist[$i] > 255) {
b4069bca KW	156
	157	# In an inversion list, even-numbered elements give the code
	158	# points that begin ranges that match the property;
	159	# odd-numbered give ones that begin ranges that don't match.
	160	# If $i is odd, we are at the first code point above 255 that
	161	# doesn't match, which means the range it is ending does
	162	# match, and crosses the 255/256 boundary. We want to include
	163	# this ending point, so increment $i, so the splice below
	164	# includes it. Conversely, if $i is even, it is the first
	165	# code point above 255 that matches, which means there was no
	166	# matching range that crossed the boundary, and we don't want
	167	# to include this code point, so splice before it.
	168	$i++ if $i % 2 != 0;
	169
	170	# Remove everything past this.
	171	splice @invlist, $i;
3f427fd9 KW	172	last;
	173	}
	174	}
	175	}
c4854dea KW	176	elsif ($nonl1_only) {
	177	my $found_nonl1 = 0;
	178	for my $i (0 .. @invlist - 1 - 1) {
	179	next if $invlist[$i] < 256;
	180
	181	# Here, we have the first element in the array that indicates an
	182	# element above Latin1. Get rid of all previous ones.
	183	splice @invlist, 0, $i;
	184
	185	# If this one's index is not divisible by 2, it means that this
	186	# element is inverting away from being in the list, which means
	187	# all code points from 256 to this one are in this list.
	188	unshift @invlist, 256 if $i % 2 != 0;
	189	$found_nonl1 = 1;
	190	last;
	191	}
	192	die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
	193	}
3f427fd9	194
9d9177be KW	195	output_invlist($prop, \@invlist);
	196	}
	197
	198	read_only_bottom_close_and_rename($out_fh)