This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Patch [perl #111400] [:upper:] broken for above Latin1
[perl5.git] / regen / mk_invlists.pl
CommitLineData
9d9177be
KW
1#!perl -w
2use 5.015;
3use strict;
4use warnings;
5use Unicode::UCD "prop_invlist";
6require 'regen/regen_lib.pl';
7
8# This program outputs charclass_invlists.h, which contains various inversion
9# lists in the form of C arrays that are to be used as-is for inversion lists.
10# Thus, the lists it contains are essentially pre-compiled, and need only a
11# light-weight fast wrapper to make them usable at run-time.
12
13# As such, this code knows about the internal structure of these lists, and
14# any change made to that has to be done here as well. A random number stored
15# in the headers is used to minimize the possibility of things getting
16# out-of-sync, or the wrong data structure being passed. Currently that
17# random number is:
18my $VERSION_DATA_STRUCTURE_TYPE = 1064334010;
19
20my $out_fh = open_new('charclass_invlists.h', '>',
21 {style => '*', by => $0,
22 from => "Unicode::UCD"});
23
24print $out_fh "/* See the generating file for comments */\n\n";
25
26sub output_invlist ($$) {
27 my $name = shift;
28 my $invlist = shift; # Reference to inversion list array
29
30 # Output the inversion list $invlist using the name $name for it.
31 # It is output in the exact internal form for inversion lists.
32
33 my $zero_or_one; # Is the last element of the header 0, or 1 ?
34
35 # If the first element is 0, it goes in the header, instead of the body
36 if ($invlist->[0] == 0) {
37 shift @$invlist;
38
39 $zero_or_one = 0;
40
41 # Add a dummy 0 at the end so that the length is constant. inversion
42 # lists are always stored with enough room so that if they change from
43 # beginning with 0, they don't have to grow.
44 push @$invlist, 0;
45 }
46 else {
47 $zero_or_one = 1;
48 }
49
50 print $out_fh "\nUV ${name}_invlist[] = {\n";
51
52 print $out_fh "\t", scalar @$invlist, ",\t/* Number of elements */\n";
53 print $out_fh "\t0,\t/* Current iteration position */\n";
54 print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
55 print $out_fh "\t", $zero_or_one,
56 ",\t/* 0 if this is the first element of the list proper;",
57 "\n\t\t 1 if the next element is the first */\n";
58
59 # The main body are the UVs passed in to this routine. Do the final
60 # element separately
61 for my $i (0 .. @$invlist - 1 - 1) {
62 print $out_fh "\t$invlist->[$i],\n";
63 }
64
65 # The final element does not have a trailing comma, as C can't handle it.
66 print $out_fh "\t$invlist->[-1]\n";
67
68 print $out_fh "};\n";
69}
70
71output_invlist("Latin1", [ 0, 256 ]);
72output_invlist("AboveLatin1", [ 256 ]);
73
3f427fd9
KW
74# We construct lists for all the POSIX and backslash sequence character
75# classes in two forms:
76# 1) ones which match only in the ASCII range
77# 2) ones which match either in the Latin1 range, or the entire Unicode range
78#
79# These get compiled in, and hence affect the memory footprint of every Perl
80# program, even those not using Unicode. To minimize the size, currently
81# the Latin1 version is generated for the beyond ASCII range except for those
82# lists that are quite small for the entire range, such as for \s, which is 22
83# UVs long plus 4 UVs (currently) for the header.
84#
85# To save even more memory, the ASCII versions could be derived from the
86# larger ones at runtime, saving some memory (minus the expense of the machine
87# instructions to do so), but these are all small anyway, so their total is
88# about 100 UVs.
89#
90# In the list of properties below that get generated, the L1 prefix is a fake
91# property that means just the Latin1 range of the full property (whose name
92# has an X prefix instead of L1).
93
9d9177be
KW
94for my $prop (qw(
95 ASCII
dab0c3e7 96 L1Cased
3f427fd9
KW
97 VertSpace
98 PerlSpace
99 XPerlSpace
100 PosixAlnum
101 L1PosixAlnum
102 PosixAlpha
103 L1PosixAlpha
104 PosixBlank
105 XPosixBlank
106 PosixCntrl
107 XPosixCntrl
108 PosixDigit
109 PosixGraph
110 L1PosixGraph
111 PosixLower
112 L1PosixLower
113 PosixPrint
114 L1PosixPrint
115 PosixPunct
116 L1PosixPunct
117 PosixSpace
118 XPosixSpace
119 PosixUpper
120 L1PosixUpper
121 PosixWord
122 L1PosixWord
123 PosixXDigit
124 XPosixXDigit
9d9177be
KW
125 )
126) {
127
3f427fd9
KW
128 # For the Latin1 properties, we change to use the eXtended version of the
129 # base property, then go through the result and get rid of everything not
b4069bca
KW
130 # in Latin1 (above 255). Actually, we retain the element for the range
131 # that crosses the 255/256 boundary if it is one that matches the
132 # property. For example, in the Word property, there is a range of code
133 # points that start at U+00F8 and goes through U+02C1. Instead of
3f427fd9
KW
134 # artifically cutting that off at 256 because 256 is the first code point
135 # above Latin1, we let the range go to its natural ending. That gives us
b4069bca
KW
136 # extra information with no added space taken. But if the range that
137 # crosses the boundary is one that doesn't match the property, we don't
138 # start a new range above 255, as that could be construed as going to
139 # infinity. For example, the Upper property doesn't include the character
140 # at 255, but does include the one at 256. We don't include the 256 one.
dab0c3e7
KW
141 my $lookup_prop = $prop;
142 $lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//;
3f427fd9
KW
143 my @invlist = prop_invlist($lookup_prop);
144
145 if ($lookup_prop ne $prop) {
146 for my $i (0 .. @invlist - 1 - 1) {
147 if ($invlist[$i] > 255) {
b4069bca
KW
148
149 # In an inversion list, even-numbered elements give the code
150 # points that begin ranges that match the property;
151 # odd-numbered give ones that begin ranges that don't match.
152 # If $i is odd, we are at the first code point above 255 that
153 # doesn't match, which means the range it is ending does
154 # match, and crosses the 255/256 boundary. We want to include
155 # this ending point, so increment $i, so the splice below
156 # includes it. Conversely, if $i is even, it is the first
157 # code point above 255 that matches, which means there was no
158 # matching range that crossed the boundary, and we don't want
159 # to include this code point, so splice before it.
160 $i++ if $i % 2 != 0;
161
162 # Remove everything past this.
163 splice @invlist, $i;
3f427fd9
KW
164 last;
165 }
166 }
167 }
168
9d9177be
KW
169 output_invlist($prop, \@invlist);
170}
171
172read_only_bottom_close_and_rename($out_fh)