This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #113718] Add inline.h
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab 1#!perl -w
b72a36d4 2use v5.15.8;
9c68f0ab
KW
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
f4cdb42c
KW
25 ALNUMC
26 ALPHA
e48bdfbc 27 ASCII
f4cdb42c 28 BLANK
9c68f0ab 29 CHARNAME_CONT
f4cdb42c
KW
30 CNTRL
31 DIGIT
32 GRAPH
33 IDFIRST
34 LOWER
b72a36d4 35 NON_FINAL_FOLD
f4cdb42c
KW
36 PRINT
37 PSXSPC
38 PUNCT
9a022f3a 39 QUOTEMETA
f4cdb42c
KW
40 SPACE
41 UPPER
42 WORDCHAR
43 XDIGIT
9c68f0ab
KW
44);
45
00c072cf
KW
46# Read in the case fold mappings.
47my %folded_closure;
cfb8fd6a 48my $file="lib/unicore/CaseFolding.txt";
dbe1ba6b
KW
49my @folds;
50use Unicode::UCD;
51
52# Use the Unicode data file if we are on an ASCII platform (which its data is
53# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
54# available. This avoids being affected by potential bugs introduced by other
55# layers of Perl
56if (ord('A') == 65
57 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
58 && open my $fh, "<", $file)
59{
60 @folds = <$fh>;
61}
62else {
63 my ($invlist_ref, $invmap_ref, undef, $default)
64 = Unicode::UCD::prop_invmap('Case_Folding');
65 for my $i (0 .. @$invlist_ref - 1 - 1) {
66 next if $invmap_ref->[$i] == $default;
67 my $adjust = -1;
68 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
69 $adjust++;
70
71 # Single-code point maps go to a 'C' type
72 if (! ref $invmap_ref->[$i]) {
73 push @folds, sprintf("%04X; C; %04X\n",
74 $j,
75 $invmap_ref->[$i] + $adjust);
76 }
77 else { # Multi-code point maps go to 'F'. prop_invmap()
78 # guarantees that no adjustment is needed for these,
79 # as the range will contain just one element
80 push @folds, sprintf("%04X; F; %s\n",
81 $j,
82 join " ", map { sprintf "%04X", $_ }
83 @{$invmap_ref->[$i]});
84 }
85 }
86 }
87}
88
89for (@folds) {
00c072cf
KW
90 chomp;
91
92 # Lines look like (without the initial '#'
93 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9
KW
94 # Get rid of comments, ignore blank or comment-only lines
95 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
96 next unless length $line;
00c072cf
KW
97 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
98
99 my $from = hex $hex_from;
100
101 # Perl only deals with C and F folds
33e4950c 102 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf
KW
103
104 # Get each code point in the range that participates in this line's fold.
105 # The hash has keys of each code point in the range, and values of what it
106 # folds to and what folds to it
107 foreach my $hex_fold (@folded) {
108 my $fold = hex $hex_fold;
109 push @{$folded_closure{$fold}}, $from if $fold < 256;
110 push @{$folded_closure{$from}}, $fold if $from < 256;
111 }
112}
113
114# Now having read all the lines, combine them into the full closure of each
115# code point in the range by adding lists together that share a common element
116foreach my $folded (keys %folded_closure) {
117 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
118 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
119 }
120}
121
9c68f0ab
KW
122my @bits; # Bit map for each code point
123
00c072cf 124foreach my $folded (keys %folded_closure) {
265c1f46 125 $bits[$folded] = "(1U<<_CC_NONLATIN1_FOLD)" if grep { $_ > 255 }
00c072cf
KW
126 @{$folded_closure{$folded}};
127}
128
b1909af7 129# For each character, calculate which properties it matches.
9c68f0ab
KW
130for my $ord (0..255) {
131 my $char = chr($ord);
132 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
133
134 # Look at all the properties we care about here.
9c68f0ab
KW
135 for my $property (@properties) {
136 my $name = $property;
137
b1909af7
KW
138 # Remove the suffix to get the actual property name.
139 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
140 # If is a latin1 version, no further checking is needed.
141 if (! ($name =~ s/_L1$//)) {
142
b1909af7 143 # Here, isn't an _L1. If its _A, it's automatically false for
e48bdfbc
KW
144 # non-ascii. The only one current one (besides ASCII) without a
145 # suffix is valid over the whole range.
9c68f0ab
KW
146 next if $name =~ s/_A$// && $ord >= 128;
147
148 }
149 my $re;
150 if ($name eq 'PUNCT') {;
151
152 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
153 # just \pP outside it.
154 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
155 } elsif ($name eq 'CHARNAME_CONT') {;
c6e8e4a9 156 $re = qr/[-\p{XPosixWord} ():\xa0]/;
9c68f0ab 157 } elsif ($name eq 'SPACE') {;
c6e8e4a9 158 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
159 } elsif ($name eq 'IDFIRST') {
160 $re = qr/[_\p{Alpha}]/;
161 } elsif ($name eq 'PSXSPC') {
162 $re = qr/[\v\p{Space}]/;
163 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 164 $re = qr/\p{XPosixWord}/;
9c68f0ab
KW
165 } elsif ($name eq 'ALNUMC') {
166 # Like \w, but no underscore
aedd44b5 167 $re = qr/\p{Alnum}/;
9a022f3a
KW
168 } elsif ($name eq 'QUOTEMETA') {
169 $re = qr/\p{_Perl_Quotemeta}/;
b72a36d4
KW
170 } elsif ($name eq 'NON_FINAL_FOLD') {
171 $re = qr/\p{_Perl_Non_Final_Folds}/;
9c68f0ab
KW
172 } else { # The remainder have the same name and values as Unicode
173 $re = eval "qr/\\p{$name}/";
174 use Carp;
175 carp $@ if ! defined $re;
176 }
177 #print "$ord, $name $property, $re\n";
b72a36d4
KW
178 if ($char =~ $re # Add this property if matches
179 || ($name eq 'NON_FINAL_FOLD'
180 # Also include chars that fold to the non-final
181 && CORE::fc($char) =~ $re))
182 {
9c68f0ab 183 $bits[$ord] .= '|' if $bits[$ord];
265c1f46 184 $bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab
KW
185 }
186 }
187 #print __LINE__, " $ord $char $bits[$ord]\n";
188}
189
190# Names of C0 controls
191my @C0 = qw (
192 NUL
193 SOH
194 STX
195 ETX
196 EOT
197 ENQ
198 ACK
199 BEL
200 BS
201 HT
202 LF
203 VT
204 FF
205 CR
206 SO
207 SI
208 DLE
209 DC1
210 DC2
211 DC3
212 DC4
213 NAK
214 SYN
215 ETB
216 CAN
217 EOM
218 SUB
219 ESC
220 FS
221 GS
222 RS
223 US
224 );
225
226# Names of C1 controls, plus the adjacent DEL
227my @C1 = qw(
228 DEL
229 PAD
230 HOP
231 BPH
232 NBH
233 IND
234 NEL
235 SSA
236 ESA
237 HTS
238 HTJ
239 VTS
240 PLD
241 PLU
242 RI
243 SS2
244 SS3
245 DCS
246 PU1
247 PU2
248 STS
249 CCH
250 MW
251 SPA
252 EPA
253 SOS
254 SGC
255 SCI
256 CSI
257 ST
258 OSC
259 PM
260 APC
261 );
262
cc49830d 263my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 264 {style => '*', by => $0,
dbe1ba6b 265 from => "property definitions"});
cfb8fd6a 266
9c68f0ab
KW
267# Output the table using fairly short names for each char.
268for my $ord (0..255) {
269 my $name;
270 if ($ord < 32) { # A C0 control
271 $name = $C0[$ord];
272 } elsif ($ord > 32 && $ord < 127) { # Graphic
273 $name = "'" . chr($ord) . "'";
274 } elsif ($ord >= 127 && $ord <= 0x9f) {
275 $name = $C1[$ord - 127]; # A C1 control + DEL
276 } else { # SPACE, or, if Latin1, shorten the name */
277 use charnames();
278 $name = charnames::viacode($ord);
279 $name =~ s/LATIN CAPITAL LETTER //
280 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
281 }
cfb8fd6a 282 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
283}
284
cfb8fd6a 285read_only_bottom_close_and_rename($out_fh)