This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Env.pm: Fix splice in list context
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab 1#!perl -w
b72a36d4 2use v5.15.8;
9c68f0ab
KW
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
62841d05 25 NONLATIN1_FOLD
f4cdb42c
KW
26 ALNUMC
27 ALPHA
e48bdfbc 28 ASCII
f4cdb42c 29 BLANK
9c68f0ab 30 CHARNAME_CONT
f4cdb42c
KW
31 CNTRL
32 DIGIT
33 GRAPH
34 IDFIRST
35 LOWER
b72a36d4 36 NON_FINAL_FOLD
f4cdb42c
KW
37 PRINT
38 PSXSPC
39 PUNCT
9a022f3a 40 QUOTEMETA
f4cdb42c
KW
41 SPACE
42 UPPER
43 WORDCHAR
44 XDIGIT
f7993745 45 IS_IN_SOME_FOLD
9c68f0ab
KW
46);
47
00c072cf
KW
48# Read in the case fold mappings.
49my %folded_closure;
62841d05 50my @hex_non_final_folds;
dbe1ba6b
KW
51my @folds;
52use Unicode::UCD;
53
62841d05
KW
54BEGIN { # Have to do this at compile time because using user-defined \p{property}
55
4ef0bd69
KW
56 # Use the Unicode data file if we are on an ASCII platform (which its data
57 # is for), and it is in the modern format (starting in Unicode 3.1.0) and
58 # it is available. This avoids being affected by potential bugs
59 # introduced by other layers of Perl
60 my $file="lib/unicore/CaseFolding.txt";
61
62 if (ord('A') == 65
63 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
64 && open my $fh, "<", $file)
65 {
66 @folds = <$fh>;
67 }
68 else {
69 my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b 70 = Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69
KW
71 for my $i (0 .. @$invlist_ref - 1 - 1) {
72 next if $invmap_ref->[$i] == $default;
73 my $adjust = -1;
74 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
75 $adjust++;
76
77 # Single-code point maps go to a 'C' type
78 if (! ref $invmap_ref->[$i]) {
79 push @folds, sprintf("%04X; C; %04X\n",
80 $j,
81 $invmap_ref->[$i] + $adjust);
82 }
83 else { # Multi-code point maps go to 'F'. prop_invmap()
84 # guarantees that no adjustment is needed for these,
85 # as the range will contain just one element
86 push @folds, sprintf("%04X; F; %s\n",
87 $j,
88 join " ", map { sprintf "%04X", $_ }
89 @{$invmap_ref->[$i]});
90 }
dbe1ba6b
KW
91 }
92 }
93 }
dbe1ba6b 94
4ef0bd69
KW
95 for (@folds) {
96 chomp;
00c072cf 97
4ef0bd69
KW
98 # Lines look like (without the initial '#'
99 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
100 # Get rid of comments, ignore blank or comment-only lines
101 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
102 next unless length $line;
103 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf 104
4ef0bd69 105 my $from = hex $hex_from;
00c072cf 106
4ef0bd69
KW
107 # Perl only deals with C and F folds
108 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf 109
4ef0bd69
KW
110 # Get each code point in the range that participates in this line's fold.
111 # The hash has keys of each code point in the range, and values of what it
112 # folds to and what folds to it
113 for my $i (0 .. @folded - 1) {
114 my $hex_fold = $folded[$i];
115 my $fold = hex $hex_fold;
116 push @{$folded_closure{$fold}}, $from if $fold < 256;
117 push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05 118
07725c18
KW
119 if ($i < @folded-1
120 && $fold < 256
121 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
122 {
123 push @hex_non_final_folds, $hex_fold;
124
125 # Also add the upper case, which in the latin1 range folds to
126 # $fold
127 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
128 }
4ef0bd69 129 }
00c072cf 130 }
00c072cf 131
4ef0bd69
KW
132 # Now having read all the lines, combine them into the full closure of each
133 # code point in the range by adding lists together that share a common
134 # element
135 foreach my $folded (keys %folded_closure) {
136 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
137 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
138 }
00c072cf
KW
139 }
140}
141
62841d05
KW
142sub Is_Non_Latin1_Fold {
143 my @return;
144
145 foreach my $folded (keys %folded_closure) {
146 push @return, sprintf("%X", $folded), if grep { $_ > 255 }
147 @{$folded_closure{$folded}};
148 }
149 return join("\n", @return) . "\n";
00c072cf
KW
150}
151
62841d05
KW
152sub Is_Non_Final_Fold {
153 return join("\n", @hex_non_final_folds) . "\n";
154}
155
156my @bits; # Bit map for each code point
157
b1909af7 158# For each character, calculate which properties it matches.
9c68f0ab
KW
159for my $ord (0..255) {
160 my $char = chr($ord);
161 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
162
163 # Look at all the properties we care about here.
9c68f0ab
KW
164 for my $property (@properties) {
165 my $name = $property;
166
b1909af7
KW
167 # Remove the suffix to get the actual property name.
168 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
169 # If is a latin1 version, no further checking is needed.
170 if (! ($name =~ s/_L1$//)) {
171
b1909af7 172 # Here, isn't an _L1. If its _A, it's automatically false for
e48bdfbc
KW
173 # non-ascii. The only one current one (besides ASCII) without a
174 # suffix is valid over the whole range.
9c68f0ab
KW
175 next if $name =~ s/_A$// && $ord >= 128;
176
177 }
178 my $re;
179 if ($name eq 'PUNCT') {;
180
181 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
182 # just \pP outside it.
183 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
184 } elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e 185 $re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab 186 } elsif ($name eq 'SPACE') {;
c6e8e4a9 187 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
188 } elsif ($name eq 'IDFIRST') {
189 $re = qr/[_\p{Alpha}]/;
190 } elsif ($name eq 'PSXSPC') {
191 $re = qr/[\v\p{Space}]/;
192 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 193 $re = qr/\p{XPosixWord}/;
9c68f0ab
KW
194 } elsif ($name eq 'ALNUMC') {
195 # Like \w, but no underscore
aedd44b5 196 $re = qr/\p{Alnum}/;
9a022f3a
KW
197 } elsif ($name eq 'QUOTEMETA') {
198 $re = qr/\p{_Perl_Quotemeta}/;
62841d05
KW
199 } elsif ($name eq 'NONLATIN1_FOLD') {
200 $re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4 201 } elsif ($name eq 'NON_FINAL_FOLD') {
62841d05 202 $re = qr/\p{Is_Non_Final_Fold}/;
f7993745
KW
203 } elsif ($name eq 'IS_IN_SOME_FOLD') {
204 $re = qr/\p{_Perl_Any_Folds}/;
9c68f0ab
KW
205 } else { # The remainder have the same name and values as Unicode
206 $re = eval "qr/\\p{$name}/";
207 use Carp;
208 carp $@ if ! defined $re;
209 }
210 #print "$ord, $name $property, $re\n";
07725c18 211 if ($char =~ $re) { # Add this property if matches
9c68f0ab 212 $bits[$ord] .= '|' if $bits[$ord];
265c1f46 213 $bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab
KW
214 }
215 }
216 #print __LINE__, " $ord $char $bits[$ord]\n";
217}
218
219# Names of C0 controls
220my @C0 = qw (
221 NUL
222 SOH
223 STX
224 ETX
225 EOT
226 ENQ
227 ACK
228 BEL
229 BS
230 HT
231 LF
232 VT
233 FF
234 CR
235 SO
236 SI
237 DLE
238 DC1
239 DC2
240 DC3
241 DC4
242 NAK
243 SYN
244 ETB
245 CAN
246 EOM
247 SUB
248 ESC
249 FS
250 GS
251 RS
252 US
253 );
254
255# Names of C1 controls, plus the adjacent DEL
256my @C1 = qw(
257 DEL
258 PAD
259 HOP
260 BPH
261 NBH
262 IND
263 NEL
264 SSA
265 ESA
266 HTS
267 HTJ
268 VTS
269 PLD
270 PLU
271 RI
272 SS2
273 SS3
274 DCS
275 PU1
276 PU2
277 STS
278 CCH
279 MW
280 SPA
281 EPA
282 SOS
283 SGC
284 SCI
285 CSI
286 ST
287 OSC
288 PM
289 APC
290 );
291
cc49830d 292my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 293 {style => '*', by => $0,
dbe1ba6b 294 from => "property definitions"});
cfb8fd6a 295
9c68f0ab
KW
296# Output the table using fairly short names for each char.
297for my $ord (0..255) {
298 my $name;
299 if ($ord < 32) { # A C0 control
300 $name = $C0[$ord];
301 } elsif ($ord > 32 && $ord < 127) { # Graphic
302 $name = "'" . chr($ord) . "'";
303 } elsif ($ord >= 127 && $ord <= 0x9f) {
304 $name = $C1[$ord - 127]; # A C1 control + DEL
305 } else { # SPACE, or, if Latin1, shorten the name */
306 use charnames();
307 $name = charnames::viacode($ord);
308 $name =~ s/LATIN CAPITAL LETTER //
309 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
310 }
cfb8fd6a 311 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
312}
313
cfb8fd6a 314read_only_bottom_close_and_rename($out_fh)