This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update File-Temp to CPAN version 0.2302
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab 1#!perl -w
b72a36d4 2use v5.15.8;
9c68f0ab
KW
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
62841d05 25 NONLATIN1_FOLD
15861f94 26 ALPHANUMERIC
f4cdb42c 27 ALPHA
e48bdfbc 28 ASCII
f4cdb42c 29 BLANK
b0d691b2 30 CASED
9c68f0ab 31 CHARNAME_CONT
f4cdb42c
KW
32 CNTRL
33 DIGIT
34 GRAPH
35 IDFIRST
36 LOWER
b72a36d4 37 NON_FINAL_FOLD
f4cdb42c
KW
38 PRINT
39 PSXSPC
40 PUNCT
9a022f3a 41 QUOTEMETA
f4cdb42c
KW
42 SPACE
43 UPPER
44 WORDCHAR
45 XDIGIT
a0947d7b 46 VERTSPACE
f7993745 47 IS_IN_SOME_FOLD
4d68ffa0 48 BACKSLASH_FOO_LBRACE_IS_META
9c68f0ab
KW
49);
50
00c072cf
KW
51# Read in the case fold mappings.
52my %folded_closure;
62841d05 53my @hex_non_final_folds;
dbe1ba6b
KW
54my @folds;
55use Unicode::UCD;
56
62841d05
KW
57BEGIN { # Have to do this at compile time because using user-defined \p{property}
58
4ef0bd69
KW
59 # Use the Unicode data file if we are on an ASCII platform (which its data
60 # is for), and it is in the modern format (starting in Unicode 3.1.0) and
61 # it is available. This avoids being affected by potential bugs
62 # introduced by other layers of Perl
63 my $file="lib/unicore/CaseFolding.txt";
64
65 if (ord('A') == 65
66 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
67 && open my $fh, "<", $file)
68 {
69 @folds = <$fh>;
70 }
71 else {
72 my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b 73 = Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69
KW
74 for my $i (0 .. @$invlist_ref - 1 - 1) {
75 next if $invmap_ref->[$i] == $default;
76 my $adjust = -1;
77 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
78 $adjust++;
79
80 # Single-code point maps go to a 'C' type
81 if (! ref $invmap_ref->[$i]) {
82 push @folds, sprintf("%04X; C; %04X\n",
83 $j,
84 $invmap_ref->[$i] + $adjust);
85 }
86 else { # Multi-code point maps go to 'F'. prop_invmap()
87 # guarantees that no adjustment is needed for these,
88 # as the range will contain just one element
89 push @folds, sprintf("%04X; F; %s\n",
90 $j,
91 join " ", map { sprintf "%04X", $_ }
92 @{$invmap_ref->[$i]});
93 }
dbe1ba6b
KW
94 }
95 }
96 }
dbe1ba6b 97
4ef0bd69
KW
98 for (@folds) {
99 chomp;
00c072cf 100
4ef0bd69
KW
101 # Lines look like (without the initial '#'
102 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
103 # Get rid of comments, ignore blank or comment-only lines
104 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
105 next unless length $line;
106 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf 107
4ef0bd69 108 my $from = hex $hex_from;
00c072cf 109
4ef0bd69
KW
110 # Perl only deals with C and F folds
111 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf 112
4ef0bd69
KW
113 # Get each code point in the range that participates in this line's fold.
114 # The hash has keys of each code point in the range, and values of what it
115 # folds to and what folds to it
116 for my $i (0 .. @folded - 1) {
117 my $hex_fold = $folded[$i];
118 my $fold = hex $hex_fold;
119 push @{$folded_closure{$fold}}, $from if $fold < 256;
120 push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05 121
07725c18
KW
122 if ($i < @folded-1
123 && $fold < 256
124 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
125 {
126 push @hex_non_final_folds, $hex_fold;
127
128 # Also add the upper case, which in the latin1 range folds to
129 # $fold
130 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
131 }
4ef0bd69 132 }
00c072cf 133 }
00c072cf 134
4ef0bd69
KW
135 # Now having read all the lines, combine them into the full closure of each
136 # code point in the range by adding lists together that share a common
137 # element
138 foreach my $folded (keys %folded_closure) {
139 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
140 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
141 }
00c072cf
KW
142 }
143}
144
62841d05
KW
145sub Is_Non_Latin1_Fold {
146 my @return;
147
148 foreach my $folded (keys %folded_closure) {
149 push @return, sprintf("%X", $folded), if grep { $_ > 255 }
150 @{$folded_closure{$folded}};
151 }
152 return join("\n", @return) . "\n";
00c072cf
KW
153}
154
62841d05
KW
155sub Is_Non_Final_Fold {
156 return join("\n", @hex_non_final_folds) . "\n";
157}
158
159my @bits; # Bit map for each code point
160
b1909af7 161# For each character, calculate which properties it matches.
9c68f0ab
KW
162for my $ord (0..255) {
163 my $char = chr($ord);
164 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
165
166 # Look at all the properties we care about here.
9c68f0ab
KW
167 for my $property (@properties) {
168 my $name = $property;
169
b1909af7
KW
170 # Remove the suffix to get the actual property name.
171 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
172 # If is a latin1 version, no further checking is needed.
173 if (! ($name =~ s/_L1$//)) {
174
b1909af7 175 # Here, isn't an _L1. If its _A, it's automatically false for
a0947d7b
KW
176 # non-ascii. The only current ones (besides ASCII) without a
177 # suffix are valid over the whole range.
9c68f0ab
KW
178 next if $name =~ s/_A$// && $ord >= 128;
179
180 }
181 my $re;
182 if ($name eq 'PUNCT') {;
183
184 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
185 # just \pP outside it.
186 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
187 } elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e 188 $re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab 189 } elsif ($name eq 'SPACE') {;
c6e8e4a9 190 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
191 } elsif ($name eq 'IDFIRST') {
192 $re = qr/[_\p{Alpha}]/;
193 } elsif ($name eq 'PSXSPC') {
194 $re = qr/[\v\p{Space}]/;
195 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 196 $re = qr/\p{XPosixWord}/;
15861f94 197 } elsif ($name eq 'ALPHANUMERIC') {
9c68f0ab 198 # Like \w, but no underscore
aedd44b5 199 $re = qr/\p{Alnum}/;
9a022f3a
KW
200 } elsif ($name eq 'QUOTEMETA') {
201 $re = qr/\p{_Perl_Quotemeta}/;
62841d05
KW
202 } elsif ($name eq 'NONLATIN1_FOLD') {
203 $re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4 204 } elsif ($name eq 'NON_FINAL_FOLD') {
62841d05 205 $re = qr/\p{Is_Non_Final_Fold}/;
f7993745
KW
206 } elsif ($name eq 'IS_IN_SOME_FOLD') {
207 $re = qr/\p{_Perl_Any_Folds}/;
4d68ffa0
KW
208 } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
209
210 # This is true for FOO where FOO is the varying character in:
211 # \a{, \b{, \c{, ...
212 # and the sequence has non-literal meaning to Perl; so it is true
213 # for 'x' because \x{ is special, but not 'a' because \a{ isn't.
214 $re = qr/[gkNopPx]/;
9c68f0ab
KW
215 } else { # The remainder have the same name and values as Unicode
216 $re = eval "qr/\\p{$name}/";
217 use Carp;
218 carp $@ if ! defined $re;
219 }
220 #print "$ord, $name $property, $re\n";
07725c18 221 if ($char =~ $re) { # Add this property if matches
9c68f0ab 222 $bits[$ord] .= '|' if $bits[$ord];
265c1f46 223 $bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab
KW
224 }
225 }
226 #print __LINE__, " $ord $char $bits[$ord]\n";
227}
228
229# Names of C0 controls
230my @C0 = qw (
231 NUL
232 SOH
233 STX
234 ETX
235 EOT
236 ENQ
237 ACK
238 BEL
239 BS
240 HT
241 LF
242 VT
243 FF
244 CR
245 SO
246 SI
247 DLE
248 DC1
249 DC2
250 DC3
251 DC4
252 NAK
253 SYN
254 ETB
255 CAN
256 EOM
257 SUB
258 ESC
259 FS
260 GS
261 RS
262 US
263 );
264
265# Names of C1 controls, plus the adjacent DEL
266my @C1 = qw(
267 DEL
268 PAD
269 HOP
270 BPH
271 NBH
272 IND
273 NEL
274 SSA
275 ESA
276 HTS
277 HTJ
278 VTS
279 PLD
280 PLU
281 RI
282 SS2
283 SS3
284 DCS
285 PU1
286 PU2
287 STS
288 CCH
289 MW
290 SPA
291 EPA
292 SOS
293 SGC
294 SCI
295 CSI
296 ST
297 OSC
298 PM
299 APC
300 );
301
cc49830d 302my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 303 {style => '*', by => $0,
dbe1ba6b 304 from => "property definitions"});
cfb8fd6a 305
9c68f0ab
KW
306# Output the table using fairly short names for each char.
307for my $ord (0..255) {
308 my $name;
309 if ($ord < 32) { # A C0 control
310 $name = $C0[$ord];
311 } elsif ($ord > 32 && $ord < 127) { # Graphic
312 $name = "'" . chr($ord) . "'";
313 } elsif ($ord >= 127 && $ord <= 0x9f) {
314 $name = $C1[$ord - 127]; # A C1 control + DEL
315 } else { # SPACE, or, if Latin1, shorten the name */
316 use charnames();
317 $name = charnames::viacode($ord);
318 $name =~ s/LATIN CAPITAL LETTER //
319 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
320 }
cfb8fd6a 321 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
322}
323
cfb8fd6a 324read_only_bottom_close_and_rename($out_fh)