This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Preserve the case of t/lib/vmsfspec.t.
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab 1#!perl -w
b72a36d4 2use v5.15.8;
9c68f0ab
KW
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
62841d05 25 NONLATIN1_FOLD
f4cdb42c
KW
26 ALNUMC
27 ALPHA
e48bdfbc 28 ASCII
f4cdb42c 29 BLANK
9c68f0ab 30 CHARNAME_CONT
f4cdb42c
KW
31 CNTRL
32 DIGIT
33 GRAPH
34 IDFIRST
35 LOWER
b72a36d4 36 NON_FINAL_FOLD
f4cdb42c
KW
37 PRINT
38 PSXSPC
39 PUNCT
9a022f3a 40 QUOTEMETA
f4cdb42c
KW
41 SPACE
42 UPPER
43 WORDCHAR
44 XDIGIT
a0947d7b 45 VERTSPACE
f7993745 46 IS_IN_SOME_FOLD
9c68f0ab
KW
47);
48
00c072cf
KW
49# Read in the case fold mappings.
50my %folded_closure;
62841d05 51my @hex_non_final_folds;
dbe1ba6b
KW
52my @folds;
53use Unicode::UCD;
54
62841d05
KW
55BEGIN { # Have to do this at compile time because using user-defined \p{property}
56
4ef0bd69
KW
57 # Use the Unicode data file if we are on an ASCII platform (which its data
58 # is for), and it is in the modern format (starting in Unicode 3.1.0) and
59 # it is available. This avoids being affected by potential bugs
60 # introduced by other layers of Perl
61 my $file="lib/unicore/CaseFolding.txt";
62
63 if (ord('A') == 65
64 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
65 && open my $fh, "<", $file)
66 {
67 @folds = <$fh>;
68 }
69 else {
70 my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b 71 = Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69
KW
72 for my $i (0 .. @$invlist_ref - 1 - 1) {
73 next if $invmap_ref->[$i] == $default;
74 my $adjust = -1;
75 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
76 $adjust++;
77
78 # Single-code point maps go to a 'C' type
79 if (! ref $invmap_ref->[$i]) {
80 push @folds, sprintf("%04X; C; %04X\n",
81 $j,
82 $invmap_ref->[$i] + $adjust);
83 }
84 else { # Multi-code point maps go to 'F'. prop_invmap()
85 # guarantees that no adjustment is needed for these,
86 # as the range will contain just one element
87 push @folds, sprintf("%04X; F; %s\n",
88 $j,
89 join " ", map { sprintf "%04X", $_ }
90 @{$invmap_ref->[$i]});
91 }
dbe1ba6b
KW
92 }
93 }
94 }
dbe1ba6b 95
4ef0bd69
KW
96 for (@folds) {
97 chomp;
00c072cf 98
4ef0bd69
KW
99 # Lines look like (without the initial '#'
100 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
101 # Get rid of comments, ignore blank or comment-only lines
102 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
103 next unless length $line;
104 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf 105
4ef0bd69 106 my $from = hex $hex_from;
00c072cf 107
4ef0bd69
KW
108 # Perl only deals with C and F folds
109 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf 110
4ef0bd69
KW
111 # Get each code point in the range that participates in this line's fold.
112 # The hash has keys of each code point in the range, and values of what it
113 # folds to and what folds to it
114 for my $i (0 .. @folded - 1) {
115 my $hex_fold = $folded[$i];
116 my $fold = hex $hex_fold;
117 push @{$folded_closure{$fold}}, $from if $fold < 256;
118 push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05 119
07725c18
KW
120 if ($i < @folded-1
121 && $fold < 256
122 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
123 {
124 push @hex_non_final_folds, $hex_fold;
125
126 # Also add the upper case, which in the latin1 range folds to
127 # $fold
128 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
129 }
4ef0bd69 130 }
00c072cf 131 }
00c072cf 132
4ef0bd69
KW
133 # Now having read all the lines, combine them into the full closure of each
134 # code point in the range by adding lists together that share a common
135 # element
136 foreach my $folded (keys %folded_closure) {
137 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
138 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
139 }
00c072cf
KW
140 }
141}
142
62841d05
KW
143sub Is_Non_Latin1_Fold {
144 my @return;
145
146 foreach my $folded (keys %folded_closure) {
147 push @return, sprintf("%X", $folded), if grep { $_ > 255 }
148 @{$folded_closure{$folded}};
149 }
150 return join("\n", @return) . "\n";
00c072cf
KW
151}
152
62841d05
KW
153sub Is_Non_Final_Fold {
154 return join("\n", @hex_non_final_folds) . "\n";
155}
156
157my @bits; # Bit map for each code point
158
b1909af7 159# For each character, calculate which properties it matches.
9c68f0ab
KW
160for my $ord (0..255) {
161 my $char = chr($ord);
162 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
163
164 # Look at all the properties we care about here.
9c68f0ab
KW
165 for my $property (@properties) {
166 my $name = $property;
167
b1909af7
KW
168 # Remove the suffix to get the actual property name.
169 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
170 # If is a latin1 version, no further checking is needed.
171 if (! ($name =~ s/_L1$//)) {
172
b1909af7 173 # Here, isn't an _L1. If its _A, it's automatically false for
a0947d7b
KW
174 # non-ascii. The only current ones (besides ASCII) without a
175 # suffix are valid over the whole range.
9c68f0ab
KW
176 next if $name =~ s/_A$// && $ord >= 128;
177
178 }
179 my $re;
180 if ($name eq 'PUNCT') {;
181
182 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
183 # just \pP outside it.
184 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
185 } elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e 186 $re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab 187 } elsif ($name eq 'SPACE') {;
c6e8e4a9 188 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
189 } elsif ($name eq 'IDFIRST') {
190 $re = qr/[_\p{Alpha}]/;
191 } elsif ($name eq 'PSXSPC') {
192 $re = qr/[\v\p{Space}]/;
193 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 194 $re = qr/\p{XPosixWord}/;
9c68f0ab
KW
195 } elsif ($name eq 'ALNUMC') {
196 # Like \w, but no underscore
aedd44b5 197 $re = qr/\p{Alnum}/;
9a022f3a
KW
198 } elsif ($name eq 'QUOTEMETA') {
199 $re = qr/\p{_Perl_Quotemeta}/;
62841d05
KW
200 } elsif ($name eq 'NONLATIN1_FOLD') {
201 $re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4 202 } elsif ($name eq 'NON_FINAL_FOLD') {
62841d05 203 $re = qr/\p{Is_Non_Final_Fold}/;
f7993745
KW
204 } elsif ($name eq 'IS_IN_SOME_FOLD') {
205 $re = qr/\p{_Perl_Any_Folds}/;
9c68f0ab
KW
206 } else { # The remainder have the same name and values as Unicode
207 $re = eval "qr/\\p{$name}/";
208 use Carp;
209 carp $@ if ! defined $re;
210 }
211 #print "$ord, $name $property, $re\n";
07725c18 212 if ($char =~ $re) { # Add this property if matches
9c68f0ab 213 $bits[$ord] .= '|' if $bits[$ord];
265c1f46 214 $bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab
KW
215 }
216 }
217 #print __LINE__, " $ord $char $bits[$ord]\n";
218}
219
220# Names of C0 controls
221my @C0 = qw (
222 NUL
223 SOH
224 STX
225 ETX
226 EOT
227 ENQ
228 ACK
229 BEL
230 BS
231 HT
232 LF
233 VT
234 FF
235 CR
236 SO
237 SI
238 DLE
239 DC1
240 DC2
241 DC3
242 DC4
243 NAK
244 SYN
245 ETB
246 CAN
247 EOM
248 SUB
249 ESC
250 FS
251 GS
252 RS
253 US
254 );
255
256# Names of C1 controls, plus the adjacent DEL
257my @C1 = qw(
258 DEL
259 PAD
260 HOP
261 BPH
262 NBH
263 IND
264 NEL
265 SSA
266 ESA
267 HTS
268 HTJ
269 VTS
270 PLD
271 PLU
272 RI
273 SS2
274 SS3
275 DCS
276 PU1
277 PU2
278 STS
279 CCH
280 MW
281 SPA
282 EPA
283 SOS
284 SGC
285 SCI
286 CSI
287 ST
288 OSC
289 PM
290 APC
291 );
292
cc49830d 293my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 294 {style => '*', by => $0,
dbe1ba6b 295 from => "property definitions"});
cfb8fd6a 296
9c68f0ab
KW
297# Output the table using fairly short names for each char.
298for my $ord (0..255) {
299 my $name;
300 if ($ord < 32) { # A C0 control
301 $name = $C0[$ord];
302 } elsif ($ord > 32 && $ord < 127) { # Graphic
303 $name = "'" . chr($ord) . "'";
304 } elsif ($ord >= 127 && $ord <= 0x9f) {
305 $name = $C1[$ord - 127]; # A C1 control + DEL
306 } else { # SPACE, or, if Latin1, shorten the name */
307 use charnames();
308 $name = charnames::viacode($ord);
309 $name =~ s/LATIN CAPITAL LETTER //
310 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
311 }
cfb8fd6a 312 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
313}
314
cfb8fd6a 315read_only_bottom_close_and_rename($out_fh)