This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: fix minor errors in description of postderef
[perl5.git] / regen / mk_PL_charclass.pl
... / ...
CommitLineData
1#!perl -w
2use v5.15.8;
3use strict;
4use warnings;
5require 'regen/regen_lib.pl';
6
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
9# code point at the corresponding position in the table array has. The first
10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
11# an application to see if the code point "i" has a particular property, it
12# just does
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
23
24my @properties = qw(
25 NONLATIN1_FOLD
26 ALPHANUMERIC
27 ALPHA
28 ASCII
29 BLANK
30 CASED
31 CHARNAME_CONT
32 CNTRL
33 DIGIT
34 GRAPH
35 IDFIRST
36 LOWER
37 NON_FINAL_FOLD
38 PRINT
39 PSXSPC
40 PUNCT
41 QUOTEMETA
42 SPACE
43 UPPER
44 WORDCHAR
45 XDIGIT
46 VERTSPACE
47 IS_IN_SOME_FOLD
48 BACKSLASH_FOO_LBRACE_IS_META
49);
50
51# Read in the case fold mappings.
52my %folded_closure;
53my @hex_non_final_folds;
54my @folds;
55use Unicode::UCD;
56
57BEGIN { # Have to do this at compile time because using user-defined \p{property}
58
59 # Use the Unicode data file if we are on an ASCII platform (which its data
60 # is for), and it is in the modern format (starting in Unicode 3.1.0) and
61 # it is available. This avoids being affected by potential bugs
62 # introduced by other layers of Perl
63 my $file="lib/unicore/CaseFolding.txt";
64
65 if (ord('A') == 65
66 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
67 && open my $fh, "<", $file)
68 {
69 @folds = <$fh>;
70 }
71 else {
72 my ($invlist_ref, $invmap_ref, undef, $default)
73 = Unicode::UCD::prop_invmap('Case_Folding');
74 for my $i (0 .. @$invlist_ref - 1 - 1) {
75 next if $invmap_ref->[$i] == $default;
76 my $adjust = -1;
77 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
78 $adjust++;
79
80 # Single-code point maps go to a 'C' type
81 if (! ref $invmap_ref->[$i]) {
82 push @folds, sprintf("%04X; C; %04X\n",
83 $j,
84 $invmap_ref->[$i] + $adjust);
85 }
86 else { # Multi-code point maps go to 'F'. prop_invmap()
87 # guarantees that no adjustment is needed for these,
88 # as the range will contain just one element
89 push @folds, sprintf("%04X; F; %s\n",
90 $j,
91 join " ", map { sprintf "%04X", $_ }
92 @{$invmap_ref->[$i]});
93 }
94 }
95 }
96 }
97
98 for (@folds) {
99 chomp;
100
101 # Lines look like (without the initial '#'
102 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
103 # Get rid of comments, ignore blank or comment-only lines
104 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
105 next unless length $line;
106 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
107
108 my $from = hex $hex_from;
109
110 # Perl only deals with C and F folds
111 next if $fold_type ne 'C' and $fold_type ne 'F';
112
113 # Get each code point in the range that participates in this line's fold.
114 # The hash has keys of each code point in the range, and values of what it
115 # folds to and what folds to it
116 for my $i (0 .. @folded - 1) {
117 my $hex_fold = $folded[$i];
118 my $fold = hex $hex_fold;
119 push @{$folded_closure{$fold}}, $from if $fold < 256;
120 push @{$folded_closure{$from}}, $fold if $from < 256;
121
122 if ($i < @folded-1
123 && $fold < 256
124 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
125 {
126 push @hex_non_final_folds, $hex_fold;
127
128 # Also add the upper case, which in the latin1 range folds to
129 # $fold
130 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
131 }
132 }
133 }
134
135 # Now having read all the lines, combine them into the full closure of each
136 # code point in the range by adding lists together that share a common
137 # element
138 foreach my $folded (keys %folded_closure) {
139 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
140 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
141 }
142 }
143}
144
145sub Is_Non_Latin1_Fold {
146 my @return;
147
148 foreach my $folded (keys %folded_closure) {
149 push @return, sprintf("%X", $folded), if grep { $_ > 255 }
150 @{$folded_closure{$folded}};
151 }
152 return join("\n", @return) . "\n";
153}
154
155sub Is_Non_Final_Fold {
156 return join("\n", @hex_non_final_folds) . "\n";
157}
158
159my @bits; # Bit map for each code point
160
161# For each character, calculate which properties it matches.
162for my $ord (0..255) {
163 my $char = chr($ord);
164 utf8::upgrade($char); # Important to use Unicode semantics!
165
166 # Look at all the properties we care about here.
167 for my $property (@properties) {
168 my $name = $property;
169
170 # Remove the suffix to get the actual property name.
171 # Currently the suffixes are '_L1', '_A', and none.
172 # If is a latin1 version, no further checking is needed.
173 if (! ($name =~ s/_L1$//)) {
174
175 # Here, isn't an _L1. If its _A, it's automatically false for
176 # non-ascii. The only current ones (besides ASCII) without a
177 # suffix are valid over the whole range.
178 next if $name =~ s/_A$// && $ord >= 128;
179
180 }
181 my $re;
182 if ($name eq 'PUNCT') {;
183
184 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
185 # just \pP outside it.
186 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
187 } elsif ($name eq 'CHARNAME_CONT') {;
188 $re = qr/\p{_Perl_Charname_Continue}/,
189 } elsif ($name eq 'SPACE') {;
190 $re = qr/\p{XPerlSpace}/;
191 } elsif ($name eq 'IDFIRST') {
192 $re = qr/[_\p{Alpha}]/;
193 } elsif ($name eq 'PSXSPC') {
194 $re = qr/[\v\p{Space}]/;
195 } elsif ($name eq 'WORDCHAR') {
196 $re = qr/\p{XPosixWord}/;
197 } elsif ($name eq 'ALPHANUMERIC') {
198 # Like \w, but no underscore
199 $re = qr/\p{Alnum}/;
200 } elsif ($name eq 'QUOTEMETA') {
201 $re = qr/\p{_Perl_Quotemeta}/;
202 } elsif ($name eq 'NONLATIN1_FOLD') {
203 $re = qr/\p{Is_Non_Latin1_Fold}/;
204 } elsif ($name eq 'NON_FINAL_FOLD') {
205 $re = qr/\p{Is_Non_Final_Fold}/;
206 } elsif ($name eq 'IS_IN_SOME_FOLD') {
207 $re = qr/\p{_Perl_Any_Folds}/;
208 } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
209
210 # This is true for FOO where FOO is the varying character in:
211 # \a{, \b{, \c{, ...
212 # and the sequence has non-literal meaning to Perl; so it is true
213 # for 'x' because \x{ is special, but not 'a' because \a{ isn't.
214 $re = qr/[gkNopPx]/;
215 } else { # The remainder have the same name and values as Unicode
216 $re = eval "qr/\\p{$name}/";
217 use Carp;
218 carp $@ if ! defined $re;
219 }
220 #print "$ord, $name $property, $re\n";
221 if ($char =~ $re) { # Add this property if matches
222 $bits[$ord] .= '|' if $bits[$ord];
223 $bits[$ord] .= "(1U<<_CC_$property)";
224 }
225 }
226 #print __LINE__, " $ord $char $bits[$ord]\n";
227}
228
229# Names of C0 controls
230my @C0 = qw (
231 NUL
232 SOH
233 STX
234 ETX
235 EOT
236 ENQ
237 ACK
238 BEL
239 BS
240 HT
241 LF
242 VT
243 FF
244 CR
245 SO
246 SI
247 DLE
248 DC1
249 DC2
250 DC3
251 DC4
252 NAK
253 SYN
254 ETB
255 CAN
256 EOM
257 SUB
258 ESC
259 FS
260 GS
261 RS
262 US
263 );
264
265# Names of C1 controls, plus the adjacent DEL
266my @C1 = qw(
267 DEL
268 PAD
269 HOP
270 BPH
271 NBH
272 IND
273 NEL
274 SSA
275 ESA
276 HTS
277 HTJ
278 VTS
279 PLD
280 PLU
281 RI
282 SS2
283 SS3
284 DCS
285 PU1
286 PU2
287 STS
288 CCH
289 MW
290 SPA
291 EPA
292 SOS
293 SGC
294 SCI
295 CSI
296 ST
297 OSC
298 PM
299 APC
300 );
301
302my $out_fh = open_new('l1_char_class_tab.h', '>',
303 {style => '*', by => $0,
304 from => "property definitions"});
305
306# Output the table using fairly short names for each char.
307for my $ord (0..255) {
308 my $name;
309 if ($ord < 32) { # A C0 control
310 $name = $C0[$ord];
311 } elsif ($ord > 32 && $ord < 127) { # Graphic
312 $name = "'" . chr($ord) . "'";
313 } elsif ($ord >= 127 && $ord <= 0x9f) {
314 $name = $C1[$ord - 127]; # A C1 control + DEL
315 } else { # SPACE, or, if Latin1, shorten the name */
316 use charnames();
317 $name = charnames::viacode($ord);
318 $name =~ s/LATIN CAPITAL LETTER //
319 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
320 }
321 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
322}
323
324read_only_bottom_close_and_rename($out_fh)