This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlop: #109408
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
25 ALNUMC_A
26 ALNUMC_L1
27 ALPHA_A
28 ALPHA_L1
29 BLANK_A
30 BLANK_L1
31 CHARNAME_CONT
32 CNTRL_A
33 CNTRL_L1
34 DIGIT_A
35 GRAPH_A
36 GRAPH_L1
37 IDFIRST_A
38 IDFIRST_L1
39 LOWER_A
40 LOWER_L1
9c68f0ab
KW
41 PRINT_A
42 PRINT_L1
43 PSXSPC_A
44 PSXSPC_L1
45 PUNCT_A
46 PUNCT_L1
47 SPACE_A
48 SPACE_L1
49 UPPER_A
50 UPPER_L1
51 WORDCHAR_A
52 WORDCHAR_L1
53 XDIGIT_A
9a022f3a 54 QUOTEMETA
9c68f0ab
KW
55);
56
00c072cf
KW
57# Read in the case fold mappings.
58my %folded_closure;
cfb8fd6a 59my $file="lib/unicore/CaseFolding.txt";
dbe1ba6b
KW
60my @folds;
61use Unicode::UCD;
62
63# Use the Unicode data file if we are on an ASCII platform (which its data is
64# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
65# available. This avoids being affected by potential bugs introduced by other
66# layers of Perl
67if (ord('A') == 65
68 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
69 && open my $fh, "<", $file)
70{
71 @folds = <$fh>;
72}
73else {
74 my ($invlist_ref, $invmap_ref, undef, $default)
75 = Unicode::UCD::prop_invmap('Case_Folding');
76 for my $i (0 .. @$invlist_ref - 1 - 1) {
77 next if $invmap_ref->[$i] == $default;
78 my $adjust = -1;
79 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
80 $adjust++;
81
82 # Single-code point maps go to a 'C' type
83 if (! ref $invmap_ref->[$i]) {
84 push @folds, sprintf("%04X; C; %04X\n",
85 $j,
86 $invmap_ref->[$i] + $adjust);
87 }
88 else { # Multi-code point maps go to 'F'. prop_invmap()
89 # guarantees that no adjustment is needed for these,
90 # as the range will contain just one element
91 push @folds, sprintf("%04X; F; %s\n",
92 $j,
93 join " ", map { sprintf "%04X", $_ }
94 @{$invmap_ref->[$i]});
95 }
96 }
97 }
98}
99
100for (@folds) {
00c072cf
KW
101 chomp;
102
103 # Lines look like (without the initial '#'
104 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9
KW
105 # Get rid of comments, ignore blank or comment-only lines
106 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
107 next unless length $line;
00c072cf
KW
108 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
109
110 my $from = hex $hex_from;
111
112 # Perl only deals with C and F folds
33e4950c 113 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf
KW
114
115 # Get each code point in the range that participates in this line's fold.
116 # The hash has keys of each code point in the range, and values of what it
117 # folds to and what folds to it
118 foreach my $hex_fold (@folded) {
119 my $fold = hex $hex_fold;
120 push @{$folded_closure{$fold}}, $from if $fold < 256;
121 push @{$folded_closure{$from}}, $fold if $from < 256;
122 }
123}
124
125# Now having read all the lines, combine them into the full closure of each
126# code point in the range by adding lists together that share a common element
127foreach my $folded (keys %folded_closure) {
128 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
129 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
130 }
131}
132
9c68f0ab
KW
133my @bits; # Bit map for each code point
134
00c072cf
KW
135foreach my $folded (keys %folded_closure) {
136 $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
137 @{$folded_closure{$folded}};
138}
139
b1909af7 140# For each character, calculate which properties it matches.
9c68f0ab
KW
141for my $ord (0..255) {
142 my $char = chr($ord);
143 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
144
145 # Look at all the properties we care about here.
9c68f0ab
KW
146 for my $property (@properties) {
147 my $name = $property;
148
b1909af7
KW
149 # Remove the suffix to get the actual property name.
150 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
151 # If is a latin1 version, no further checking is needed.
152 if (! ($name =~ s/_L1$//)) {
153
b1909af7
KW
154 # Here, isn't an _L1. If its _A, it's automatically false for
155 # non-ascii. The only one current one without a suffix is valid
156 # over the whole range.
9c68f0ab
KW
157 next if $name =~ s/_A$// && $ord >= 128;
158
159 }
160 my $re;
161 if ($name eq 'PUNCT') {;
162
163 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
164 # just \pP outside it.
165 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
166 } elsif ($name eq 'CHARNAME_CONT') {;
c6e8e4a9 167 $re = qr/[-\p{XPosixWord} ():\xa0]/;
9c68f0ab 168 } elsif ($name eq 'SPACE') {;
c6e8e4a9 169 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
170 } elsif ($name eq 'IDFIRST') {
171 $re = qr/[_\p{Alpha}]/;
172 } elsif ($name eq 'PSXSPC') {
173 $re = qr/[\v\p{Space}]/;
174 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 175 $re = qr/\p{XPosixWord}/;
9c68f0ab
KW
176 } elsif ($name eq 'ALNUMC') {
177 # Like \w, but no underscore
aedd44b5 178 $re = qr/\p{Alnum}/;
9c68f0ab
KW
179 } elsif ($name eq 'OCTAL') {
180 $re = qr/[0-7]/;
9a022f3a
KW
181 } elsif ($name eq 'QUOTEMETA') {
182 $re = qr/\p{_Perl_Quotemeta}/;
9c68f0ab
KW
183 } else { # The remainder have the same name and values as Unicode
184 $re = eval "qr/\\p{$name}/";
185 use Carp;
186 carp $@ if ! defined $re;
187 }
188 #print "$ord, $name $property, $re\n";
189 if ($char =~ $re) { # Add this property if matches
190 $bits[$ord] .= '|' if $bits[$ord];
191 $bits[$ord] .= "_CC_$property";
192 }
193 }
194 #print __LINE__, " $ord $char $bits[$ord]\n";
195}
196
197# Names of C0 controls
198my @C0 = qw (
199 NUL
200 SOH
201 STX
202 ETX
203 EOT
204 ENQ
205 ACK
206 BEL
207 BS
208 HT
209 LF
210 VT
211 FF
212 CR
213 SO
214 SI
215 DLE
216 DC1
217 DC2
218 DC3
219 DC4
220 NAK
221 SYN
222 ETB
223 CAN
224 EOM
225 SUB
226 ESC
227 FS
228 GS
229 RS
230 US
231 );
232
233# Names of C1 controls, plus the adjacent DEL
234my @C1 = qw(
235 DEL
236 PAD
237 HOP
238 BPH
239 NBH
240 IND
241 NEL
242 SSA
243 ESA
244 HTS
245 HTJ
246 VTS
247 PLD
248 PLU
249 RI
250 SS2
251 SS3
252 DCS
253 PU1
254 PU2
255 STS
256 CCH
257 MW
258 SPA
259 EPA
260 SOS
261 SGC
262 SCI
263 CSI
264 ST
265 OSC
266 PM
267 APC
268 );
269
cc49830d 270my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 271 {style => '*', by => $0,
dbe1ba6b 272 from => "property definitions"});
cfb8fd6a 273
9c68f0ab
KW
274# Output the table using fairly short names for each char.
275for my $ord (0..255) {
276 my $name;
277 if ($ord < 32) { # A C0 control
278 $name = $C0[$ord];
279 } elsif ($ord > 32 && $ord < 127) { # Graphic
280 $name = "'" . chr($ord) . "'";
281 } elsif ($ord >= 127 && $ord <= 0x9f) {
282 $name = $C1[$ord - 127]; # A C1 control + DEL
283 } else { # SPACE, or, if Latin1, shorten the name */
284 use charnames();
285 $name = charnames::viacode($ord);
286 $name =~ s/LATIN CAPITAL LETTER //
287 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
288 }
cfb8fd6a 289 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
290}
291
cfb8fd6a 292read_only_bottom_close_and_rename($out_fh)