This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex: Add pseudo-Posix class: 'cased'
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab 1#!perl -w
b72a36d4 2use v5.15.8;
9c68f0ab
KW
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
62841d05 25 NONLATIN1_FOLD
15861f94 26 ALPHANUMERIC
f4cdb42c 27 ALPHA
e48bdfbc 28 ASCII
f4cdb42c 29 BLANK
b0d691b2 30 CASED
9c68f0ab 31 CHARNAME_CONT
f4cdb42c
KW
32 CNTRL
33 DIGIT
34 GRAPH
35 IDFIRST
36 LOWER
b72a36d4 37 NON_FINAL_FOLD
f4cdb42c
KW
38 PRINT
39 PSXSPC
40 PUNCT
9a022f3a 41 QUOTEMETA
f4cdb42c
KW
42 SPACE
43 UPPER
44 WORDCHAR
45 XDIGIT
a0947d7b 46 VERTSPACE
f7993745 47 IS_IN_SOME_FOLD
9c68f0ab
KW
48);
49
00c072cf
KW
50# Read in the case fold mappings.
51my %folded_closure;
62841d05 52my @hex_non_final_folds;
dbe1ba6b
KW
53my @folds;
54use Unicode::UCD;
55
62841d05
KW
56BEGIN { # Have to do this at compile time because using user-defined \p{property}
57
4ef0bd69
KW
58 # Use the Unicode data file if we are on an ASCII platform (which its data
59 # is for), and it is in the modern format (starting in Unicode 3.1.0) and
60 # it is available. This avoids being affected by potential bugs
61 # introduced by other layers of Perl
62 my $file="lib/unicore/CaseFolding.txt";
63
64 if (ord('A') == 65
65 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
66 && open my $fh, "<", $file)
67 {
68 @folds = <$fh>;
69 }
70 else {
71 my ($invlist_ref, $invmap_ref, undef, $default)
dbe1ba6b 72 = Unicode::UCD::prop_invmap('Case_Folding');
4ef0bd69
KW
73 for my $i (0 .. @$invlist_ref - 1 - 1) {
74 next if $invmap_ref->[$i] == $default;
75 my $adjust = -1;
76 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
77 $adjust++;
78
79 # Single-code point maps go to a 'C' type
80 if (! ref $invmap_ref->[$i]) {
81 push @folds, sprintf("%04X; C; %04X\n",
82 $j,
83 $invmap_ref->[$i] + $adjust);
84 }
85 else { # Multi-code point maps go to 'F'. prop_invmap()
86 # guarantees that no adjustment is needed for these,
87 # as the range will contain just one element
88 push @folds, sprintf("%04X; F; %s\n",
89 $j,
90 join " ", map { sprintf "%04X", $_ }
91 @{$invmap_ref->[$i]});
92 }
dbe1ba6b
KW
93 }
94 }
95 }
dbe1ba6b 96
4ef0bd69
KW
97 for (@folds) {
98 chomp;
00c072cf 99
4ef0bd69
KW
100 # Lines look like (without the initial '#'
101 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
102 # Get rid of comments, ignore blank or comment-only lines
103 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
104 next unless length $line;
105 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
00c072cf 106
4ef0bd69 107 my $from = hex $hex_from;
00c072cf 108
4ef0bd69
KW
109 # Perl only deals with C and F folds
110 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf 111
4ef0bd69
KW
112 # Get each code point in the range that participates in this line's fold.
113 # The hash has keys of each code point in the range, and values of what it
114 # folds to and what folds to it
115 for my $i (0 .. @folded - 1) {
116 my $hex_fold = $folded[$i];
117 my $fold = hex $hex_fold;
118 push @{$folded_closure{$fold}}, $from if $fold < 256;
119 push @{$folded_closure{$from}}, $fold if $from < 256;
62841d05 120
07725c18
KW
121 if ($i < @folded-1
122 && $fold < 256
123 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
124 {
125 push @hex_non_final_folds, $hex_fold;
126
127 # Also add the upper case, which in the latin1 range folds to
128 # $fold
129 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
130 }
4ef0bd69 131 }
00c072cf 132 }
00c072cf 133
4ef0bd69
KW
134 # Now having read all the lines, combine them into the full closure of each
135 # code point in the range by adding lists together that share a common
136 # element
137 foreach my $folded (keys %folded_closure) {
138 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
139 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
140 }
00c072cf
KW
141 }
142}
143
62841d05
KW
144sub Is_Non_Latin1_Fold {
145 my @return;
146
147 foreach my $folded (keys %folded_closure) {
148 push @return, sprintf("%X", $folded), if grep { $_ > 255 }
149 @{$folded_closure{$folded}};
150 }
151 return join("\n", @return) . "\n";
00c072cf
KW
152}
153
62841d05
KW
154sub Is_Non_Final_Fold {
155 return join("\n", @hex_non_final_folds) . "\n";
156}
157
158my @bits; # Bit map for each code point
159
b1909af7 160# For each character, calculate which properties it matches.
9c68f0ab
KW
161for my $ord (0..255) {
162 my $char = chr($ord);
163 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
164
165 # Look at all the properties we care about here.
9c68f0ab
KW
166 for my $property (@properties) {
167 my $name = $property;
168
b1909af7
KW
169 # Remove the suffix to get the actual property name.
170 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
171 # If is a latin1 version, no further checking is needed.
172 if (! ($name =~ s/_L1$//)) {
173
b1909af7 174 # Here, isn't an _L1. If its _A, it's automatically false for
a0947d7b
KW
175 # non-ascii. The only current ones (besides ASCII) without a
176 # suffix are valid over the whole range.
9c68f0ab
KW
177 next if $name =~ s/_A$// && $ord >= 128;
178
179 }
180 my $re;
181 if ($name eq 'PUNCT') {;
182
183 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
184 # just \pP outside it.
185 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
186 } elsif ($name eq 'CHARNAME_CONT') {;
699ffc5e 187 $re = qr/\p{_Perl_Charname_Continue}/,
9c68f0ab 188 } elsif ($name eq 'SPACE') {;
c6e8e4a9 189 $re = qr/\p{XPerlSpace}/;
9c68f0ab
KW
190 } elsif ($name eq 'IDFIRST') {
191 $re = qr/[_\p{Alpha}]/;
192 } elsif ($name eq 'PSXSPC') {
193 $re = qr/[\v\p{Space}]/;
194 } elsif ($name eq 'WORDCHAR') {
c6e8e4a9 195 $re = qr/\p{XPosixWord}/;
15861f94 196 } elsif ($name eq 'ALPHANUMERIC') {
9c68f0ab 197 # Like \w, but no underscore
aedd44b5 198 $re = qr/\p{Alnum}/;
9a022f3a
KW
199 } elsif ($name eq 'QUOTEMETA') {
200 $re = qr/\p{_Perl_Quotemeta}/;
62841d05
KW
201 } elsif ($name eq 'NONLATIN1_FOLD') {
202 $re = qr/\p{Is_Non_Latin1_Fold}/;
b72a36d4 203 } elsif ($name eq 'NON_FINAL_FOLD') {
62841d05 204 $re = qr/\p{Is_Non_Final_Fold}/;
f7993745
KW
205 } elsif ($name eq 'IS_IN_SOME_FOLD') {
206 $re = qr/\p{_Perl_Any_Folds}/;
9c68f0ab
KW
207 } else { # The remainder have the same name and values as Unicode
208 $re = eval "qr/\\p{$name}/";
209 use Carp;
210 carp $@ if ! defined $re;
211 }
212 #print "$ord, $name $property, $re\n";
07725c18 213 if ($char =~ $re) { # Add this property if matches
9c68f0ab 214 $bits[$ord] .= '|' if $bits[$ord];
265c1f46 215 $bits[$ord] .= "(1U<<_CC_$property)";
9c68f0ab
KW
216 }
217 }
218 #print __LINE__, " $ord $char $bits[$ord]\n";
219}
220
221# Names of C0 controls
222my @C0 = qw (
223 NUL
224 SOH
225 STX
226 ETX
227 EOT
228 ENQ
229 ACK
230 BEL
231 BS
232 HT
233 LF
234 VT
235 FF
236 CR
237 SO
238 SI
239 DLE
240 DC1
241 DC2
242 DC3
243 DC4
244 NAK
245 SYN
246 ETB
247 CAN
248 EOM
249 SUB
250 ESC
251 FS
252 GS
253 RS
254 US
255 );
256
257# Names of C1 controls, plus the adjacent DEL
258my @C1 = qw(
259 DEL
260 PAD
261 HOP
262 BPH
263 NBH
264 IND
265 NEL
266 SSA
267 ESA
268 HTS
269 HTJ
270 VTS
271 PLD
272 PLU
273 RI
274 SS2
275 SS3
276 DCS
277 PU1
278 PU2
279 STS
280 CCH
281 MW
282 SPA
283 EPA
284 SOS
285 SGC
286 SCI
287 CSI
288 ST
289 OSC
290 PM
291 APC
292 );
293
cc49830d 294my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7 295 {style => '*', by => $0,
dbe1ba6b 296 from => "property definitions"});
cfb8fd6a 297
9c68f0ab
KW
298# Output the table using fairly short names for each char.
299for my $ord (0..255) {
300 my $name;
301 if ($ord < 32) { # A C0 control
302 $name = $C0[$ord];
303 } elsif ($ord > 32 && $ord < 127) { # Graphic
304 $name = "'" . chr($ord) . "'";
305 } elsif ($ord >= 127 && $ord <= 0x9f) {
306 $name = $C1[$ord - 127]; # A C1 control + DEL
307 } else { # SPACE, or, if Latin1, shorten the name */
308 use charnames();
309 $name = charnames::viacode($ord);
310 $name =~ s/LATIN CAPITAL LETTER //
311 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
312 }
cfb8fd6a 313 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
314}
315
cfb8fd6a 316read_only_bottom_close_and_rename($out_fh)