This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: White-space only
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
25 ALNUMC_A
26 ALNUMC_L1
27 ALPHA_A
28 ALPHA_L1
29 BLANK_A
30 BLANK_L1
31 CHARNAME_CONT
32 CNTRL_A
33 CNTRL_L1
34 DIGIT_A
35 GRAPH_A
36 GRAPH_L1
37 IDFIRST_A
38 IDFIRST_L1
39 LOWER_A
40 LOWER_L1
41 OCTAL_A
42 PRINT_A
43 PRINT_L1
44 PSXSPC_A
45 PSXSPC_L1
46 PUNCT_A
47 PUNCT_L1
48 SPACE_A
49 SPACE_L1
50 UPPER_A
51 UPPER_L1
52 WORDCHAR_A
53 WORDCHAR_L1
54 XDIGIT_A
55);
56
00c072cf
KW
57# Read in the case fold mappings.
58my %folded_closure;
cfb8fd6a 59my $file="lib/unicore/CaseFolding.txt";
00c072cf
KW
60open my $fh, "<", $file or die "Failed to read '$file': $!";
61while (<$fh>) {
62 chomp;
63
64 # Lines look like (without the initial '#'
65 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9
KW
66 # Get rid of comments, ignore blank or comment-only lines
67 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
68 next unless length $line;
00c072cf
KW
69 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
70
71 my $from = hex $hex_from;
72
73 # Perl only deals with C and F folds
33e4950c 74 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf
KW
75
76 # Get each code point in the range that participates in this line's fold.
77 # The hash has keys of each code point in the range, and values of what it
78 # folds to and what folds to it
79 foreach my $hex_fold (@folded) {
80 my $fold = hex $hex_fold;
81 push @{$folded_closure{$fold}}, $from if $fold < 256;
82 push @{$folded_closure{$from}}, $fold if $from < 256;
83 }
84}
85
86# Now having read all the lines, combine them into the full closure of each
87# code point in the range by adding lists together that share a common element
88foreach my $folded (keys %folded_closure) {
89 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
90 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
91 }
92}
93
9c68f0ab
KW
94my @bits; # Bit map for each code point
95
00c072cf
KW
96foreach my $folded (keys %folded_closure) {
97 $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
98 @{$folded_closure{$folded}};
99}
100
b1909af7 101# For each character, calculate which properties it matches.
9c68f0ab
KW
102for my $ord (0..255) {
103 my $char = chr($ord);
104 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
105
106 # Look at all the properties we care about here.
9c68f0ab
KW
107 for my $property (@properties) {
108 my $name = $property;
109
b1909af7
KW
110 # Remove the suffix to get the actual property name.
111 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
112 # If is a latin1 version, no further checking is needed.
113 if (! ($name =~ s/_L1$//)) {
114
b1909af7
KW
115 # Here, isn't an _L1. If its _A, it's automatically false for
116 # non-ascii. The only one current one without a suffix is valid
117 # over the whole range.
9c68f0ab
KW
118 next if $name =~ s/_A$// && $ord >= 128;
119
120 }
121 my $re;
122 if ($name eq 'PUNCT') {;
123
124 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
125 # just \pP outside it.
126 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
127 } elsif ($name eq 'CHARNAME_CONT') {;
128 $re = qr/[-\w ():\xa0]/;
129 } elsif ($name eq 'SPACE') {;
130 $re = qr/\s/;
131 } elsif ($name eq 'IDFIRST') {
132 $re = qr/[_\p{Alpha}]/;
133 } elsif ($name eq 'PSXSPC') {
134 $re = qr/[\v\p{Space}]/;
135 } elsif ($name eq 'WORDCHAR') {
136 $re = qr/\w/;
137 } elsif ($name eq 'ALNUMC') {
138 # Like \w, but no underscore
aedd44b5 139 $re = qr/\p{Alnum}/;
9c68f0ab
KW
140 } elsif ($name eq 'OCTAL') {
141 $re = qr/[0-7]/;
142 } else { # The remainder have the same name and values as Unicode
143 $re = eval "qr/\\p{$name}/";
144 use Carp;
145 carp $@ if ! defined $re;
146 }
147 #print "$ord, $name $property, $re\n";
148 if ($char =~ $re) { # Add this property if matches
149 $bits[$ord] .= '|' if $bits[$ord];
150 $bits[$ord] .= "_CC_$property";
151 }
152 }
153 #print __LINE__, " $ord $char $bits[$ord]\n";
154}
155
156# Names of C0 controls
157my @C0 = qw (
158 NUL
159 SOH
160 STX
161 ETX
162 EOT
163 ENQ
164 ACK
165 BEL
166 BS
167 HT
168 LF
169 VT
170 FF
171 CR
172 SO
173 SI
174 DLE
175 DC1
176 DC2
177 DC3
178 DC4
179 NAK
180 SYN
181 ETB
182 CAN
183 EOM
184 SUB
185 ESC
186 FS
187 GS
188 RS
189 US
190 );
191
192# Names of C1 controls, plus the adjacent DEL
193my @C1 = qw(
194 DEL
195 PAD
196 HOP
197 BPH
198 NBH
199 IND
200 NEL
201 SSA
202 ESA
203 HTS
204 HTJ
205 VTS
206 PLD
207 PLU
208 RI
209 SS2
210 SS3
211 DCS
212 PU1
213 PU2
214 STS
215 CCH
216 MW
217 SPA
218 EPA
219 SOS
220 SGC
221 SCI
222 CSI
223 ST
224 OSC
225 PM
226 APC
227 );
228
cc49830d 229my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7
KW
230 {style => '*', by => $0,
231 from => "property definitions and $file"});
cfb8fd6a 232
9c68f0ab
KW
233# Output the table using fairly short names for each char.
234for my $ord (0..255) {
235 my $name;
236 if ($ord < 32) { # A C0 control
237 $name = $C0[$ord];
238 } elsif ($ord > 32 && $ord < 127) { # Graphic
239 $name = "'" . chr($ord) . "'";
240 } elsif ($ord >= 127 && $ord <= 0x9f) {
241 $name = $C1[$ord - 127]; # A C1 control + DEL
242 } else { # SPACE, or, if Latin1, shorten the name */
243 use charnames();
244 $name = charnames::viacode($ord);
245 $name =~ s/LATIN CAPITAL LETTER //
246 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
247 }
cfb8fd6a 248 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
249}
250
cfb8fd6a 251read_only_bottom_close_and_rename($out_fh)