This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix compiler warning
[perl5.git] / Porting / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
00c072cf 5use Config;
9c68f0ab
KW
6
7# This program outputs the 256 lines that form the guts of the PL_charclass
8# table. The output should be used to manually replace the table contents in
8d4ab2a1
KW
9# l1_charclass_tab.h. Each line is a bit map of properties that the Unicode
10# code point at the corresponding position in the table array has. The first
11# line corresponds to code point U+0000, NULL, the last line to U=00FF. For
12# an application to see if the code point "i" has a particular property, it
13# just does
9c68f0ab
KW
14# 'PL_charclass[i] & BIT'
15# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
16# character class, and 'property' is the corresponding property, and 'suffix'
17# is one of '_A' to mean the property is true only if the corresponding code
18# point is ASCII, and '_L1' means that the range includes any Latin1
19# character (ISO-8859-1 including the C0 and C1 controls). A property without
20# these suffixes does not have different forms for both ranges.
21
22# The data in the table is pretty well set in stone, so that this program need
23# be run only when adding new properties to it.
24
25my @properties = qw(
26 ALNUMC_A
27 ALNUMC_L1
28 ALPHA_A
29 ALPHA_L1
30 BLANK_A
31 BLANK_L1
32 CHARNAME_CONT
33 CNTRL_A
34 CNTRL_L1
35 DIGIT_A
36 GRAPH_A
37 GRAPH_L1
38 IDFIRST_A
39 IDFIRST_L1
40 LOWER_A
41 LOWER_L1
42 OCTAL_A
43 PRINT_A
44 PRINT_L1
45 PSXSPC_A
46 PSXSPC_L1
47 PUNCT_A
48 PUNCT_L1
49 SPACE_A
50 SPACE_L1
51 UPPER_A
52 UPPER_L1
53 WORDCHAR_A
54 WORDCHAR_L1
55 XDIGIT_A
56);
57
00c072cf
KW
58# Read in the case fold mappings.
59my %folded_closure;
60my $file="$Config{privlib}/unicore/CaseFolding.txt";
61open my $fh, "<", $file or die "Failed to read '$file': $!";
62while (<$fh>) {
63 chomp;
64
65 # Lines look like (without the initial '#'
66 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
67 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
68 next if $line eq "" || substr($line, 0, 1) eq '#';
69 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
70
71 my $from = hex $hex_from;
72
73 # Perl only deals with C and F folds
74 next if $fold_type ne 'C' and $fold_type ne 'F';
75 next if $fold_type ne 'C'; # And for now, just single-char folds. XXX
76
77 # Get each code point in the range that participates in this line's fold.
78 # The hash has keys of each code point in the range, and values of what it
79 # folds to and what folds to it
80 foreach my $hex_fold (@folded) {
81 my $fold = hex $hex_fold;
82 push @{$folded_closure{$fold}}, $from if $fold < 256;
83 push @{$folded_closure{$from}}, $fold if $from < 256;
84 }
85}
86
87# Now having read all the lines, combine them into the full closure of each
88# code point in the range by adding lists together that share a common element
89foreach my $folded (keys %folded_closure) {
90 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
91 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
92 }
93}
94
9c68f0ab
KW
95my @bits; # Bit map for each code point
96
00c072cf
KW
97foreach my $folded (keys %folded_closure) {
98 $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
99 @{$folded_closure{$folded}};
100}
101
9c68f0ab
KW
102for my $ord (0..255) {
103 my $char = chr($ord);
104 utf8::upgrade($char); # Important to use Unicode semantics!
105 for my $property (@properties) {
106 my $name = $property;
107
108 # The property name that corresponds to this doesn't have a suffix.
109 # If is a latin1 version, no further checking is needed.
110 if (! ($name =~ s/_L1$//)) {
111
112 # Here, isn't an L1. It's either a special one or the suffix ends
113 # in _A. In the latter case, it's automatically false for
114 # non-ascii. The one current special is valid over the whole range.
115 next if $name =~ s/_A$// && $ord >= 128;
116
117 }
118 my $re;
119 if ($name eq 'PUNCT') {;
120
121 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
122 # just \pP outside it.
123 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
124 } elsif ($name eq 'CHARNAME_CONT') {;
125 $re = qr/[-\w ():\xa0]/;
126 } elsif ($name eq 'SPACE') {;
127 $re = qr/\s/;
128 } elsif ($name eq 'IDFIRST') {
129 $re = qr/[_\p{Alpha}]/;
130 } elsif ($name eq 'PSXSPC') {
131 $re = qr/[\v\p{Space}]/;
132 } elsif ($name eq 'WORDCHAR') {
133 $re = qr/\w/;
134 } elsif ($name eq 'ALNUMC') {
135 # Like \w, but no underscore
aedd44b5 136 $re = qr/\p{Alnum}/;
9c68f0ab
KW
137 } elsif ($name eq 'OCTAL') {
138 $re = qr/[0-7]/;
139 } else { # The remainder have the same name and values as Unicode
140 $re = eval "qr/\\p{$name}/";
141 use Carp;
142 carp $@ if ! defined $re;
143 }
144 #print "$ord, $name $property, $re\n";
145 if ($char =~ $re) { # Add this property if matches
146 $bits[$ord] .= '|' if $bits[$ord];
147 $bits[$ord] .= "_CC_$property";
148 }
149 }
150 #print __LINE__, " $ord $char $bits[$ord]\n";
151}
152
153# Names of C0 controls
154my @C0 = qw (
155 NUL
156 SOH
157 STX
158 ETX
159 EOT
160 ENQ
161 ACK
162 BEL
163 BS
164 HT
165 LF
166 VT
167 FF
168 CR
169 SO
170 SI
171 DLE
172 DC1
173 DC2
174 DC3
175 DC4
176 NAK
177 SYN
178 ETB
179 CAN
180 EOM
181 SUB
182 ESC
183 FS
184 GS
185 RS
186 US
187 );
188
189# Names of C1 controls, plus the adjacent DEL
190my @C1 = qw(
191 DEL
192 PAD
193 HOP
194 BPH
195 NBH
196 IND
197 NEL
198 SSA
199 ESA
200 HTS
201 HTJ
202 VTS
203 PLD
204 PLU
205 RI
206 SS2
207 SS3
208 DCS
209 PU1
210 PU2
211 STS
212 CCH
213 MW
214 SPA
215 EPA
216 SOS
217 SGC
218 SCI
219 CSI
220 ST
221 OSC
222 PM
223 APC
224 );
225
226# Output the table using fairly short names for each char.
227for my $ord (0..255) {
228 my $name;
229 if ($ord < 32) { # A C0 control
230 $name = $C0[$ord];
231 } elsif ($ord > 32 && $ord < 127) { # Graphic
232 $name = "'" . chr($ord) . "'";
233 } elsif ($ord >= 127 && $ord <= 0x9f) {
234 $name = $C1[$ord - 127]; # A C1 control + DEL
235 } else { # SPACE, or, if Latin1, shorten the name */
236 use charnames();
237 $name = charnames::viacode($ord);
238 $name =~ s/LATIN CAPITAL LETTER //
239 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
240 }
241 printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
242}
243