This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
l1_char_class_tab.h: Remove multi-char fold targets
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab
KW
6
7# This program outputs the 256 lines that form the guts of the PL_charclass
8# table. The output should be used to manually replace the table contents in
8d4ab2a1
KW
9# l1_charclass_tab.h. Each line is a bit map of properties that the Unicode
10# code point at the corresponding position in the table array has. The first
11# line corresponds to code point U+0000, NULL, the last line to U=00FF. For
12# an application to see if the code point "i" has a particular property, it
13# just does
9c68f0ab
KW
14# 'PL_charclass[i] & BIT'
15# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
16# character class, and 'property' is the corresponding property, and 'suffix'
17# is one of '_A' to mean the property is true only if the corresponding code
18# point is ASCII, and '_L1' means that the range includes any Latin1
19# character (ISO-8859-1 including the C0 and C1 controls). A property without
20# these suffixes does not have different forms for both ranges.
21
22# The data in the table is pretty well set in stone, so that this program need
23# be run only when adding new properties to it.
24
25my @properties = qw(
26 ALNUMC_A
27 ALNUMC_L1
28 ALPHA_A
29 ALPHA_L1
30 BLANK_A
31 BLANK_L1
32 CHARNAME_CONT
33 CNTRL_A
34 CNTRL_L1
35 DIGIT_A
36 GRAPH_A
37 GRAPH_L1
38 IDFIRST_A
39 IDFIRST_L1
40 LOWER_A
41 LOWER_L1
42 OCTAL_A
43 PRINT_A
44 PRINT_L1
45 PSXSPC_A
46 PSXSPC_L1
47 PUNCT_A
48 PUNCT_L1
49 SPACE_A
50 SPACE_L1
51 UPPER_A
52 UPPER_L1
53 WORDCHAR_A
54 WORDCHAR_L1
55 XDIGIT_A
56);
57
00c072cf
KW
58# Read in the case fold mappings.
59my %folded_closure;
cfb8fd6a 60my $file="lib/unicore/CaseFolding.txt";
00c072cf
KW
61open my $fh, "<", $file or die "Failed to read '$file': $!";
62while (<$fh>) {
63 chomp;
64
65 # Lines look like (without the initial '#'
66 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
67 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
68 next if $line eq "" || substr($line, 0, 1) eq '#';
69 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
70
71 my $from = hex $hex_from;
72
73 # Perl only deals with C and F folds
88c8c961 74 next if $fold_type ne 'C'; # Not currently used: && $fold_type ne 'F';
00c072cf
KW
75
76 # Get each code point in the range that participates in this line's fold.
77 # The hash has keys of each code point in the range, and values of what it
78 # folds to and what folds to it
79 foreach my $hex_fold (@folded) {
80 my $fold = hex $hex_fold;
81 push @{$folded_closure{$fold}}, $from if $fold < 256;
82 push @{$folded_closure{$from}}, $fold if $from < 256;
83 }
84}
85
86# Now having read all the lines, combine them into the full closure of each
87# code point in the range by adding lists together that share a common element
88foreach my $folded (keys %folded_closure) {
89 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
90 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
91 }
92}
93
9c68f0ab
KW
94my @bits; # Bit map for each code point
95
00c072cf
KW
96foreach my $folded (keys %folded_closure) {
97 $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
98 @{$folded_closure{$folded}};
99}
100
9c68f0ab
KW
101for my $ord (0..255) {
102 my $char = chr($ord);
103 utf8::upgrade($char); # Important to use Unicode semantics!
104 for my $property (@properties) {
105 my $name = $property;
106
107 # The property name that corresponds to this doesn't have a suffix.
108 # If is a latin1 version, no further checking is needed.
109 if (! ($name =~ s/_L1$//)) {
110
111 # Here, isn't an L1. It's either a special one or the suffix ends
112 # in _A. In the latter case, it's automatically false for
113 # non-ascii. The one current special is valid over the whole range.
114 next if $name =~ s/_A$// && $ord >= 128;
115
116 }
117 my $re;
118 if ($name eq 'PUNCT') {;
119
120 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
121 # just \pP outside it.
122 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
123 } elsif ($name eq 'CHARNAME_CONT') {;
124 $re = qr/[-\w ():\xa0]/;
125 } elsif ($name eq 'SPACE') {;
126 $re = qr/\s/;
127 } elsif ($name eq 'IDFIRST') {
128 $re = qr/[_\p{Alpha}]/;
129 } elsif ($name eq 'PSXSPC') {
130 $re = qr/[\v\p{Space}]/;
131 } elsif ($name eq 'WORDCHAR') {
132 $re = qr/\w/;
133 } elsif ($name eq 'ALNUMC') {
134 # Like \w, but no underscore
aedd44b5 135 $re = qr/\p{Alnum}/;
9c68f0ab
KW
136 } elsif ($name eq 'OCTAL') {
137 $re = qr/[0-7]/;
138 } else { # The remainder have the same name and values as Unicode
139 $re = eval "qr/\\p{$name}/";
140 use Carp;
141 carp $@ if ! defined $re;
142 }
143 #print "$ord, $name $property, $re\n";
144 if ($char =~ $re) { # Add this property if matches
145 $bits[$ord] .= '|' if $bits[$ord];
146 $bits[$ord] .= "_CC_$property";
147 }
148 }
149 #print __LINE__, " $ord $char $bits[$ord]\n";
150}
151
152# Names of C0 controls
153my @C0 = qw (
154 NUL
155 SOH
156 STX
157 ETX
158 EOT
159 ENQ
160 ACK
161 BEL
162 BS
163 HT
164 LF
165 VT
166 FF
167 CR
168 SO
169 SI
170 DLE
171 DC1
172 DC2
173 DC3
174 DC4
175 NAK
176 SYN
177 ETB
178 CAN
179 EOM
180 SUB
181 ESC
182 FS
183 GS
184 RS
185 US
186 );
187
188# Names of C1 controls, plus the adjacent DEL
189my @C1 = qw(
190 DEL
191 PAD
192 HOP
193 BPH
194 NBH
195 IND
196 NEL
197 SSA
198 ESA
199 HTS
200 HTJ
201 VTS
202 PLD
203 PLU
204 RI
205 SS2
206 SS3
207 DCS
208 PU1
209 PU2
210 STS
211 CCH
212 MW
213 SPA
214 EPA
215 SOS
216 SGC
217 SCI
218 CSI
219 ST
220 OSC
221 PM
222 APC
223 );
224
cfb8fd6a
NC
225my $out_fh = safer_open('l1_char_class_tab.h-new', 'l1_char_class_tab.h');
226print $out_fh read_only_top(lang => 'C', style => '*', by => $0, from => $file);
227
9c68f0ab
KW
228# Output the table using fairly short names for each char.
229for my $ord (0..255) {
230 my $name;
231 if ($ord < 32) { # A C0 control
232 $name = $C0[$ord];
233 } elsif ($ord > 32 && $ord < 127) { # Graphic
234 $name = "'" . chr($ord) . "'";
235 } elsif ($ord >= 127 && $ord <= 0x9f) {
236 $name = $C1[$ord - 127]; # A C1 control + DEL
237 } else { # SPACE, or, if Latin1, shorten the name */
238 use charnames();
239 $name = charnames::viacode($ord);
240 $name =~ s/LATIN CAPITAL LETTER //
241 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
242 }
cfb8fd6a 243 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
244}
245
cfb8fd6a 246read_only_bottom_close_and_rename($out_fh)