This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move Pod::Functions from lib/ to ext/
[perl5.git] / regen / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
cfb8fd6a 5require 'regen/regen_lib.pl';
9c68f0ab 6
b1909af7
KW
7# This program outputs l1_charclass_tab.h, which defines the guts of the
8# PL_charclass table. Each line is a bit map of properties that the Unicode
8d4ab2a1 9# code point at the corresponding position in the table array has. The first
b1909af7 10# line corresponds to code point U+0000, NULL, the last line to U+00FF. For
8d4ab2a1
KW
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
b1909af7
KW
21# This program need be run only when adding new properties to it, or upon a
22# new Unicode release, to make sure things haven't been changed by it.
9c68f0ab
KW
23
24my @properties = qw(
25 ALNUMC_A
26 ALNUMC_L1
27 ALPHA_A
28 ALPHA_L1
29 BLANK_A
30 BLANK_L1
31 CHARNAME_CONT
32 CNTRL_A
33 CNTRL_L1
34 DIGIT_A
35 GRAPH_A
36 GRAPH_L1
37 IDFIRST_A
38 IDFIRST_L1
39 LOWER_A
40 LOWER_L1
41 OCTAL_A
42 PRINT_A
43 PRINT_L1
44 PSXSPC_A
45 PSXSPC_L1
46 PUNCT_A
47 PUNCT_L1
48 SPACE_A
49 SPACE_L1
50 UPPER_A
51 UPPER_L1
52 WORDCHAR_A
53 WORDCHAR_L1
54 XDIGIT_A
9a022f3a 55 QUOTEMETA
9c68f0ab
KW
56);
57
00c072cf
KW
58# Read in the case fold mappings.
59my %folded_closure;
cfb8fd6a 60my $file="lib/unicore/CaseFolding.txt";
00c072cf
KW
61open my $fh, "<", $file or die "Failed to read '$file': $!";
62while (<$fh>) {
63 chomp;
64
65 # Lines look like (without the initial '#'
66 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
e3136cf9
KW
67 # Get rid of comments, ignore blank or comment-only lines
68 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
69 next unless length $line;
00c072cf
KW
70 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
71
72 my $from = hex $hex_from;
73
74 # Perl only deals with C and F folds
33e4950c 75 next if $fold_type ne 'C' and $fold_type ne 'F';
00c072cf
KW
76
77 # Get each code point in the range that participates in this line's fold.
78 # The hash has keys of each code point in the range, and values of what it
79 # folds to and what folds to it
80 foreach my $hex_fold (@folded) {
81 my $fold = hex $hex_fold;
82 push @{$folded_closure{$fold}}, $from if $fold < 256;
83 push @{$folded_closure{$from}}, $fold if $from < 256;
84 }
85}
86
87# Now having read all the lines, combine them into the full closure of each
88# code point in the range by adding lists together that share a common element
89foreach my $folded (keys %folded_closure) {
90 foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
91 push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
92 }
93}
94
9c68f0ab
KW
95my @bits; # Bit map for each code point
96
00c072cf
KW
97foreach my $folded (keys %folded_closure) {
98 $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
99 @{$folded_closure{$folded}};
100}
101
b1909af7 102# For each character, calculate which properties it matches.
9c68f0ab
KW
103for my $ord (0..255) {
104 my $char = chr($ord);
105 utf8::upgrade($char); # Important to use Unicode semantics!
b1909af7
KW
106
107 # Look at all the properties we care about here.
9c68f0ab
KW
108 for my $property (@properties) {
109 my $name = $property;
110
b1909af7
KW
111 # Remove the suffix to get the actual property name.
112 # Currently the suffixes are '_L1', '_A', and none.
9c68f0ab
KW
113 # If is a latin1 version, no further checking is needed.
114 if (! ($name =~ s/_L1$//)) {
115
b1909af7
KW
116 # Here, isn't an _L1. If its _A, it's automatically false for
117 # non-ascii. The only one current one without a suffix is valid
118 # over the whole range.
9c68f0ab
KW
119 next if $name =~ s/_A$// && $ord >= 128;
120
121 }
122 my $re;
123 if ($name eq 'PUNCT') {;
124
125 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
126 # just \pP outside it.
127 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
128 } elsif ($name eq 'CHARNAME_CONT') {;
129 $re = qr/[-\w ():\xa0]/;
130 } elsif ($name eq 'SPACE') {;
131 $re = qr/\s/;
132 } elsif ($name eq 'IDFIRST') {
133 $re = qr/[_\p{Alpha}]/;
134 } elsif ($name eq 'PSXSPC') {
135 $re = qr/[\v\p{Space}]/;
136 } elsif ($name eq 'WORDCHAR') {
137 $re = qr/\w/;
138 } elsif ($name eq 'ALNUMC') {
139 # Like \w, but no underscore
aedd44b5 140 $re = qr/\p{Alnum}/;
9c68f0ab
KW
141 } elsif ($name eq 'OCTAL') {
142 $re = qr/[0-7]/;
9a022f3a
KW
143 } elsif ($name eq 'QUOTEMETA') {
144 $re = qr/\p{_Perl_Quotemeta}/;
9c68f0ab
KW
145 } else { # The remainder have the same name and values as Unicode
146 $re = eval "qr/\\p{$name}/";
147 use Carp;
148 carp $@ if ! defined $re;
149 }
150 #print "$ord, $name $property, $re\n";
151 if ($char =~ $re) { # Add this property if matches
152 $bits[$ord] .= '|' if $bits[$ord];
153 $bits[$ord] .= "_CC_$property";
154 }
155 }
156 #print __LINE__, " $ord $char $bits[$ord]\n";
157}
158
159# Names of C0 controls
160my @C0 = qw (
161 NUL
162 SOH
163 STX
164 ETX
165 EOT
166 ENQ
167 ACK
168 BEL
169 BS
170 HT
171 LF
172 VT
173 FF
174 CR
175 SO
176 SI
177 DLE
178 DC1
179 DC2
180 DC3
181 DC4
182 NAK
183 SYN
184 ETB
185 CAN
186 EOM
187 SUB
188 ESC
189 FS
190 GS
191 RS
192 US
193 );
194
195# Names of C1 controls, plus the adjacent DEL
196my @C1 = qw(
197 DEL
198 PAD
199 HOP
200 BPH
201 NBH
202 IND
203 NEL
204 SSA
205 ESA
206 HTS
207 HTJ
208 VTS
209 PLD
210 PLU
211 RI
212 SS2
213 SS3
214 DCS
215 PU1
216 PU2
217 STS
218 CCH
219 MW
220 SPA
221 EPA
222 SOS
223 SGC
224 SCI
225 CSI
226 ST
227 OSC
228 PM
229 APC
230 );
231
cc49830d 232my $out_fh = open_new('l1_char_class_tab.h', '>',
b1909af7
KW
233 {style => '*', by => $0,
234 from => "property definitions and $file"});
cfb8fd6a 235
9c68f0ab
KW
236# Output the table using fairly short names for each char.
237for my $ord (0..255) {
238 my $name;
239 if ($ord < 32) { # A C0 control
240 $name = $C0[$ord];
241 } elsif ($ord > 32 && $ord < 127) { # Graphic
242 $name = "'" . chr($ord) . "'";
243 } elsif ($ord >= 127 && $ord <= 0x9f) {
244 $name = $C1[$ord - 127]; # A C1 control + DEL
245 } else { # SPACE, or, if Latin1, shorten the name */
246 use charnames();
247 $name = charnames::viacode($ord);
248 $name =~ s/LATIN CAPITAL LETTER //
249 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
250 }
cfb8fd6a 251 printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
9c68f0ab
KW
252}
253
cfb8fd6a 254read_only_bottom_close_and_rename($out_fh)