This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: indent code in new block
[perl5.git] / Porting / mk_PL_charclass.pl
CommitLineData
9c68f0ab
KW
1#!perl -w
2use 5.012;
3use strict;
4use warnings;
5
6# This program outputs the 256 lines that form the guts of the PL_charclass
7# table. The output should be used to manually replace the table contents in
8d4ab2a1
KW
8# l1_charclass_tab.h. Each line is a bit map of properties that the Unicode
9# code point at the corresponding position in the table array has. The first
10# line corresponds to code point U+0000, NULL, the last line to U=00FF. For
11# an application to see if the code point "i" has a particular property, it
12# just does
9c68f0ab
KW
13# 'PL_charclass[i] & BIT'
14# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
15# character class, and 'property' is the corresponding property, and 'suffix'
16# is one of '_A' to mean the property is true only if the corresponding code
17# point is ASCII, and '_L1' means that the range includes any Latin1
18# character (ISO-8859-1 including the C0 and C1 controls). A property without
19# these suffixes does not have different forms for both ranges.
20
21# The data in the table is pretty well set in stone, so that this program need
22# be run only when adding new properties to it.
23
24my @properties = qw(
25 ALNUMC_A
26 ALNUMC_L1
27 ALPHA_A
28 ALPHA_L1
29 BLANK_A
30 BLANK_L1
31 CHARNAME_CONT
32 CNTRL_A
33 CNTRL_L1
34 DIGIT_A
35 GRAPH_A
36 GRAPH_L1
37 IDFIRST_A
38 IDFIRST_L1
39 LOWER_A
40 LOWER_L1
41 OCTAL_A
42 PRINT_A
43 PRINT_L1
44 PSXSPC_A
45 PSXSPC_L1
46 PUNCT_A
47 PUNCT_L1
48 SPACE_A
49 SPACE_L1
50 UPPER_A
51 UPPER_L1
52 WORDCHAR_A
53 WORDCHAR_L1
54 XDIGIT_A
55);
56
57my @bits; # Bit map for each code point
58
59for my $ord (0..255) {
60 my $char = chr($ord);
61 utf8::upgrade($char); # Important to use Unicode semantics!
62 for my $property (@properties) {
63 my $name = $property;
64
65 # The property name that corresponds to this doesn't have a suffix.
66 # If is a latin1 version, no further checking is needed.
67 if (! ($name =~ s/_L1$//)) {
68
69 # Here, isn't an L1. It's either a special one or the suffix ends
70 # in _A. In the latter case, it's automatically false for
71 # non-ascii. The one current special is valid over the whole range.
72 next if $name =~ s/_A$// && $ord >= 128;
73
74 }
75 my $re;
76 if ($name eq 'PUNCT') {;
77
78 # Sadly, this is inconsistent: \pP and \pS for the ascii range,
79 # just \pP outside it.
80 $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
81 } elsif ($name eq 'CHARNAME_CONT') {;
82 $re = qr/[-\w ():\xa0]/;
83 } elsif ($name eq 'SPACE') {;
84 $re = qr/\s/;
85 } elsif ($name eq 'IDFIRST') {
86 $re = qr/[_\p{Alpha}]/;
87 } elsif ($name eq 'PSXSPC') {
88 $re = qr/[\v\p{Space}]/;
89 } elsif ($name eq 'WORDCHAR') {
90 $re = qr/\w/;
91 } elsif ($name eq 'ALNUMC') {
92 # Like \w, but no underscore
aedd44b5 93 $re = qr/\p{Alnum}/;
9c68f0ab
KW
94 } elsif ($name eq 'OCTAL') {
95 $re = qr/[0-7]/;
96 } else { # The remainder have the same name and values as Unicode
97 $re = eval "qr/\\p{$name}/";
98 use Carp;
99 carp $@ if ! defined $re;
100 }
101 #print "$ord, $name $property, $re\n";
102 if ($char =~ $re) { # Add this property if matches
103 $bits[$ord] .= '|' if $bits[$ord];
104 $bits[$ord] .= "_CC_$property";
105 }
106 }
107 #print __LINE__, " $ord $char $bits[$ord]\n";
108}
109
110# Names of C0 controls
111my @C0 = qw (
112 NUL
113 SOH
114 STX
115 ETX
116 EOT
117 ENQ
118 ACK
119 BEL
120 BS
121 HT
122 LF
123 VT
124 FF
125 CR
126 SO
127 SI
128 DLE
129 DC1
130 DC2
131 DC3
132 DC4
133 NAK
134 SYN
135 ETB
136 CAN
137 EOM
138 SUB
139 ESC
140 FS
141 GS
142 RS
143 US
144 );
145
146# Names of C1 controls, plus the adjacent DEL
147my @C1 = qw(
148 DEL
149 PAD
150 HOP
151 BPH
152 NBH
153 IND
154 NEL
155 SSA
156 ESA
157 HTS
158 HTJ
159 VTS
160 PLD
161 PLU
162 RI
163 SS2
164 SS3
165 DCS
166 PU1
167 PU2
168 STS
169 CCH
170 MW
171 SPA
172 EPA
173 SOS
174 SGC
175 SCI
176 CSI
177 ST
178 OSC
179 PM
180 APC
181 );
182
183# Output the table using fairly short names for each char.
184for my $ord (0..255) {
185 my $name;
186 if ($ord < 32) { # A C0 control
187 $name = $C0[$ord];
188 } elsif ($ord > 32 && $ord < 127) { # Graphic
189 $name = "'" . chr($ord) . "'";
190 } elsif ($ord >= 127 && $ord <= 0x9f) {
191 $name = $C1[$ord - 127]; # A C1 control + DEL
192 } else { # SPACE, or, if Latin1, shorten the name */
193 use charnames();
194 $name = charnames::viacode($ord);
195 $name =~ s/LATIN CAPITAL LETTER //
196 || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
197 }
198 printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
199}
200