Commit | Line | Data |
---|---|---|
9c68f0ab | 1 | #!perl -w |
b72a36d4 | 2 | use v5.15.8; |
9c68f0ab KW |
3 | use strict; |
4 | use warnings; | |
3d7c117d MB |
5 | require './regen/regen_lib.pl'; |
6 | require './regen/charset_translations.pl'; | |
adff23ea | 7 | use Unicode::UCD 'prop_invlist'; |
9c68f0ab | 8 | |
b1909af7 KW |
9 | # This program outputs l1_charclass_tab.h, which defines the guts of the |
10 | # PL_charclass table. Each line is a bit map of properties that the Unicode | |
8d4ab2a1 | 11 | # code point at the corresponding position in the table array has. The first |
adff23ea | 12 | # line corresponds to code point 0x0, NULL, the last line to 0xFF. For |
8d4ab2a1 KW |
13 | # an application to see if the code point "i" has a particular property, it |
14 | # just does | |
9c68f0ab KW |
15 | # 'PL_charclass[i] & BIT' |
16 | # The bit names are of the form '_CC_property_suffix', where 'CC' stands for | |
17 | # character class, and 'property' is the corresponding property, and 'suffix' | |
18 | # is one of '_A' to mean the property is true only if the corresponding code | |
19 | # point is ASCII, and '_L1' means that the range includes any Latin1 | |
20 | # character (ISO-8859-1 including the C0 and C1 controls). A property without | |
21 | # these suffixes does not have different forms for both ranges. | |
22 | ||
b1909af7 KW |
23 | # This program need be run only when adding new properties to it, or upon a |
24 | # new Unicode release, to make sure things haven't been changed by it. | |
9c68f0ab | 25 | |
adff23ea KW |
26 | # keys are the names of the bits; values are what generates the code points |
27 | # that have the bit set, or 0 if \p{key} is the generator | |
28 | my %bit_names = ( | |
29 | NONLATIN1_SIMPLE_FOLD => \&Non_Latin1_Simple_Folds, | |
30 | NONLATIN1_FOLD => \&Non_Latin1_Folds, | |
31 | ALPHANUMERIC => 'Alnum', # Like \w, but no underscore | |
32 | ALPHA => 'XPosixAlpha', | |
33 | ASCII => 0, | |
34 | BLANK => 0, | |
35 | CASED => 0, | |
36 | CHARNAME_CONT => '_Perl_Charname_Continue', | |
37 | CNTRL => 0, | |
38 | DIGIT => 0, | |
39 | GRAPH => 0, | |
40 | IDFIRST => \&Id_First, | |
41 | LOWER => 'XPosixLower', | |
42 | NON_FINAL_FOLD => \&Non_Final_Folds, | |
43 | PRINT => 0, | |
44 | PUNCT => \&Punct_and_Symbols, | |
45 | QUOTEMETA => '_Perl_Quotemeta', | |
46 | SPACE => 'XPerlSpace', | |
47 | UPPER => 'XPosixUpper', | |
48 | WORDCHAR => 'XPosixWord', | |
49 | XDIGIT => 0, | |
50 | VERTSPACE => 0, | |
51 | IS_IN_SOME_FOLD => '_Perl_Any_Folds', | |
52 | ||
53 | # These are the control characters that there are mnemonics for | |
54 | MNEMONIC_CNTRL => [ ord "\a", ord "\b", ord "\e", ord "\f", | |
55 | ord "\n", ord "\r", ord "\t" ], | |
9c68f0ab KW |
56 | ); |
57 | ||
adff23ea KW |
58 | sub uniques { |
59 | # Returns non-duplicated input values. From "Perl Best Practices: | |
60 | # Encapsulated Cleverness". p. 455 in first edition. | |
61 | ||
62 | my %seen; | |
63 | return grep { ! $seen{$_}++ } @_; | |
64 | } | |
65 | ||
66 | sub expand_invlist { | |
67 | # Return the code points that are in the inversion list given by the | |
68 | # argument | |
69 | ||
70 | my $invlist_ref = shift; | |
71 | my $i; | |
72 | my @full_list; | |
73 | ||
74 | for (my $i = 0; $i < @$invlist_ref; $i += 2) { | |
75 | my $upper = ($i + 1) < @$invlist_ref | |
76 | ? $invlist_ref->[$i+1] - 1 # In range | |
77 | : $Unicode::UCD::MAX_CP; # To infinity. | |
78 | for my $j ($invlist_ref->[$i] .. $upper) { | |
79 | push @full_list, $j; | |
80 | } | |
81 | } | |
82 | ||
83 | return @full_list; | |
84 | } | |
85 | ||
00c072cf KW |
86 | # Read in the case fold mappings. |
87 | my %folded_closure; | |
377a5857 | 88 | my %simple_folded_closure; |
adff23ea | 89 | my @non_final_folds; |
f12c0118 | 90 | my @non_latin1_simple_folds; |
dbe1ba6b KW |
91 | my @folds; |
92 | use Unicode::UCD; | |
93 | ||
d54fd781 KW |
94 | # Use the Unicode data file if we are on an ASCII platform (which its data |
95 | # is for), and it is in the modern format (starting in Unicode 3.1.0) and | |
96 | # it is available. This avoids being affected by potential bugs | |
97 | # introduced by other layers of Perl | |
98 | my $file="lib/unicore/CaseFolding.txt"; | |
99 | ||
100 | if (ord('A') == 65 | |
101 | && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 | |
102 | && open my $fh, "<", $file) | |
103 | { | |
104 | @folds = <$fh>; | |
105 | } | |
106 | else { | |
107 | my ($invlist_ref, $invmap_ref, undef, $default) | |
108 | = Unicode::UCD::prop_invmap('Case_Folding'); | |
109 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
110 | next if $invmap_ref->[$i] == $default; | |
111 | my $adjust = -1; | |
112 | for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { | |
113 | $adjust++; | |
114 | ||
115 | # Single-code point maps go to a 'C' type | |
116 | if (! ref $invmap_ref->[$i]) { | |
117 | push @folds, sprintf("%04X; C; %04X\n", | |
118 | $j, | |
119 | $invmap_ref->[$i] + $adjust); | |
120 | } | |
121 | else { # Multi-code point maps go to 'F'. prop_invmap() | |
122 | # guarantees that no adjustment is needed for these, | |
123 | # as the range will contain just one element | |
124 | push @folds, sprintf("%04X; F; %s\n", | |
125 | $j, | |
126 | join " ", map { sprintf "%04X", $_ } | |
127 | @{$invmap_ref->[$i]}); | |
dbe1ba6b KW |
128 | } |
129 | } | |
130 | } | |
d54fd781 | 131 | } |
dbe1ba6b | 132 | |
d54fd781 KW |
133 | for (@folds) { |
134 | chomp; | |
135 | ||
136 | # Lines look like (without the initial '#' | |
137 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
138 | # Get rid of comments, ignore blank or comment-only lines | |
139 | my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; | |
140 | next unless length $line; | |
141 | my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; | |
142 | ||
143 | my $from = hex $hex_from; | |
144 | ||
145 | # Perl only deals with S, C, and F folds | |
146 | next if $fold_type ne 'C' and $fold_type ne 'F' and $fold_type ne 'S'; | |
147 | ||
148 | # Get each code point in the range that participates in this line's fold. | |
149 | # The hash has keys of each code point in the range, and values of what it | |
150 | # folds to and what folds to it | |
151 | for my $i (0 .. @folded - 1) { | |
152 | my $fold = hex $folded[$i]; | |
153 | if ($fold < 256) { | |
154 | push @{$folded_closure{$fold}}, $from; | |
155 | push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F'; | |
156 | } | |
157 | if ($from < 256) { | |
158 | push @{$folded_closure{$from}}, $fold; | |
159 | push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F'; | |
160 | } | |
62841d05 | 161 | |
d54fd781 KW |
162 | if (($fold_type eq 'C' || $fold_type eq 'S') |
163 | && ($fold < 256 != $from < 256)) | |
164 | { | |
165 | # Fold is simple (hence can't be a non-final fold, so the 'if' | |
166 | # above is mutualy exclusive from the 'if below) and crosses | |
167 | # 255/256 boundary. We keep track of the Latin1 code points | |
168 | # in such folds. | |
169 | push @non_latin1_simple_folds, ($fold < 256) | |
170 | ? $fold | |
171 | : $from; | |
172 | } | |
173 | elsif ($i < @folded-1 | |
174 | && $fold < 256 | |
175 | && ! grep { $_ == $fold } @non_final_folds) | |
176 | { | |
177 | push @non_final_folds, $fold; | |
178 | ||
179 | # Also add the upper case, which in the latin1 range folds to | |
180 | # $fold | |
181 | push @non_final_folds, ord uc chr $fold; | |
4ef0bd69 | 182 | } |
00c072cf | 183 | } |
d54fd781 | 184 | } |
00c072cf | 185 | |
d54fd781 KW |
186 | # Now having read all the lines, combine them into the full closure of each |
187 | # code point in the range by adding lists together that share a common | |
188 | # element | |
189 | foreach my $folded (keys %folded_closure) { | |
190 | foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) { | |
191 | push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; | |
00c072cf | 192 | } |
d54fd781 KW |
193 | } |
194 | foreach my $folded (keys %simple_folded_closure) { | |
195 | foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) { | |
196 | push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}}; | |
377a5857 | 197 | } |
d54fd781 | 198 | } |
f12c0118 | 199 | |
d54fd781 KW |
200 | # We have the single-character folds that cross the 255/256, like KELVIN |
201 | # SIGN => 'k', but we need the closure, so add like 'K' to it | |
202 | foreach my $folded (@non_latin1_simple_folds) { | |
203 | foreach my $fold (@{$simple_folded_closure{$folded}}) { | |
204 | if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { | |
205 | push @non_latin1_simple_folds, $fold; | |
f12c0118 KW |
206 | } |
207 | } | |
d54fd781 | 208 | } |
adff23ea KW |
209 | |
210 | sub Id_First { | |
211 | my @alpha_invlist = prop_invlist("XPosixAlpha"); | |
212 | my @ids = expand_invlist(\@alpha_invlist); | |
213 | push @ids, ord "_"; | |
214 | return sort { $a <=> $b } uniques @ids; | |
00c072cf KW |
215 | } |
216 | ||
adff23ea | 217 | sub Non_Latin1_Folds { |
62841d05 KW |
218 | my @return; |
219 | ||
220 | foreach my $folded (keys %folded_closure) { | |
adff23ea | 221 | push @return, $folded if grep { $_ > 255 } @{$folded_closure{$folded}}; |
62841d05 | 222 | } |
adff23ea | 223 | return @return; |
00c072cf KW |
224 | } |
225 | ||
adff23ea KW |
226 | sub Non_Latin1_Simple_Folds { # Latin1 code points that are folded to by |
227 | # non-Latin1 code points as single character | |
228 | # folds | |
229 | return @non_latin1_simple_folds; | |
f12c0118 KW |
230 | } |
231 | ||
adff23ea KW |
232 | sub Non_Final_Folds { |
233 | return @non_final_folds; | |
62841d05 KW |
234 | } |
235 | ||
adff23ea KW |
236 | sub Punct_and_Symbols { |
237 | # Sadly, this is inconsistent: \pP and \pS for the ascii range; | |
238 | # just \pP outside it. | |
62841d05 | 239 | |
adff23ea KW |
240 | my @punct_invlist = prop_invlist("Punct"); |
241 | my @return = expand_invlist(\@punct_invlist); | |
b1909af7 | 242 | |
adff23ea KW |
243 | my @symbols_invlist = prop_invlist("Symbol"); |
244 | my @symbols = expand_invlist(\@symbols_invlist); | |
245 | foreach my $cp (@symbols) { | |
246 | last if $cp > 0x7f; | |
247 | push @return, $cp; | |
248 | } | |
9c68f0ab | 249 | |
adff23ea KW |
250 | return sort { $a <=> $b } uniques @return; |
251 | } | |
9c68f0ab | 252 | |
adff23ea KW |
253 | my @bits; # Each element is a bit map for a single code point |
254 | ||
255 | # For each bit type, calculate which code points should have it set | |
256 | foreach my $bit_name (sort keys %bit_names) { | |
257 | my @code_points; | |
258 | ||
259 | my $property = $bit_name; # The bit name is the same as its property, | |
260 | # unless overridden | |
261 | $property = $bit_names{$bit_name} if $bit_names{$bit_name}; | |
262 | ||
263 | if (! ref $property) { | |
264 | my @invlist = prop_invlist($property, '_perl_core_internal_ok'); | |
265 | @code_points = expand_invlist(\@invlist); | |
266 | } | |
267 | elsif (ref $property eq 'CODE') { | |
268 | @code_points = &$property; | |
269 | } | |
270 | elsif (ref $property eq 'ARRAY') { | |
271 | @code_points = @{$property}; | |
272 | } | |
273 | ||
274 | foreach my $cp (@code_points) { | |
275 | last if $cp > 0xFF; | |
276 | $bits[$cp] .= '|' if $bits[$cp]; | |
277 | $bits[$cp] .= "(1U<<_CC_$bit_name)"; | |
9c68f0ab | 278 | } |
9c68f0ab KW |
279 | } |
280 | ||
cc49830d | 281 | my $out_fh = open_new('l1_char_class_tab.h', '>', |
b1909af7 | 282 | {style => '*', by => $0, |
adff23ea | 283 | from => "Unicode::UCD"}); |
cfb8fd6a | 284 | |
29ec702e KW |
285 | print $out_fh <<END; |
286 | /* For code points whose position is not the same as Unicode, both are shown | |
287 | * in the comment*/ | |
288 | END | |
289 | ||
9c68f0ab | 290 | # Output the table using fairly short names for each char. |
073c22b3 KW |
291 | my $is_for_ascii = 1; # get_supported_code_pages() returns the ASCII |
292 | # character set first | |
29ec702e | 293 | foreach my $charset (get_supported_code_pages()) { |
c30a0cf2 | 294 | my @a2n = @{get_a2n($charset)}; |
29ec702e | 295 | my @out; |
073c22b3 KW |
296 | my @utf_to_i8; |
297 | ||
298 | if ($is_for_ascii) { | |
299 | $is_for_ascii = 0; | |
300 | } | |
301 | else { # EBCDIC. Calculate mapping from UTF-EBCDIC bytes to I8 | |
302 | my $i8_to_utf_ref = get_I8_2_utf($charset); | |
303 | for my $i (0..255) { | |
304 | $utf_to_i8[$i8_to_utf_ref->[$i]] = $i; | |
305 | } | |
306 | } | |
29ec702e KW |
307 | |
308 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
64d34faf KW |
309 | for my $ord (0..255) { |
310 | my $name; | |
311 | my $char = chr $ord; | |
312 | if ($char =~ /\p{PosixGraph}/) { | |
313 | my $quote = $char eq "'" ? '"' : "'"; | |
314 | $name = $quote . chr($ord) . $quote; | |
620ee5ce | 315 | } |
64d34faf KW |
316 | elsif ($char =~ /\p{XPosixGraph}/) { |
317 | use charnames(); | |
318 | $name = charnames::viacode($ord); | |
319 | $name =~ s/LATIN CAPITAL LETTER // | |
320 | or $name =~ s/LATIN SMALL LETTER (.*)/\L$1/ | |
321 | or $name =~ s/ SIGN\b// | |
322 | or $name =~ s/EXCLAMATION MARK/'!'/ | |
323 | or $name =~ s/QUESTION MARK/'?'/ | |
324 | or $name =~ s/QUOTATION MARK/QUOTE/ | |
325 | or $name =~ s/ INDICATOR//; | |
326 | $name =~ s/\bWITH\b/\L$&/; | |
327 | $name =~ s/\bONE\b/1/; | |
328 | $name =~ s/\b(TWO|HALF)\b/2/; | |
329 | $name =~ s/\bTHREE\b/3/; | |
330 | $name =~ s/\b QUARTER S? \b/4/x; | |
331 | $name =~ s/VULGAR FRACTION (.) (.)/$1\/$2/; | |
332 | $name =~ s/\bTILDE\b/'~'/i | |
333 | or $name =~ s/\bCIRCUMFLEX\b/'^'/i | |
334 | or $name =~ s/\bSTROKE\b/'\/'/i | |
335 | or $name =~ s/ ABOVE\b//i; | |
620ee5ce KW |
336 | } |
337 | else { | |
64d34faf | 338 | use Unicode::UCD qw(prop_invmap); |
ce8d64d9 KW |
339 | my ($list_ref, $map_ref, $format) |
340 | = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); | |
64d34faf KW |
341 | if ($format !~ /^s/) { |
342 | use Carp; | |
ce8d64d9 | 343 | carp "Unexpected format '$format' for '_Perl_Name_Alias"; |
64d34faf KW |
344 | last; |
345 | } | |
346 | my $which = Unicode::UCD::search_invlist($list_ref, $ord); | |
347 | if (! defined $which) { | |
348 | use Carp; | |
349 | carp "No name found for code pont $ord"; | |
620ee5ce KW |
350 | } |
351 | else { | |
64d34faf KW |
352 | my $map = $map_ref->[$which]; |
353 | if (! ref $map) { | |
354 | $name = $map; | |
355 | } | |
356 | else { | |
357 | # Just pick the first abbreviation if more than one | |
358 | my @names = grep { $_ =~ /abbreviation/ } @$map; | |
359 | $name = $names[0]; | |
360 | } | |
361 | $name =~ s/:.*//; | |
620ee5ce | 362 | } |
620ee5ce | 363 | } |
c93ea49e | 364 | |
64d34faf | 365 | my $index = $a2n[$ord]; |
073c22b3 KW |
366 | my $i8; |
367 | $i8 = $utf_to_i8[$index] if @utf_to_i8; | |
368 | ||
c93ea49e KW |
369 | $out[$index] = "/* "; |
370 | $out[$index] .= sprintf "0x%02X ", $index if $ord != $index; | |
371 | $out[$index] .= sprintf "U+%02X ", $ord; | |
0f191379 | 372 | $out[$index] .= sprintf "I8=%02X ", $i8 if defined $i8 && $i8 != $ord; |
c93ea49e KW |
373 | $out[$index] .= "$name */ "; |
374 | $out[$index] .= $bits[$ord]; | |
073c22b3 KW |
375 | |
376 | # For EBCDIC character sets, we also add some data for when the bytes | |
377 | # are in UTF-EBCDIC; these are based on the fundamental | |
378 | # characteristics of UTF-EBCDIC. | |
379 | if (@utf_to_i8) { | |
5d5376e2 KW |
380 | if ($i8 >= 0xF1) { |
381 | $out[$index] .= | |
382 | '|(1U<<_CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)'; | |
383 | } | |
073c22b3 KW |
384 | } |
385 | ||
c93ea49e | 386 | $out[$index] .= ",\n"; |
9c68f0ab | 387 | } |
e8368231 KW |
388 | $out[-1] =~ s/,$//; # No trailing comma in the final entry |
389 | ||
29ec702e KW |
390 | print $out_fh join "", @out; |
391 | print $out_fh "\n" . get_conditional_compile_line_end(); | |
9c68f0ab KW |
392 | } |
393 | ||
cfb8fd6a | 394 | read_only_bottom_close_and_rename($out_fh) |