| 1 | #!perl -w |
| 2 | use v5.15.8; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | require 'regen/regen_lib.pl'; |
| 6 | require 'regen/charset_translations.pl'; |
| 7 | |
| 8 | # This program outputs l1_charclass_tab.h, which defines the guts of the |
| 9 | # PL_charclass table. Each line is a bit map of properties that the Unicode |
| 10 | # code point at the corresponding position in the table array has. The first |
| 11 | # line corresponds to code point U+0000, NULL, the last line to U+00FF. For |
| 12 | # an application to see if the code point "i" has a particular property, it |
| 13 | # just does |
| 14 | # 'PL_charclass[i] & BIT' |
| 15 | # The bit names are of the form '_CC_property_suffix', where 'CC' stands for |
| 16 | # character class, and 'property' is the corresponding property, and 'suffix' |
| 17 | # is one of '_A' to mean the property is true only if the corresponding code |
| 18 | # point is ASCII, and '_L1' means that the range includes any Latin1 |
| 19 | # character (ISO-8859-1 including the C0 and C1 controls). A property without |
| 20 | # these suffixes does not have different forms for both ranges. |
| 21 | |
| 22 | # This program need be run only when adding new properties to it, or upon a |
| 23 | # new Unicode release, to make sure things haven't been changed by it. |
| 24 | |
| 25 | my @properties = qw( |
| 26 | NONLATIN1_SIMPLE_FOLD |
| 27 | NONLATIN1_FOLD |
| 28 | ALPHANUMERIC |
| 29 | ALPHA |
| 30 | ASCII |
| 31 | BLANK |
| 32 | CASED |
| 33 | CHARNAME_CONT |
| 34 | CNTRL |
| 35 | DIGIT |
| 36 | GRAPH |
| 37 | IDFIRST |
| 38 | LOWER |
| 39 | NON_FINAL_FOLD |
| 40 | PRINT |
| 41 | PUNCT |
| 42 | QUOTEMETA |
| 43 | SPACE |
| 44 | UPPER |
| 45 | WORDCHAR |
| 46 | XDIGIT |
| 47 | VERTSPACE |
| 48 | IS_IN_SOME_FOLD |
| 49 | MNEMONIC_CNTRL |
| 50 | ); |
| 51 | |
| 52 | # Read in the case fold mappings. |
| 53 | my %folded_closure; |
| 54 | my %simple_folded_closure; |
| 55 | my @hex_non_final_folds; |
| 56 | my @non_latin1_simple_folds; |
| 57 | my @folds; |
| 58 | use Unicode::UCD; |
| 59 | |
| 60 | BEGIN { # Have to do this at compile time because using user-defined \p{property} |
| 61 | |
| 62 | # Use the Unicode data file if we are on an ASCII platform (which its data |
| 63 | # is for), and it is in the modern format (starting in Unicode 3.1.0) and |
| 64 | # it is available. This avoids being affected by potential bugs |
| 65 | # introduced by other layers of Perl |
| 66 | my $file="lib/unicore/CaseFolding.txt"; |
| 67 | |
| 68 | if (ord('A') == 65 |
| 69 | && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 |
| 70 | && open my $fh, "<", $file) |
| 71 | { |
| 72 | @folds = <$fh>; |
| 73 | } |
| 74 | else { |
| 75 | my ($invlist_ref, $invmap_ref, undef, $default) |
| 76 | = Unicode::UCD::prop_invmap('Case_Folding'); |
| 77 | for my $i (0 .. @$invlist_ref - 1 - 1) { |
| 78 | next if $invmap_ref->[$i] == $default; |
| 79 | my $adjust = -1; |
| 80 | for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { |
| 81 | $adjust++; |
| 82 | |
| 83 | # Single-code point maps go to a 'C' type |
| 84 | if (! ref $invmap_ref->[$i]) { |
| 85 | push @folds, sprintf("%04X; C; %04X\n", |
| 86 | $j, |
| 87 | $invmap_ref->[$i] + $adjust); |
| 88 | } |
| 89 | else { # Multi-code point maps go to 'F'. prop_invmap() |
| 90 | # guarantees that no adjustment is needed for these, |
| 91 | # as the range will contain just one element |
| 92 | push @folds, sprintf("%04X; F; %s\n", |
| 93 | $j, |
| 94 | join " ", map { sprintf "%04X", $_ } |
| 95 | @{$invmap_ref->[$i]}); |
| 96 | } |
| 97 | } |
| 98 | } |
| 99 | } |
| 100 | |
| 101 | for (@folds) { |
| 102 | chomp; |
| 103 | |
| 104 | # Lines look like (without the initial '#' |
| 105 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 106 | # Get rid of comments, ignore blank or comment-only lines |
| 107 | my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; |
| 108 | next unless length $line; |
| 109 | my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; |
| 110 | |
| 111 | my $from = hex $hex_from; |
| 112 | |
| 113 | # Perl only deals with S, C, and F folds |
| 114 | next if $fold_type ne 'C' and $fold_type ne 'F' and $fold_type ne 'S'; |
| 115 | |
| 116 | # Get each code point in the range that participates in this line's fold. |
| 117 | # The hash has keys of each code point in the range, and values of what it |
| 118 | # folds to and what folds to it |
| 119 | for my $i (0 .. @folded - 1) { |
| 120 | my $hex_fold = $folded[$i]; |
| 121 | my $fold = hex $hex_fold; |
| 122 | if ($fold < 256) { |
| 123 | push @{$folded_closure{$fold}}, $from; |
| 124 | push @{$simple_folded_closure{$fold}}, $from if $fold_type ne 'F'; |
| 125 | } |
| 126 | if ($from < 256) { |
| 127 | push @{$folded_closure{$from}}, $fold; |
| 128 | push @{$simple_folded_closure{$from}}, $fold if $fold_type ne 'F'; |
| 129 | } |
| 130 | |
| 131 | if (($fold_type eq 'C' || $fold_type eq 'S') |
| 132 | && ($fold < 256 != $from < 256)) |
| 133 | { |
| 134 | # Fold is simple (hence can't be a non-final fold, so the 'if' |
| 135 | # above is mutualy exclusive from the 'if below) and crosses |
| 136 | # 255/256 boundary. We keep track of the Latin1 code points |
| 137 | # in such folds. |
| 138 | push @non_latin1_simple_folds, ($fold < 256) |
| 139 | ? $fold |
| 140 | : $from; |
| 141 | } |
| 142 | elsif ($i < @folded-1 |
| 143 | && $fold < 256 |
| 144 | && ! grep { $_ eq $hex_fold } @hex_non_final_folds) |
| 145 | { |
| 146 | push @hex_non_final_folds, $hex_fold; |
| 147 | |
| 148 | # Also add the upper case, which in the latin1 range folds to |
| 149 | # $fold |
| 150 | push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold; |
| 151 | } |
| 152 | } |
| 153 | } |
| 154 | |
| 155 | # Now having read all the lines, combine them into the full closure of each |
| 156 | # code point in the range by adding lists together that share a common |
| 157 | # element |
| 158 | foreach my $folded (keys %folded_closure) { |
| 159 | foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) { |
| 160 | push @{$folded_closure{$from}}, @{$folded_closure{$folded}}; |
| 161 | } |
| 162 | } |
| 163 | foreach my $folded (keys %simple_folded_closure) { |
| 164 | foreach my $from (grep { $_ < 256 } @{$simple_folded_closure{$folded}}) { |
| 165 | push @{$simple_folded_closure{$from}}, @{$simple_folded_closure{$folded}}; |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | # We have the single-character folds that cross the 255/256, like KELVIN |
| 170 | # SIGN => 'k', but we need the closure, so add like 'K' to it |
| 171 | foreach my $folded (@non_latin1_simple_folds) { |
| 172 | foreach my $fold (@{$simple_folded_closure{$folded}}) { |
| 173 | if ($fold < 256 && ! grep { $fold == $_ } @non_latin1_simple_folds) { |
| 174 | push @non_latin1_simple_folds, $fold; |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | } |
| 179 | |
| 180 | sub Is_Non_Latin1_Fold { |
| 181 | my @return; |
| 182 | |
| 183 | foreach my $folded (keys %folded_closure) { |
| 184 | push @return, sprintf("%X", $folded), if grep { $_ > 255 } |
| 185 | @{$folded_closure{$folded}}; |
| 186 | } |
| 187 | return join("\n", @return) . "\n"; |
| 188 | } |
| 189 | |
| 190 | sub Is_Non_Latin1_Simple_Fold { # Latin1 code points that are folded to by |
| 191 | # non-Latin1 code points as single character |
| 192 | # folds |
| 193 | return join("\n", map { sprintf "%X", $_ } @non_latin1_simple_folds) . "\n"; |
| 194 | } |
| 195 | |
| 196 | sub Is_Non_Final_Fold { |
| 197 | return join("\n", @hex_non_final_folds) . "\n"; |
| 198 | } |
| 199 | |
| 200 | my @bits; # Bit map for each code point |
| 201 | |
| 202 | # For each character, calculate which properties it matches. |
| 203 | for my $ord (0..255) { |
| 204 | my $char = chr($ord); |
| 205 | utf8::upgrade($char); # Important to use Unicode rules! |
| 206 | |
| 207 | # Look at all the properties we care about here. |
| 208 | for my $property (@properties) { |
| 209 | my $name = $property; |
| 210 | |
| 211 | # Remove the suffix to get the actual property name. |
| 212 | # Currently the suffixes are '_L1', '_A', and none. |
| 213 | # If is a latin1 version, no further checking is needed. |
| 214 | if (! ($name =~ s/_L1$//)) { |
| 215 | |
| 216 | # Here, isn't an _L1. If its _A, it's automatically false for |
| 217 | # non-ascii. The only current ones (besides ASCII) without a |
| 218 | # suffix are valid over the whole range. |
| 219 | next if $name =~ s/_A$// && $char !~ /\p{ASCII}/; |
| 220 | } |
| 221 | my $re; |
| 222 | if ($name eq 'PUNCT') {; |
| 223 | |
| 224 | # Sadly, this is inconsistent: \pP and \pS for the ascii range, |
| 225 | # just \pP outside it. |
| 226 | $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/; |
| 227 | } elsif ($name eq 'CHARNAME_CONT') {; |
| 228 | $re = qr/\p{_Perl_Charname_Continue}/, |
| 229 | } elsif ($name eq 'SPACE') {; |
| 230 | $re = qr/\p{XPerlSpace}/; |
| 231 | } elsif ($name eq 'IDFIRST') { |
| 232 | $re = qr/[_\p{XPosixAlpha}]/; |
| 233 | } elsif ($name eq 'WORDCHAR') { |
| 234 | $re = qr/\p{XPosixWord}/; |
| 235 | } elsif ($name eq 'LOWER') { |
| 236 | $re = qr/\p{XPosixLower}/; |
| 237 | } elsif ($name eq 'UPPER') { |
| 238 | $re = qr/\p{XPosixUpper}/; |
| 239 | } elsif ($name eq 'ALPHANUMERIC') { |
| 240 | # Like \w, but no underscore |
| 241 | $re = qr/\p{Alnum}/; |
| 242 | } elsif ($name eq 'ALPHA') { |
| 243 | $re = qr/\p{XPosixAlpha}/; |
| 244 | } elsif ($name eq 'QUOTEMETA') { |
| 245 | $re = qr/\p{_Perl_Quotemeta}/; |
| 246 | } elsif ($name eq 'NONLATIN1_FOLD') { |
| 247 | $re = qr/\p{Is_Non_Latin1_Fold}/; |
| 248 | } elsif ($name eq 'NONLATIN1_SIMPLE_FOLD') { |
| 249 | $re = qr/\p{Is_Non_Latin1_Simple_Fold}/; |
| 250 | } elsif ($name eq 'NON_FINAL_FOLD') { |
| 251 | $re = qr/\p{Is_Non_Final_Fold}/; |
| 252 | } elsif ($name eq 'IS_IN_SOME_FOLD') { |
| 253 | $re = qr/\p{_Perl_Any_Folds}/; |
| 254 | } elsif ($name eq 'MNEMONIC_CNTRL') { |
| 255 | # These are the control characters that there are mnemonics for |
| 256 | $re = qr/[\a\b\e\f\n\r\t]/; |
| 257 | } else { # The remainder have the same name and values as Unicode |
| 258 | $re = eval "qr/\\p{$name}/"; |
| 259 | use Carp; |
| 260 | carp $@ if ! defined $re; |
| 261 | } |
| 262 | #print STDERR __LINE__, ": $ord, $name $property, $re\n"; |
| 263 | if ($char =~ $re) { # Add this property if matches |
| 264 | $bits[$ord] .= '|' if $bits[$ord]; |
| 265 | $bits[$ord] .= "(1U<<_CC_$property)"; |
| 266 | } |
| 267 | } |
| 268 | #print __LINE__, " $ord $char $bits[$ord]\n"; |
| 269 | } |
| 270 | |
| 271 | my $out_fh = open_new('l1_char_class_tab.h', '>', |
| 272 | {style => '*', by => $0, |
| 273 | from => "property definitions"}); |
| 274 | |
| 275 | print $out_fh <<END; |
| 276 | /* For code points whose position is not the same as Unicode, both are shown |
| 277 | * in the comment*/ |
| 278 | END |
| 279 | |
| 280 | # Output the table using fairly short names for each char. |
| 281 | my $is_for_ascii = 1; # get_supported_code_pages() returns the ASCII |
| 282 | # character set first |
| 283 | foreach my $charset (get_supported_code_pages()) { |
| 284 | my @a2n = @{get_a2n($charset)}; |
| 285 | my @out; |
| 286 | my @utf_to_i8; |
| 287 | |
| 288 | if ($is_for_ascii) { |
| 289 | $is_for_ascii = 0; |
| 290 | } |
| 291 | else { # EBCDIC. Calculate mapping from UTF-EBCDIC bytes to I8 |
| 292 | my $i8_to_utf_ref = get_I8_2_utf($charset); |
| 293 | for my $i (0..255) { |
| 294 | $utf_to_i8[$i8_to_utf_ref->[$i]] = $i; |
| 295 | } |
| 296 | } |
| 297 | |
| 298 | print $out_fh "\n" . get_conditional_compile_line_start($charset); |
| 299 | for my $ord (0..255) { |
| 300 | my $name; |
| 301 | my $char = chr $ord; |
| 302 | if ($char =~ /\p{PosixGraph}/) { |
| 303 | my $quote = $char eq "'" ? '"' : "'"; |
| 304 | $name = $quote . chr($ord) . $quote; |
| 305 | } |
| 306 | elsif ($char =~ /\p{XPosixGraph}/) { |
| 307 | use charnames(); |
| 308 | $name = charnames::viacode($ord); |
| 309 | $name =~ s/LATIN CAPITAL LETTER // |
| 310 | or $name =~ s/LATIN SMALL LETTER (.*)/\L$1/ |
| 311 | or $name =~ s/ SIGN\b// |
| 312 | or $name =~ s/EXCLAMATION MARK/'!'/ |
| 313 | or $name =~ s/QUESTION MARK/'?'/ |
| 314 | or $name =~ s/QUOTATION MARK/QUOTE/ |
| 315 | or $name =~ s/ INDICATOR//; |
| 316 | $name =~ s/\bWITH\b/\L$&/; |
| 317 | $name =~ s/\bONE\b/1/; |
| 318 | $name =~ s/\b(TWO|HALF)\b/2/; |
| 319 | $name =~ s/\bTHREE\b/3/; |
| 320 | $name =~ s/\b QUARTER S? \b/4/x; |
| 321 | $name =~ s/VULGAR FRACTION (.) (.)/$1\/$2/; |
| 322 | $name =~ s/\bTILDE\b/'~'/i |
| 323 | or $name =~ s/\bCIRCUMFLEX\b/'^'/i |
| 324 | or $name =~ s/\bSTROKE\b/'\/'/i |
| 325 | or $name =~ s/ ABOVE\b//i; |
| 326 | } |
| 327 | else { |
| 328 | use Unicode::UCD qw(prop_invmap); |
| 329 | my ($list_ref, $map_ref, $format) |
| 330 | = prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); |
| 331 | if ($format !~ /^s/) { |
| 332 | use Carp; |
| 333 | carp "Unexpected format '$format' for '_Perl_Name_Alias"; |
| 334 | last; |
| 335 | } |
| 336 | my $which = Unicode::UCD::search_invlist($list_ref, $ord); |
| 337 | if (! defined $which) { |
| 338 | use Carp; |
| 339 | carp "No name found for code pont $ord"; |
| 340 | } |
| 341 | else { |
| 342 | my $map = $map_ref->[$which]; |
| 343 | if (! ref $map) { |
| 344 | $name = $map; |
| 345 | } |
| 346 | else { |
| 347 | # Just pick the first abbreviation if more than one |
| 348 | my @names = grep { $_ =~ /abbreviation/ } @$map; |
| 349 | $name = $names[0]; |
| 350 | } |
| 351 | $name =~ s/:.*//; |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | my $index = $a2n[$ord]; |
| 356 | my $i8; |
| 357 | $i8 = $utf_to_i8[$index] if @utf_to_i8; |
| 358 | |
| 359 | $out[$index] = "/* "; |
| 360 | $out[$index] .= sprintf "0x%02X ", $index if $ord != $index; |
| 361 | $out[$index] .= sprintf "U+%02X ", $ord; |
| 362 | $out[$index] .= sprintf "I8=%02X ", $i8 if defined $i8 && $i8 != $ord; |
| 363 | $out[$index] .= "$name */ "; |
| 364 | $out[$index] .= $bits[$ord]; |
| 365 | |
| 366 | # For EBCDIC character sets, we also add some data for when the bytes |
| 367 | # are in UTF-EBCDIC; these are based on the fundamental |
| 368 | # characteristics of UTF-EBCDIC. |
| 369 | if (@utf_to_i8) { |
| 370 | if ($i8 >= 0xC5 && $i8 != 0xE0) { |
| 371 | $out[$index] .= '|(1U<<_CC_UTF8_IS_START)'; |
| 372 | if ($i8 <= 0xC7) { |
| 373 | $out[$index] .= '|(1U<<_CC_UTF8_IS_DOWNGRADEABLE_START)'; |
| 374 | } |
| 375 | } |
| 376 | if (($i8 & 0xE0) == 0xA0) { |
| 377 | $out[$index] .= '|(1U<<_CC_UTF8_IS_CONTINUATION)'; |
| 378 | } |
| 379 | if ($i8 >= 0xF1) { |
| 380 | $out[$index] .= |
| 381 | '|(1U<<_CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)'; |
| 382 | } |
| 383 | } |
| 384 | |
| 385 | $out[$index] .= ",\n"; |
| 386 | } |
| 387 | $out[-1] =~ s/,$//; # No trailing comma in the final entry |
| 388 | |
| 389 | print $out_fh join "", @out; |
| 390 | print $out_fh "\n" . get_conditional_compile_line_end(); |
| 391 | } |
| 392 | |
| 393 | read_only_bottom_close_and_rename($out_fh) |