regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     ALNUMC_A
  26     ALNUMC_L1
  27     ALPHA_A
  28     ALPHA_L1
  29     BLANK_A
  30     BLANK_L1
  31     CHARNAME_CONT
  32     CNTRL_A
  33     CNTRL_L1
  34     DIGIT_A
  35     GRAPH_A
  36     GRAPH_L1
  37     IDFIRST_A
  38     IDFIRST_L1
  39     LOWER_A
  40     LOWER_L1
  41     PRINT_A
  42     PRINT_L1
  43     PSXSPC_A
  44     PSXSPC_L1
  45     PUNCT_A
  46     PUNCT_L1
  47     SPACE_A
  48     SPACE_L1
  49     UPPER_A
  50     UPPER_L1
  51     WORDCHAR_A
  52     WORDCHAR_L1
  53     XDIGIT_A
  54     QUOTEMETA
  55 );
  56
  57 # Read in the case fold mappings.
  58 my %folded_closure;
  59 my $file="lib/unicore/CaseFolding.txt";
  60 my @folds;
  61 use Unicode::UCD;
  62
  63 # Use the Unicode data file if we are on an ASCII platform (which its data is
  64 # for), and it is in the modern format (starting in Unicode 3.1.0) and it is
  65 # available.  This avoids being affected by potential bugs introduced by other
  66 # layers of Perl
  67 if (ord('A') == 65
  68     && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
  69     && open my $fh, "<", $file)
  70 {
  71     @folds = <$fh>;
  72 }
  73 else {
  74     my ($invlist_ref, $invmap_ref, undef, $default)
  75                                     = Unicode::UCD::prop_invmap('Case_Folding');
  76     for my $i (0 .. @$invlist_ref - 1 - 1) {
  77         next if $invmap_ref->[$i] == $default;
  78         my $adjust = -1;
  79         for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
  80             $adjust++;
  81
  82             # Single-code point maps go to a 'C' type
  83             if (! ref $invmap_ref->[$i]) {
  84                 push @folds, sprintf("%04X; C; %04X\n",
  85                                      $j,
  86                                      $invmap_ref->[$i] + $adjust);
  87             }
  88             else {  # Multi-code point maps go to 'F'.  prop_invmap()
  89                     # guarantees that no adjustment is needed for these,
  90                     # as the range will contain just one element
  91                 push @folds, sprintf("%04X; F; %s\n",
  92                                     $j,
  93                                     join " ", map { sprintf "%04X", $_ }
  94                                                     @{$invmap_ref->[$i]});
  95             }
  96         }
  97     }
  98 }
  99
 100 for (@folds) {
 101     chomp;
 102
 103     # Lines look like (without the initial '#'
 104     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 105     # Get rid of comments, ignore blank or comment-only lines
 106     my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
 107     next unless length $line;
 108     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
 109
 110     my $from = hex $hex_from;
 111
 112     # Perl only deals with C and F folds
 113     next if $fold_type ne 'C' and $fold_type ne 'F';
 114
 115     # Get each code point in the range that participates in this line's fold.
 116     # The hash has keys of each code point in the range, and values of what it
 117     # folds to and what folds to it
 118     foreach my $hex_fold (@folded) {
 119         my $fold = hex $hex_fold;
 120         push @{$folded_closure{$fold}}, $from if $fold < 256;
 121         push @{$folded_closure{$from}}, $fold if $from < 256;
 122     }
 123 }
 124
 125 # Now having read all the lines, combine them into the full closure of each
 126 # code point in the range by adding lists together that share a common element
 127 foreach my $folded (keys %folded_closure) {
 128     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
 129         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
 130     }
 131 }
 132
 133 my @bits;   # Bit map for each code point
 134
 135 foreach my $folded (keys %folded_closure) {
 136     $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
 137                                                 @{$folded_closure{$folded}};
 138 }
 139
 140 # For each character, calculate which properties it matches.
 141 for my $ord (0..255) {
 142     my $char = chr($ord);
 143     utf8::upgrade($char);   # Important to use Unicode semantics!
 144
 145     # Look at all the properties we care about here.
 146     for my $property (@properties) {
 147         my $name = $property;
 148
 149         # Remove the suffix to get the actual property name.
 150         # Currently the suffixes are '_L1', '_A', and none.
 151         # If is a latin1 version, no further checking is needed.
 152         if (! ($name =~ s/_L1$//)) {
 153
 154             # Here, isn't an _L1.  If its _A, it's automatically false for
 155             # non-ascii.  The only one current one without a suffix is valid
 156             # over the whole range.
 157             next if $name =~ s/_A$// && $ord >= 128;
 158
 159         }
 160         my $re;
 161         if ($name eq 'PUNCT') {;
 162
 163             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 164             # just \pP outside it.
 165             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 166         } elsif ($name eq 'CHARNAME_CONT') {;
 167             $re = qr/[-\p{XPosixWord} ():\xa0]/;
 168         } elsif ($name eq 'SPACE') {;
 169             $re = qr/\p{XPerlSpace}/;
 170         } elsif ($name eq 'IDFIRST') {
 171             $re = qr/[_\p{Alpha}]/;
 172         } elsif ($name eq 'PSXSPC') {
 173             $re = qr/[\v\p{Space}]/;
 174         } elsif ($name eq 'WORDCHAR') {
 175             $re = qr/\p{XPosixWord}/;
 176         } elsif ($name eq 'ALNUMC') {
 177             # Like \w, but no underscore
 178             $re = qr/\p{Alnum}/;
 179         } elsif ($name eq 'OCTAL') {
 180             $re = qr/[0-7]/;
 181         } elsif ($name eq 'QUOTEMETA') {
 182             $re = qr/\p{_Perl_Quotemeta}/;
 183         } else {    # The remainder have the same name and values as Unicode
 184             $re = eval "qr/\\p{$name}/";
 185             use Carp;
 186             carp $@ if ! defined $re;
 187         }
 188         #print "$ord, $name $property, $re\n";
 189         if ($char =~ $re) {  # Add this property if matches
 190             $bits[$ord] .= '|' if $bits[$ord];
 191             $bits[$ord] .= "_CC_$property";
 192         }
 193     }
 194     #print __LINE__, " $ord $char $bits[$ord]\n";
 195 }
 196
 197 # Names of C0 controls
 198 my @C0 = qw (
 199                 NUL
 200                 SOH
 201                 STX
 202                 ETX
 203                 EOT
 204                 ENQ
 205                 ACK
 206                 BEL
 207                 BS
 208                 HT
 209                 LF
 210                 VT
 211                 FF
 212                 CR
 213                 SO
 214                 SI
 215                 DLE
 216                 DC1
 217                 DC2
 218                 DC3
 219                 DC4
 220                 NAK
 221                 SYN
 222                 ETB
 223                 CAN
 224                 EOM
 225                 SUB
 226                 ESC
 227                 FS
 228                 GS
 229                 RS
 230                 US
 231             );
 232
 233 # Names of C1 controls, plus the adjacent DEL
 234 my @C1 = qw(
 235                 DEL
 236                 PAD
 237                 HOP
 238                 BPH
 239                 NBH
 240                 IND
 241                 NEL
 242                 SSA
 243                 ESA
 244                 HTS
 245                 HTJ
 246                 VTS
 247                 PLD
 248                 PLU
 249                 RI
 250                 SS2
 251                 SS3
 252                 DCS
 253                 PU1
 254                 PU2
 255                 STS
 256                 CCH
 257                 MW
 258                 SPA
 259                 EPA
 260                 SOS
 261                 SGC
 262                 SCI
 263                 CSI
 264                 ST
 265                 OSC
 266                 PM
 267                 APC
 268             );
 269
 270 my $out_fh = open_new('l1_char_class_tab.h', '>',
 271                       {style => '*', by => $0,
 272                       from => "property definitions"});
 273
 274 # Output the table using fairly short names for each char.
 275 for my $ord (0..255) {
 276     my $name;
 277     if ($ord < 32) {    # A C0 control
 278         $name = $C0[$ord];
 279     } elsif ($ord > 32 && $ord < 127) { # Graphic
 280         $name = "'" . chr($ord) . "'";
 281     } elsif ($ord >= 127 && $ord <= 0x9f) {
 282         $name = $C1[$ord - 127];    # A C1 control + DEL
 283     } else {    # SPACE, or, if Latin1, shorten the name */
 284         use charnames();
 285         $name = charnames::viacode($ord);
 286         $name =~ s/LATIN CAPITAL LETTER //
 287         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 288     }
 289     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 290 }
 291
 292 read_only_bottom_close_and_rename($out_fh)