regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     ALNUMC_A
  26     ALNUMC_L1
  27     ALPHA_A
  28     ALPHA_L1
  29     BLANK_A
  30     BLANK_L1
  31     CHARNAME_CONT
  32     CNTRL_A
  33     CNTRL_L1
  34     DIGIT_A
  35     GRAPH_A
  36     GRAPH_L1
  37     IDFIRST_A
  38     IDFIRST_L1
  39     LOWER_A
  40     LOWER_L1
  41     OCTAL_A
  42     PRINT_A
  43     PRINT_L1
  44     PSXSPC_A
  45     PSXSPC_L1
  46     PUNCT_A
  47     PUNCT_L1
  48     SPACE_A
  49     SPACE_L1
  50     UPPER_A
  51     UPPER_L1
  52     WORDCHAR_A
  53     WORDCHAR_L1
  54     XDIGIT_A
  55     QUOTEMETA
  56 );
  57
  58 # Read in the case fold mappings.
  59 my %folded_closure;
  60 my $file="lib/unicore/CaseFolding.txt";
  61 open my $fh, "<", $file or die "Failed to read '$file': $!";
  62 while (<$fh>) {
  63     chomp;
  64
  65     # Lines look like (without the initial '#'
  66     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  67     # Get rid of comments, ignore blank or comment-only lines
  68     my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
  69     next unless length $line;
  70     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
  71
  72     my $from = hex $hex_from;
  73
  74     # Perl only deals with C and F folds
  75     next if $fold_type ne 'C' and $fold_type ne 'F';
  76
  77     # Get each code point in the range that participates in this line's fold.
  78     # The hash has keys of each code point in the range, and values of what it
  79     # folds to and what folds to it
  80     foreach my $hex_fold (@folded) {
  81         my $fold = hex $hex_fold;
  82         push @{$folded_closure{$fold}}, $from if $fold < 256;
  83         push @{$folded_closure{$from}}, $fold if $from < 256;
  84     }
  85 }
  86
  87 # Now having read all the lines, combine them into the full closure of each
  88 # code point in the range by adding lists together that share a common element
  89 foreach my $folded (keys %folded_closure) {
  90     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
  91         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
  92     }
  93 }
  94
  95 my @bits;   # Bit map for each code point
  96
  97 foreach my $folded (keys %folded_closure) {
  98     $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
  99                                                 @{$folded_closure{$folded}};
 100 }
 101
 102 # For each character, calculate which properties it matches.
 103 for my $ord (0..255) {
 104     my $char = chr($ord);
 105     utf8::upgrade($char);   # Important to use Unicode semantics!
 106
 107     # Look at all the properties we care about here.
 108     for my $property (@properties) {
 109         my $name = $property;
 110
 111         # Remove the suffix to get the actual property name.
 112         # Currently the suffixes are '_L1', '_A', and none.
 113         # If is a latin1 version, no further checking is needed.
 114         if (! ($name =~ s/_L1$//)) {
 115
 116             # Here, isn't an _L1.  If its _A, it's automatically false for
 117             # non-ascii.  The only one current one without a suffix is valid
 118             # over the whole range.
 119             next if $name =~ s/_A$// && $ord >= 128;
 120
 121         }
 122         my $re;
 123         if ($name eq 'PUNCT') {;
 124
 125             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 126             # just \pP outside it.
 127             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 128         } elsif ($name eq 'CHARNAME_CONT') {;
 129             $re = qr/[-\w ():\xa0]/;
 130         } elsif ($name eq 'SPACE') {;
 131             $re = qr/\s/;
 132         } elsif ($name eq 'IDFIRST') {
 133             $re = qr/[_\p{Alpha}]/;
 134         } elsif ($name eq 'PSXSPC') {
 135             $re = qr/[\v\p{Space}]/;
 136         } elsif ($name eq 'WORDCHAR') {
 137             $re = qr/\w/;
 138         } elsif ($name eq 'ALNUMC') {
 139             # Like \w, but no underscore
 140             $re = qr/\p{Alnum}/;
 141         } elsif ($name eq 'OCTAL') {
 142             $re = qr/[0-7]/;
 143         } elsif ($name eq 'QUOTEMETA') {
 144             $re = qr/\p{_Perl_Quotemeta}/;
 145         } else {    # The remainder have the same name and values as Unicode
 146             $re = eval "qr/\\p{$name}/";
 147             use Carp;
 148             carp $@ if ! defined $re;
 149         }
 150         #print "$ord, $name $property, $re\n";
 151         if ($char =~ $re) {  # Add this property if matches
 152             $bits[$ord] .= '|' if $bits[$ord];
 153             $bits[$ord] .= "_CC_$property";
 154         }
 155     }
 156     #print __LINE__, " $ord $char $bits[$ord]\n";
 157 }
 158
 159 # Names of C0 controls
 160 my @C0 = qw (
 161                 NUL
 162                 SOH
 163                 STX
 164                 ETX
 165                 EOT
 166                 ENQ
 167                 ACK
 168                 BEL
 169                 BS
 170                 HT
 171                 LF
 172                 VT
 173                 FF
 174                 CR
 175                 SO
 176                 SI
 177                 DLE
 178                 DC1
 179                 DC2
 180                 DC3
 181                 DC4
 182                 NAK
 183                 SYN
 184                 ETB
 185                 CAN
 186                 EOM
 187                 SUB
 188                 ESC
 189                 FS
 190                 GS
 191                 RS
 192                 US
 193             );
 194
 195 # Names of C1 controls, plus the adjacent DEL
 196 my @C1 = qw(
 197                 DEL
 198                 PAD
 199                 HOP
 200                 BPH
 201                 NBH
 202                 IND
 203                 NEL
 204                 SSA
 205                 ESA
 206                 HTS
 207                 HTJ
 208                 VTS
 209                 PLD
 210                 PLU
 211                 RI
 212                 SS2
 213                 SS3
 214                 DCS
 215                 PU1
 216                 PU2
 217                 STS
 218                 CCH
 219                 MW
 220                 SPA
 221                 EPA
 222                 SOS
 223                 SGC
 224                 SCI
 225                 CSI
 226                 ST
 227                 OSC
 228                 PM
 229                 APC
 230             );
 231
 232 my $out_fh = open_new('l1_char_class_tab.h', '>',
 233                       {style => '*', by => $0,
 234                       from => "property definitions and $file"});
 235
 236 # Output the table using fairly short names for each char.
 237 for my $ord (0..255) {
 238     my $name;
 239     if ($ord < 32) {    # A C0 control
 240         $name = $C0[$ord];
 241     } elsif ($ord > 32 && $ord < 127) { # Graphic
 242         $name = "'" . chr($ord) . "'";
 243     } elsif ($ord >= 127 && $ord <= 0x9f) {
 244         $name = $C1[$ord - 127];    # A C1 control + DEL
 245     } else {    # SPACE, or, if Latin1, shorten the name */
 246         use charnames();
 247         $name = charnames::viacode($ord);
 248         $name =~ s/LATIN CAPITAL LETTER //
 249         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 250     }
 251     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 252 }
 253
 254 read_only_bottom_close_and_rename($out_fh)