regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     ALNUMC_A
  26     ALNUMC_L1
  27     ALPHA_A
  28     ALPHA_L1
  29     BLANK_A
  30     BLANK_L1
  31     CHARNAME_CONT
  32     CNTRL_A
  33     CNTRL_L1
  34     DIGIT_A
  35     GRAPH_A
  36     GRAPH_L1
  37     IDFIRST_A
  38     IDFIRST_L1
  39     LOWER_A
  40     LOWER_L1
  41     OCTAL_A
  42     PRINT_A
  43     PRINT_L1
  44     PSXSPC_A
  45     PSXSPC_L1
  46     PUNCT_A
  47     PUNCT_L1
  48     SPACE_A
  49     SPACE_L1
  50     UPPER_A
  51     UPPER_L1
  52     WORDCHAR_A
  53     WORDCHAR_L1
  54     XDIGIT_A
  55 );
  56
  57 # Read in the case fold mappings.
  58 my %folded_closure;
  59 my $file="lib/unicore/CaseFolding.txt";
  60 open my $fh, "<", $file or die "Failed to read '$file': $!";
  61 while (<$fh>) {
  62     chomp;
  63
  64     # Lines look like (without the initial '#'
  65     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  66     # Get rid of comments, ignore blank or comment-only lines
  67     my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
  68     next unless length $line;
  69     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
  70
  71     my $from = hex $hex_from;
  72
  73     # Perl only deals with C and F folds
  74     next if $fold_type ne 'C' and $fold_type ne 'F';
  75
  76     # Get each code point in the range that participates in this line's fold.
  77     # The hash has keys of each code point in the range, and values of what it
  78     # folds to and what folds to it
  79     foreach my $hex_fold (@folded) {
  80         my $fold = hex $hex_fold;
  81         push @{$folded_closure{$fold}}, $from if $fold < 256;
  82         push @{$folded_closure{$from}}, $fold if $from < 256;
  83     }
  84 }
  85
  86 # Now having read all the lines, combine them into the full closure of each
  87 # code point in the range by adding lists together that share a common element
  88 foreach my $folded (keys %folded_closure) {
  89     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
  90         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
  91     }
  92 }
  93
  94 my @bits;   # Bit map for each code point
  95
  96 foreach my $folded (keys %folded_closure) {
  97     $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
  98                                                 @{$folded_closure{$folded}};
  99 }
 100
 101 # For each character, calculate which properties it matches.
 102 for my $ord (0..255) {
 103     my $char = chr($ord);
 104     utf8::upgrade($char);   # Important to use Unicode semantics!
 105
 106     # Look at all the properties we care about here.
 107     for my $property (@properties) {
 108         my $name = $property;
 109
 110         # Remove the suffix to get the actual property name.
 111         # Currently the suffixes are '_L1', '_A', and none.
 112         # If is a latin1 version, no further checking is needed.
 113         if (! ($name =~ s/_L1$//)) {
 114
 115             # Here, isn't an _L1.  If its _A, it's automatically false for
 116             # non-ascii.  The only one current one without a suffix is valid
 117             # over the whole range.
 118             next if $name =~ s/_A$// && $ord >= 128;
 119
 120         }
 121         my $re;
 122         if ($name eq 'PUNCT') {;
 123
 124             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 125             # just \pP outside it.
 126             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 127         } elsif ($name eq 'CHARNAME_CONT') {;
 128             $re = qr/[-\w ():\xa0]/;
 129         } elsif ($name eq 'SPACE') {;
 130             $re = qr/\s/;
 131         } elsif ($name eq 'IDFIRST') {
 132             $re = qr/[_\p{Alpha}]/;
 133         } elsif ($name eq 'PSXSPC') {
 134             $re = qr/[\v\p{Space}]/;
 135         } elsif ($name eq 'WORDCHAR') {
 136             $re = qr/\w/;
 137         } elsif ($name eq 'ALNUMC') {
 138             # Like \w, but no underscore
 139             $re = qr/\p{Alnum}/;
 140         } elsif ($name eq 'OCTAL') {
 141             $re = qr/[0-7]/;
 142         } else {    # The remainder have the same name and values as Unicode
 143             $re = eval "qr/\\p{$name}/";
 144             use Carp;
 145             carp $@ if ! defined $re;
 146         }
 147         #print "$ord, $name $property, $re\n";
 148         if ($char =~ $re) {  # Add this property if matches
 149             $bits[$ord] .= '|' if $bits[$ord];
 150             $bits[$ord] .= "_CC_$property";
 151         }
 152     }
 153     #print __LINE__, " $ord $char $bits[$ord]\n";
 154 }
 155
 156 # Names of C0 controls
 157 my @C0 = qw (
 158                 NUL
 159                 SOH
 160                 STX
 161                 ETX
 162                 EOT
 163                 ENQ
 164                 ACK
 165                 BEL
 166                 BS
 167                 HT
 168                 LF
 169                 VT
 170                 FF
 171                 CR
 172                 SO
 173                 SI
 174                 DLE
 175                 DC1
 176                 DC2
 177                 DC3
 178                 DC4
 179                 NAK
 180                 SYN
 181                 ETB
 182                 CAN
 183                 EOM
 184                 SUB
 185                 ESC
 186                 FS
 187                 GS
 188                 RS
 189                 US
 190             );
 191
 192 # Names of C1 controls, plus the adjacent DEL
 193 my @C1 = qw(
 194                 DEL
 195                 PAD
 196                 HOP
 197                 BPH
 198                 NBH
 199                 IND
 200                 NEL
 201                 SSA
 202                 ESA
 203                 HTS
 204                 HTJ
 205                 VTS
 206                 PLD
 207                 PLU
 208                 RI
 209                 SS2
 210                 SS3
 211                 DCS
 212                 PU1
 213                 PU2
 214                 STS
 215                 CCH
 216                 MW
 217                 SPA
 218                 EPA
 219                 SOS
 220                 SGC
 221                 SCI
 222                 CSI
 223                 ST
 224                 OSC
 225                 PM
 226                 APC
 227             );
 228
 229 my $out_fh = open_new('l1_char_class_tab.h', '>',
 230                       {style => '*', by => $0,
 231                       from => "property definitions and $file"});
 232
 233 # Output the table using fairly short names for each char.
 234 for my $ord (0..255) {
 235     my $name;
 236     if ($ord < 32) {    # A C0 control
 237         $name = $C0[$ord];
 238     } elsif ($ord > 32 && $ord < 127) { # Graphic
 239         $name = "'" . chr($ord) . "'";
 240     } elsif ($ord >= 127 && $ord <= 0x9f) {
 241         $name = $C1[$ord - 127];    # A C1 control + DEL
 242     } else {    # SPACE, or, if Latin1, shorten the name */
 243         use charnames();
 244         $name = charnames::viacode($ord);
 245         $name =~ s/LATIN CAPITAL LETTER //
 246         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 247     }
 248     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 249 }
 250
 251 read_only_bottom_close_and_rename($out_fh)