regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use v5.15.8;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     NONLATIN1_FOLD
  26     ALNUMC
  27     ALPHA
  28     ASCII
  29     BLANK
  30     CHARNAME_CONT
  31     CNTRL
  32     DIGIT
  33     GRAPH
  34     IDFIRST
  35     LOWER
  36     NON_FINAL_FOLD
  37     PRINT
  38     PSXSPC
  39     PUNCT
  40     QUOTEMETA
  41     SPACE
  42     UPPER
  43     WORDCHAR
  44     XDIGIT
  45     VERTSPACE
  46     IS_IN_SOME_FOLD
  47 );
  48
  49 # Read in the case fold mappings.
  50 my %folded_closure;
  51 my @hex_non_final_folds;
  52 my @folds;
  53 use Unicode::UCD;
  54
  55 BEGIN { # Have to do this at compile time because using user-defined \p{property}
  56
  57     # Use the Unicode data file if we are on an ASCII platform (which its data
  58     # is for), and it is in the modern format (starting in Unicode 3.1.0) and
  59     # it is available.  This avoids being affected by potential bugs
  60     # introduced by other layers of Perl
  61     my $file="lib/unicore/CaseFolding.txt";
  62
  63     if (ord('A') == 65
  64         && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
  65         && open my $fh, "<", $file)
  66     {
  67         @folds = <$fh>;
  68     }
  69     else {
  70         my ($invlist_ref, $invmap_ref, undef, $default)
  71                                     = Unicode::UCD::prop_invmap('Case_Folding');
  72         for my $i (0 .. @$invlist_ref - 1 - 1) {
  73             next if $invmap_ref->[$i] == $default;
  74             my $adjust = -1;
  75             for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
  76                 $adjust++;
  77
  78                 # Single-code point maps go to a 'C' type
  79                 if (! ref $invmap_ref->[$i]) {
  80                     push @folds, sprintf("%04X; C; %04X\n",
  81                                         $j,
  82                                         $invmap_ref->[$i] + $adjust);
  83                 }
  84                 else {  # Multi-code point maps go to 'F'.  prop_invmap()
  85                         # guarantees that no adjustment is needed for these,
  86                         # as the range will contain just one element
  87                     push @folds, sprintf("%04X; F; %s\n",
  88                                         $j,
  89                                         join " ", map { sprintf "%04X", $_ }
  90                                                         @{$invmap_ref->[$i]});
  91                 }
  92             }
  93         }
  94     }
  95
  96     for (@folds) {
  97         chomp;
  98
  99         # Lines look like (without the initial '#'
 100         #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 101         # Get rid of comments, ignore blank or comment-only lines
 102         my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
 103         next unless length $line;
 104         my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
 105
 106         my $from = hex $hex_from;
 107
 108         # Perl only deals with C and F folds
 109         next if $fold_type ne 'C' and $fold_type ne 'F';
 110
 111         # Get each code point in the range that participates in this line's fold.
 112         # The hash has keys of each code point in the range, and values of what it
 113         # folds to and what folds to it
 114         for my $i (0 .. @folded - 1) {
 115             my $hex_fold = $folded[$i];
 116             my $fold = hex $hex_fold;
 117             push @{$folded_closure{$fold}}, $from if $fold < 256;
 118             push @{$folded_closure{$from}}, $fold if $from < 256;
 119
 120             if ($i < @folded-1
 121                 && $fold < 256
 122                 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
 123             {
 124                 push @hex_non_final_folds, $hex_fold;
 125
 126                 # Also add the upper case, which in the latin1 range folds to
 127                 # $fold
 128                 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
 129             }
 130         }
 131     }
 132
 133     # Now having read all the lines, combine them into the full closure of each
 134     # code point in the range by adding lists together that share a common
 135     # element
 136     foreach my $folded (keys %folded_closure) {
 137         foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
 138             push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
 139         }
 140     }
 141 }
 142
 143 sub Is_Non_Latin1_Fold {
 144     my @return;
 145
 146     foreach my $folded (keys %folded_closure) {
 147         push @return, sprintf("%X", $folded), if grep { $_ > 255 }
 148                                                      @{$folded_closure{$folded}};
 149     }
 150     return join("\n", @return) . "\n";
 151 }
 152
 153 sub Is_Non_Final_Fold {
 154     return join("\n", @hex_non_final_folds) . "\n";
 155 }
 156
 157 my @bits;   # Bit map for each code point
 158
 159 # For each character, calculate which properties it matches.
 160 for my $ord (0..255) {
 161     my $char = chr($ord);
 162     utf8::upgrade($char);   # Important to use Unicode semantics!
 163
 164     # Look at all the properties we care about here.
 165     for my $property (@properties) {
 166         my $name = $property;
 167
 168         # Remove the suffix to get the actual property name.
 169         # Currently the suffixes are '_L1', '_A', and none.
 170         # If is a latin1 version, no further checking is needed.
 171         if (! ($name =~ s/_L1$//)) {
 172
 173             # Here, isn't an _L1.  If its _A, it's automatically false for
 174             # non-ascii.  The only current ones (besides ASCII) without a
 175             # suffix are valid over the whole range.
 176             next if $name =~ s/_A$// && $ord >= 128;
 177
 178         }
 179         my $re;
 180         if ($name eq 'PUNCT') {;
 181
 182             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 183             # just \pP outside it.
 184             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 185         } elsif ($name eq 'CHARNAME_CONT') {;
 186             $re = qr/\p{_Perl_Charname_Continue}/,
 187         } elsif ($name eq 'SPACE') {;
 188             $re = qr/\p{XPerlSpace}/;
 189         } elsif ($name eq 'IDFIRST') {
 190             $re = qr/[_\p{Alpha}]/;
 191         } elsif ($name eq 'PSXSPC') {
 192             $re = qr/[\v\p{Space}]/;
 193         } elsif ($name eq 'WORDCHAR') {
 194             $re = qr/\p{XPosixWord}/;
 195         } elsif ($name eq 'ALNUMC') {
 196             # Like \w, but no underscore
 197             $re = qr/\p{Alnum}/;
 198         } elsif ($name eq 'QUOTEMETA') {
 199             $re = qr/\p{_Perl_Quotemeta}/;
 200         } elsif ($name eq 'NONLATIN1_FOLD') {
 201             $re = qr/\p{Is_Non_Latin1_Fold}/;
 202         } elsif ($name eq 'NON_FINAL_FOLD') {
 203             $re = qr/\p{Is_Non_Final_Fold}/;
 204         } elsif ($name eq 'IS_IN_SOME_FOLD') {
 205             $re = qr/\p{_Perl_Any_Folds}/;
 206         } else {    # The remainder have the same name and values as Unicode
 207             $re = eval "qr/\\p{$name}/";
 208             use Carp;
 209             carp $@ if ! defined $re;
 210         }
 211         #print "$ord, $name $property, $re\n";
 212         if ($char =~ $re) {  # Add this property if matches
 213             $bits[$ord] .= '|' if $bits[$ord];
 214             $bits[$ord] .= "(1U<<_CC_$property)";
 215         }
 216     }
 217     #print __LINE__, " $ord $char $bits[$ord]\n";
 218 }
 219
 220 # Names of C0 controls
 221 my @C0 = qw (
 222                 NUL
 223                 SOH
 224                 STX
 225                 ETX
 226                 EOT
 227                 ENQ
 228                 ACK
 229                 BEL
 230                 BS
 231                 HT
 232                 LF
 233                 VT
 234                 FF
 235                 CR
 236                 SO
 237                 SI
 238                 DLE
 239                 DC1
 240                 DC2
 241                 DC3
 242                 DC4
 243                 NAK
 244                 SYN
 245                 ETB
 246                 CAN
 247                 EOM
 248                 SUB
 249                 ESC
 250                 FS
 251                 GS
 252                 RS
 253                 US
 254             );
 255
 256 # Names of C1 controls, plus the adjacent DEL
 257 my @C1 = qw(
 258                 DEL
 259                 PAD
 260                 HOP
 261                 BPH
 262                 NBH
 263                 IND
 264                 NEL
 265                 SSA
 266                 ESA
 267                 HTS
 268                 HTJ
 269                 VTS
 270                 PLD
 271                 PLU
 272                 RI
 273                 SS2
 274                 SS3
 275                 DCS
 276                 PU1
 277                 PU2
 278                 STS
 279                 CCH
 280                 MW
 281                 SPA
 282                 EPA
 283                 SOS
 284                 SGC
 285                 SCI
 286                 CSI
 287                 ST
 288                 OSC
 289                 PM
 290                 APC
 291             );
 292
 293 my $out_fh = open_new('l1_char_class_tab.h', '>',
 294                       {style => '*', by => $0,
 295                       from => "property definitions"});
 296
 297 # Output the table using fairly short names for each char.
 298 for my $ord (0..255) {
 299     my $name;
 300     if ($ord < 32) {    # A C0 control
 301         $name = $C0[$ord];
 302     } elsif ($ord > 32 && $ord < 127) { # Graphic
 303         $name = "'" . chr($ord) . "'";
 304     } elsif ($ord >= 127 && $ord <= 0x9f) {
 305         $name = $C1[$ord - 127];    # A C1 control + DEL
 306     } else {    # SPACE, or, if Latin1, shorten the name */
 307         use charnames();
 308         $name = charnames::viacode($ord);
 309         $name =~ s/LATIN CAPITAL LETTER //
 310         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 311     }
 312     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 313 }
 314
 315 read_only_bottom_close_and_rename($out_fh)