regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use v5.15.8;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     NONLATIN1_FOLD
  26     ALPHANUMERIC
  27     ALPHA
  28     ASCII
  29     BLANK
  30     CASED
  31     CHARNAME_CONT
  32     CNTRL
  33     DIGIT
  34     GRAPH
  35     IDFIRST
  36     LOWER
  37     NON_FINAL_FOLD
  38     PRINT
  39     PSXSPC
  40     PUNCT
  41     QUOTEMETA
  42     SPACE
  43     UPPER
  44     WORDCHAR
  45     XDIGIT
  46     VERTSPACE
  47     IS_IN_SOME_FOLD
  48     BACKSLASH_FOO_LBRACE_IS_META
  49 );
  50
  51 # Read in the case fold mappings.
  52 my %folded_closure;
  53 my @hex_non_final_folds;
  54 my @folds;
  55 use Unicode::UCD;
  56
  57 BEGIN { # Have to do this at compile time because using user-defined \p{property}
  58
  59     # Use the Unicode data file if we are on an ASCII platform (which its data
  60     # is for), and it is in the modern format (starting in Unicode 3.1.0) and
  61     # it is available.  This avoids being affected by potential bugs
  62     # introduced by other layers of Perl
  63     my $file="lib/unicore/CaseFolding.txt";
  64
  65     if (ord('A') == 65
  66         && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
  67         && open my $fh, "<", $file)
  68     {
  69         @folds = <$fh>;
  70     }
  71     else {
  72         my ($invlist_ref, $invmap_ref, undef, $default)
  73                                     = Unicode::UCD::prop_invmap('Case_Folding');
  74         for my $i (0 .. @$invlist_ref - 1 - 1) {
  75             next if $invmap_ref->[$i] == $default;
  76             my $adjust = -1;
  77             for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
  78                 $adjust++;
  79
  80                 # Single-code point maps go to a 'C' type
  81                 if (! ref $invmap_ref->[$i]) {
  82                     push @folds, sprintf("%04X; C; %04X\n",
  83                                         $j,
  84                                         $invmap_ref->[$i] + $adjust);
  85                 }
  86                 else {  # Multi-code point maps go to 'F'.  prop_invmap()
  87                         # guarantees that no adjustment is needed for these,
  88                         # as the range will contain just one element
  89                     push @folds, sprintf("%04X; F; %s\n",
  90                                         $j,
  91                                         join " ", map { sprintf "%04X", $_ }
  92                                                         @{$invmap_ref->[$i]});
  93                 }
  94             }
  95         }
  96     }
  97
  98     for (@folds) {
  99         chomp;
 100
 101         # Lines look like (without the initial '#'
 102         #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 103         # Get rid of comments, ignore blank or comment-only lines
 104         my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
 105         next unless length $line;
 106         my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
 107
 108         my $from = hex $hex_from;
 109
 110         # Perl only deals with C and F folds
 111         next if $fold_type ne 'C' and $fold_type ne 'F';
 112
 113         # Get each code point in the range that participates in this line's fold.
 114         # The hash has keys of each code point in the range, and values of what it
 115         # folds to and what folds to it
 116         for my $i (0 .. @folded - 1) {
 117             my $hex_fold = $folded[$i];
 118             my $fold = hex $hex_fold;
 119             push @{$folded_closure{$fold}}, $from if $fold < 256;
 120             push @{$folded_closure{$from}}, $fold if $from < 256;
 121
 122             if ($i < @folded-1
 123                 && $fold < 256
 124                 && ! grep { $_ eq $hex_fold } @hex_non_final_folds)
 125             {
 126                 push @hex_non_final_folds, $hex_fold;
 127
 128                 # Also add the upper case, which in the latin1 range folds to
 129                 # $fold
 130                 push @hex_non_final_folds, sprintf "%04X", ord uc chr $fold;
 131             }
 132         }
 133     }
 134
 135     # Now having read all the lines, combine them into the full closure of each
 136     # code point in the range by adding lists together that share a common
 137     # element
 138     foreach my $folded (keys %folded_closure) {
 139         foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
 140             push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
 141         }
 142     }
 143 }
 144
 145 sub Is_Non_Latin1_Fold {
 146     my @return;
 147
 148     foreach my $folded (keys %folded_closure) {
 149         push @return, sprintf("%X", $folded), if grep { $_ > 255 }
 150                                                      @{$folded_closure{$folded}};
 151     }
 152     return join("\n", @return) . "\n";
 153 }
 154
 155 sub Is_Non_Final_Fold {
 156     return join("\n", @hex_non_final_folds) . "\n";
 157 }
 158
 159 my @bits;   # Bit map for each code point
 160
 161 # For each character, calculate which properties it matches.
 162 for my $ord (0..255) {
 163     my $char = chr($ord);
 164     utf8::upgrade($char);   # Important to use Unicode semantics!
 165
 166     # Look at all the properties we care about here.
 167     for my $property (@properties) {
 168         my $name = $property;
 169
 170         # Remove the suffix to get the actual property name.
 171         # Currently the suffixes are '_L1', '_A', and none.
 172         # If is a latin1 version, no further checking is needed.
 173         if (! ($name =~ s/_L1$//)) {
 174
 175             # Here, isn't an _L1.  If its _A, it's automatically false for
 176             # non-ascii.  The only current ones (besides ASCII) without a
 177             # suffix are valid over the whole range.
 178             next if $name =~ s/_A$// && $ord >= 128;
 179
 180         }
 181         my $re;
 182         if ($name eq 'PUNCT') {;
 183
 184             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 185             # just \pP outside it.
 186             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 187         } elsif ($name eq 'CHARNAME_CONT') {;
 188             $re = qr/\p{_Perl_Charname_Continue}/,
 189         } elsif ($name eq 'SPACE') {;
 190             $re = qr/\p{XPerlSpace}/;
 191         } elsif ($name eq 'IDFIRST') {
 192             $re = qr/[_\p{Alpha}]/;
 193         } elsif ($name eq 'PSXSPC') {
 194             $re = qr/[\v\p{Space}]/;
 195         } elsif ($name eq 'WORDCHAR') {
 196             $re = qr/\p{XPosixWord}/;
 197         } elsif ($name eq 'ALPHANUMERIC') {
 198             # Like \w, but no underscore
 199             $re = qr/\p{Alnum}/;
 200         } elsif ($name eq 'QUOTEMETA') {
 201             $re = qr/\p{_Perl_Quotemeta}/;
 202         } elsif ($name eq 'NONLATIN1_FOLD') {
 203             $re = qr/\p{Is_Non_Latin1_Fold}/;
 204         } elsif ($name eq 'NON_FINAL_FOLD') {
 205             $re = qr/\p{Is_Non_Final_Fold}/;
 206         } elsif ($name eq 'IS_IN_SOME_FOLD') {
 207             $re = qr/\p{_Perl_Any_Folds}/;
 208         } elsif ($name eq 'BACKSLASH_FOO_LBRACE_IS_META') {
 209
 210             # This is true for FOO where FOO is the varying character in:
 211             # \a{, \b{, \c{, ...
 212             # and the sequence has non-literal meaning to Perl; so it is true
 213             # for 'x' because \x{ is special, but not 'a' because \a{ isn't.
 214             $re = qr/[gkNopPx]/;
 215         } else {    # The remainder have the same name and values as Unicode
 216             $re = eval "qr/\\p{$name}/";
 217             use Carp;
 218             carp $@ if ! defined $re;
 219         }
 220         #print "$ord, $name $property, $re\n";
 221         if ($char =~ $re) {  # Add this property if matches
 222             $bits[$ord] .= '|' if $bits[$ord];
 223             $bits[$ord] .= "(1U<<_CC_$property)";
 224         }
 225     }
 226     #print __LINE__, " $ord $char $bits[$ord]\n";
 227 }
 228
 229 # Names of C0 controls
 230 my @C0 = qw (
 231                 NUL
 232                 SOH
 233                 STX
 234                 ETX
 235                 EOT
 236                 ENQ
 237                 ACK
 238                 BEL
 239                 BS
 240                 HT
 241                 LF
 242                 VT
 243                 FF
 244                 CR
 245                 SO
 246                 SI
 247                 DLE
 248                 DC1
 249                 DC2
 250                 DC3
 251                 DC4
 252                 NAK
 253                 SYN
 254                 ETB
 255                 CAN
 256                 EOM
 257                 SUB
 258                 ESC
 259                 FS
 260                 GS
 261                 RS
 262                 US
 263             );
 264
 265 # Names of C1 controls, plus the adjacent DEL
 266 my @C1 = qw(
 267                 DEL
 268                 PAD
 269                 HOP
 270                 BPH
 271                 NBH
 272                 IND
 273                 NEL
 274                 SSA
 275                 ESA
 276                 HTS
 277                 HTJ
 278                 VTS
 279                 PLD
 280                 PLU
 281                 RI
 282                 SS2
 283                 SS3
 284                 DCS
 285                 PU1
 286                 PU2
 287                 STS
 288                 CCH
 289                 MW
 290                 SPA
 291                 EPA
 292                 SOS
 293                 SGC
 294                 SCI
 295                 CSI
 296                 ST
 297                 OSC
 298                 PM
 299                 APC
 300             );
 301
 302 my $out_fh = open_new('l1_char_class_tab.h', '>',
 303                       {style => '*', by => $0,
 304                       from => "property definitions"});
 305
 306 # Output the table using fairly short names for each char.
 307 for my $ord (0..255) {
 308     my $name;
 309     if ($ord < 32) {    # A C0 control
 310         $name = $C0[$ord];
 311     } elsif ($ord > 32 && $ord < 127) { # Graphic
 312         $name = "'" . chr($ord) . "'";
 313     } elsif ($ord >= 127 && $ord <= 0x9f) {
 314         $name = $C1[$ord - 127];    # A C1 control + DEL
 315     } else {    # SPACE, or, if Latin1, shorten the name */
 316         use charnames();
 317         $name = charnames::viacode($ord);
 318         $name =~ s/LATIN CAPITAL LETTER //
 319         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 320     }
 321     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 322 }
 323
 324 read_only_bottom_close_and_rename($out_fh)