regen/mk_PL_charclass.pl

   1 #!perl -w
   2 use v5.15.8;
   3 use strict;
   4 use warnings;
   5 require 'regen/regen_lib.pl';
   6
   7 # This program outputs l1_charclass_tab.h, which defines the guts of the
   8 # PL_charclass table.  Each line is a bit map of properties that the Unicode
   9 # code point at the corresponding position in the table array has.  The first
  10 # line corresponds to code point U+0000, NULL, the last line to U+00FF.  For
  11 # an application to see if the code point "i" has a particular property, it
  12 # just does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # This program need be run only when adding new properties to it, or upon a
  22 # new Unicode release, to make sure things haven't been changed by it.
  23
  24 my @properties = qw(
  25     ALNUMC
  26     ALPHA
  27     ASCII
  28     BLANK
  29     CHARNAME_CONT
  30     CNTRL
  31     DIGIT
  32     GRAPH
  33     IDFIRST
  34     LOWER
  35     NON_FINAL_FOLD
  36     PRINT
  37     PSXSPC
  38     PUNCT
  39     QUOTEMETA
  40     SPACE
  41     UPPER
  42     WORDCHAR
  43     XDIGIT
  44 );
  45
  46 # Read in the case fold mappings.
  47 my %folded_closure;
  48 my $file="lib/unicore/CaseFolding.txt";
  49 my @folds;
  50 use Unicode::UCD;
  51
  52 # Use the Unicode data file if we are on an ASCII platform (which its data is
  53 # for), and it is in the modern format (starting in Unicode 3.1.0) and it is
  54 # available.  This avoids being affected by potential bugs introduced by other
  55 # layers of Perl
  56 if (ord('A') == 65
  57     && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
  58     && open my $fh, "<", $file)
  59 {
  60     @folds = <$fh>;
  61 }
  62 else {
  63     my ($invlist_ref, $invmap_ref, undef, $default)
  64                                     = Unicode::UCD::prop_invmap('Case_Folding');
  65     for my $i (0 .. @$invlist_ref - 1 - 1) {
  66         next if $invmap_ref->[$i] == $default;
  67         my $adjust = -1;
  68         for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
  69             $adjust++;
  70
  71             # Single-code point maps go to a 'C' type
  72             if (! ref $invmap_ref->[$i]) {
  73                 push @folds, sprintf("%04X; C; %04X\n",
  74                                      $j,
  75                                      $invmap_ref->[$i] + $adjust);
  76             }
  77             else {  # Multi-code point maps go to 'F'.  prop_invmap()
  78                     # guarantees that no adjustment is needed for these,
  79                     # as the range will contain just one element
  80                 push @folds, sprintf("%04X; F; %s\n",
  81                                     $j,
  82                                     join " ", map { sprintf "%04X", $_ }
  83                                                     @{$invmap_ref->[$i]});
  84             }
  85         }
  86     }
  87 }
  88
  89 for (@folds) {
  90     chomp;
  91
  92     # Lines look like (without the initial '#'
  93     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  94     # Get rid of comments, ignore blank or comment-only lines
  95     my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
  96     next unless length $line;
  97     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
  98
  99     my $from = hex $hex_from;
 100
 101     # Perl only deals with C and F folds
 102     next if $fold_type ne 'C' and $fold_type ne 'F';
 103
 104     # Get each code point in the range that participates in this line's fold.
 105     # The hash has keys of each code point in the range, and values of what it
 106     # folds to and what folds to it
 107     foreach my $hex_fold (@folded) {
 108         my $fold = hex $hex_fold;
 109         push @{$folded_closure{$fold}}, $from if $fold < 256;
 110         push @{$folded_closure{$from}}, $fold if $from < 256;
 111     }
 112 }
 113
 114 # Now having read all the lines, combine them into the full closure of each
 115 # code point in the range by adding lists together that share a common element
 116 foreach my $folded (keys %folded_closure) {
 117     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
 118         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
 119     }
 120 }
 121
 122 my @bits;   # Bit map for each code point
 123
 124 foreach my $folded (keys %folded_closure) {
 125     $bits[$folded] = "(1U<<_CC_NONLATIN1_FOLD)" if grep { $_ > 255 }
 126                                                 @{$folded_closure{$folded}};
 127 }
 128
 129 # For each character, calculate which properties it matches.
 130 for my $ord (0..255) {
 131     my $char = chr($ord);
 132     utf8::upgrade($char);   # Important to use Unicode semantics!
 133
 134     # Look at all the properties we care about here.
 135     for my $property (@properties) {
 136         my $name = $property;
 137
 138         # Remove the suffix to get the actual property name.
 139         # Currently the suffixes are '_L1', '_A', and none.
 140         # If is a latin1 version, no further checking is needed.
 141         if (! ($name =~ s/_L1$//)) {
 142
 143             # Here, isn't an _L1.  If its _A, it's automatically false for
 144             # non-ascii.  The only one current one (besides ASCII) without a
 145             # suffix is valid over the whole range.
 146             next if $name =~ s/_A$// && $ord >= 128;
 147
 148         }
 149         my $re;
 150         if ($name eq 'PUNCT') {;
 151
 152             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 153             # just \pP outside it.
 154             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 155         } elsif ($name eq 'CHARNAME_CONT') {;
 156             $re = qr/[-\p{XPosixWord} ():\xa0]/;
 157         } elsif ($name eq 'SPACE') {;
 158             $re = qr/\p{XPerlSpace}/;
 159         } elsif ($name eq 'IDFIRST') {
 160             $re = qr/[_\p{Alpha}]/;
 161         } elsif ($name eq 'PSXSPC') {
 162             $re = qr/[\v\p{Space}]/;
 163         } elsif ($name eq 'WORDCHAR') {
 164             $re = qr/\p{XPosixWord}/;
 165         } elsif ($name eq 'ALNUMC') {
 166             # Like \w, but no underscore
 167             $re = qr/\p{Alnum}/;
 168         } elsif ($name eq 'QUOTEMETA') {
 169             $re = qr/\p{_Perl_Quotemeta}/;
 170         } elsif ($name eq 'NON_FINAL_FOLD') {
 171             $re = qr/\p{_Perl_Non_Final_Folds}/;
 172         } else {    # The remainder have the same name and values as Unicode
 173             $re = eval "qr/\\p{$name}/";
 174             use Carp;
 175             carp $@ if ! defined $re;
 176         }
 177         #print "$ord, $name $property, $re\n";
 178         if ($char =~ $re  # Add this property if matches
 179             || ($name eq 'NON_FINAL_FOLD'
 180                 # Also include chars that fold to the non-final
 181                 && CORE::fc($char) =~ $re))
 182         {
 183             $bits[$ord] .= '|' if $bits[$ord];
 184             $bits[$ord] .= "(1U<<_CC_$property)";
 185         }
 186     }
 187     #print __LINE__, " $ord $char $bits[$ord]\n";
 188 }
 189
 190 # Names of C0 controls
 191 my @C0 = qw (
 192                 NUL
 193                 SOH
 194                 STX
 195                 ETX
 196                 EOT
 197                 ENQ
 198                 ACK
 199                 BEL
 200                 BS
 201                 HT
 202                 LF
 203                 VT
 204                 FF
 205                 CR
 206                 SO
 207                 SI
 208                 DLE
 209                 DC1
 210                 DC2
 211                 DC3
 212                 DC4
 213                 NAK
 214                 SYN
 215                 ETB
 216                 CAN
 217                 EOM
 218                 SUB
 219                 ESC
 220                 FS
 221                 GS
 222                 RS
 223                 US
 224             );
 225
 226 # Names of C1 controls, plus the adjacent DEL
 227 my @C1 = qw(
 228                 DEL
 229                 PAD
 230                 HOP
 231                 BPH
 232                 NBH
 233                 IND
 234                 NEL
 235                 SSA
 236                 ESA
 237                 HTS
 238                 HTJ
 239                 VTS
 240                 PLD
 241                 PLU
 242                 RI
 243                 SS2
 244                 SS3
 245                 DCS
 246                 PU1
 247                 PU2
 248                 STS
 249                 CCH
 250                 MW
 251                 SPA
 252                 EPA
 253                 SOS
 254                 SGC
 255                 SCI
 256                 CSI
 257                 ST
 258                 OSC
 259                 PM
 260                 APC
 261             );
 262
 263 my $out_fh = open_new('l1_char_class_tab.h', '>',
 264                       {style => '*', by => $0,
 265                       from => "property definitions"});
 266
 267 # Output the table using fairly short names for each char.
 268 for my $ord (0..255) {
 269     my $name;
 270     if ($ord < 32) {    # A C0 control
 271         $name = $C0[$ord];
 272     } elsif ($ord > 32 && $ord < 127) { # Graphic
 273         $name = "'" . chr($ord) . "'";
 274     } elsif ($ord >= 127 && $ord <= 0x9f) {
 275         $name = $C1[$ord - 127];    # A C1 control + DEL
 276     } else {    # SPACE, or, if Latin1, shorten the name */
 277         use charnames();
 278         $name = charnames::viacode($ord);
 279         $name =~ s/LATIN CAPITAL LETTER //
 280         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 281     }
 282     printf $out_fh "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 283 }
 284
 285 read_only_bottom_close_and_rename($out_fh)