regen/unicode_constants.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 require 'regen/regen_lib.pl';
   5 require 'regen/charset_translations.pl';
   6 use charnames qw(:loose);
   7
   8 my $out_fh = open_new('unicode_constants.h', '>',
   9         {style => '*', by => $0,
  10                       from => "Unicode data"});
  11
  12 print $out_fh <<END;
  13
  14 #ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
  15 #define H_UNICODE_CONSTANTS   1
  16
  17 /* This file contains #defines for various Unicode code points.  The values
  18  * the macros expand to are the native Unicode code point, or all or portions
  19  * of the UTF-8 encoding for the code point.  In the former case, the macro
  20  * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
  21  *
  22  * The macros that have the suffix "_UTF8" may have further suffixes, as
  23  * follows:
  24  *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
  25  *                representation; the value will be a numeric constant.
  26  *  "_TAIL"       if instead it represents all but the first byte.  This, and
  27  *                with no additional suffix are both string constants */
  28
  29 END
  30
  31 # The data are at the end of this file.  A blank line is output as-is.
  32 # Comments (lines whose first non-blank is a '#') are converted to C-style,
  33 # though empty comments are converted to blank lines.  Otherwise, each line
  34 # represents one #define, and begins with either a Unicode character name with
  35 # the blanks and dashes in it squeezed out or replaced by underscores; or it
  36 # may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
  37 # case, the name will be looked-up to use as the name of the macro.  In either
  38 # case, the macro name will have suffixes as listed above, and all blanks and
  39 # dashes will be replaced by underscores.
  40 #
  41 # Each line may optionally have one of the following flags on it, separated by
  42 # white space from the initial token.
  43 #   string  indicates that the output is to be of the string form
  44 #           described in the comments above that are placed in the file.
  45 #   string_skip_ifundef  is the same as 'string', but instead of dying if the
  46 #           code point doesn't exist, the line is just skipped: no output is
  47 #           generated for it
  48 #   first   indicates that the output is to be of the FIRST_BYTE form.
  49 #   tail    indicates that the output is of the _TAIL form.
  50 #   native  indicates that the output is the code point, converted to the
  51 #           platform's native character set if applicable
  52 #
  53 # If the code point has no official name, the desired name may be appended
  54 # after the flag, which will be ignored if there is an official name.
  55 #
  56 # This program is used to make it convenient to create compile time constants
  57 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
  58 # having to figure things out.
  59
  60 my @data = <DATA>;
  61
  62 foreach my $charset (get_supported_code_pages()) {
  63     print $out_fh "\n" . get_conditional_compile_line_start($charset);
  64
  65     my @a2n = @{get_a2n($charset)};
  66
  67     for ( @data ) {
  68         chomp;
  69
  70         # Convert any '#' comments to /* ... */; empty lines and comments are
  71         # output as blank lines
  72         if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
  73             my $comment_body = $1 // "";
  74             if ($comment_body ne "") {
  75                 print $out_fh "/* $comment_body */\n";
  76             }
  77             else {
  78                 print $out_fh "\n";
  79             }
  80             next;
  81         }
  82
  83         unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
  84                         (?: [\ ]+ ( [^ ]* ) )?  # optional flag
  85                         (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
  86                     /x)
  87         {
  88             die "Unexpected syntax at line $.: $_\n";
  89         }
  90
  91         my $name_or_cp = $1;
  92         my $flag = $2;
  93         my $desired_name = $3;
  94
  95         my $name;
  96         my $cp;
  97         my $U_cp;   # code point in Unicode (not-native) terms
  98         my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
  99
 100         if ($name_or_cp =~ /^U\+(.*)/) {
 101             $U_cp = hex $1;
 102             $name = charnames::viacode($name_or_cp);
 103             if (! defined $name) {
 104                 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
 105                 $name = "";
 106             }
 107         }
 108         else {
 109             $name = $name_or_cp;
 110             die "Unknown name '$name' at line $.: $_\n" unless defined $name;
 111             $U_cp = charnames::vianame($name =~ s/_/ /gr);
 112         }
 113
 114         $cp = ($U_cp < 256)
 115             ? $a2n[$U_cp]
 116             : $U_cp;
 117
 118         $name = $desired_name if $name eq "" && $desired_name;
 119         $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes
 120
 121         my $str;
 122         my $suffix;
 123         if (defined $flag && $flag eq 'native') {
 124             die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
 125             $suffix = '_NATIVE';
 126             $str = sprintf "0x%02X", $cp;        # Is a numeric constant
 127         }
 128         else {
 129             $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
 130
 131             $suffix = '_UTF8';
 132             if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
 133                 $str = "\"$str\"";  # Will be a string constant
 134             } elsif ($flag eq 'tail') {
 135                     $str =~ s/\\x..//;  # Remove the first byte
 136                     $suffix .= '_TAIL';
 137                     $str = "\"$str\"";  # Will be a string constant
 138             }
 139             elsif ($flag eq 'first') {
 140                 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
 141                 $suffix .= '_FIRST_BYTE';
 142                 $str = "0x$str";        # Is a numeric constant
 143             }
 144             else {
 145                 die "Unknown flag at line $.: $_\n";
 146             }
 147         }
 148         printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
 149     }
 150
 151     my $max_PRINT_A = 0;
 152     for my $i (0x20 .. 0x7E) {
 153         $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
 154     }
 155     printf $out_fh "#   define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C   0x%02X   /* The max code point that isPRINT_A */\n", $max_PRINT_A;
 156
 157     print $out_fh "\n" . get_conditional_compile_line_end();
 158
 159 }
 160
 161 use Unicode::UCD 'prop_invlist';
 162
 163 my $count = 0;
 164 my @other_invlist = prop_invlist("Other");
 165 for (my $i = 0; $i < @other_invlist; $i += 2) {
 166     $count += ((defined $other_invlist[$i+1])
 167               ? $other_invlist[$i+1]
 168               : 0x110000)
 169               - $other_invlist[$i];
 170 }
 171 printf $out_fh "\n/* The number of code points not matching \\pC */\n"
 172              . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  %d\n",
 173             0x110000 - $count;
 174
 175 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
 176
 177 read_only_bottom_close_and_rename($out_fh);
 178
 179 __DATA__
 180 U+017F string
 181
 182 U+0300 string
 183
 184 U+0399 string
 185 U+03BC string
 186
 187 U+1E9E string
 188
 189 U+FB05 string
 190 U+FB06 string
 191
 192 U+2010 string
 193 U+D800 first FIRST_SURROGATE
 194 BOM first
 195 BOM tail
 196
 197 NBSP native
 198 NBSP string
 199
 200 DEL native
 201 CR  native
 202 LF  native
 203 VT  native
 204 ESC native
 205 U+00DF native
 206 U+00E5 native
 207 U+00C5 native
 208 U+00FF native
 209 U+00B5 native