regen/unicode_constants.pl

   1 use v5.16.0;
   2 use strict;
   3 use warnings;
   4 require 'regen/regen_lib.pl';
   5 require 'regen/charset_translations.pl';
   6 use Unicode::UCD;
   7 use charnames qw(:loose);
   8
   9 my $out_fh = open_new('unicode_constants.h', '>',
  10         {style => '*', by => $0,
  11                       from => "Unicode data"});
  12
  13 print $out_fh <<END;
  14
  15 #ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
  16 #define H_UNICODE_CONSTANTS   1
  17
  18 /* This file contains #defines for the version of Unicode being used and
  19  * various Unicode code points.  The values the code point macros expand to
  20  * are the native Unicode code point, or all or portions of the UTF-8 encoding
  21  * for the code point.  In the former case, the macro name has the suffix
  22  * "_NATIVE"; otherwise, the suffix "_UTF8".
  23  *
  24  * The macros that have the suffix "_UTF8" may have further suffixes, as
  25  * follows:
  26  *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
  27  *                representation; the value will be a numeric constant.
  28  *  "_TAIL"       if instead it represents all but the first byte.  This, and
  29  *                with no additional suffix are both string constants */
  30
  31 END
  32
  33 my $version = Unicode::UCD::UnicodeVersion();
  34 my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
  35 $dotdot = 0 unless defined $dotdot;
  36
  37 print $out_fh <<END;
  38 #define UNICODE_MAJOR_VERSION   $major
  39 #define UNICODE_DOT_VERSION     $dot
  40 #define UNICODE_DOT_DOT_VERSION $dotdot
  41
  42 END
  43
  44 # The data are at __DATA__  in this file.
  45
  46 my @data = <DATA>;
  47
  48 foreach my $charset (get_supported_code_pages()) {
  49     print $out_fh "\n" . get_conditional_compile_line_start($charset);
  50
  51     my @a2n = @{get_a2n($charset)};
  52
  53     for ( @data ) {
  54         chomp;
  55
  56         # Convert any '#' comments to /* ... */; empty lines and comments are
  57         # output as blank lines
  58         if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
  59             my $comment_body = $1 // "";
  60             if ($comment_body ne "") {
  61                 print $out_fh "/* $comment_body */\n";
  62             }
  63             else {
  64                 print $out_fh "\n";
  65             }
  66             next;
  67         }
  68
  69         unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
  70                         (?: [\ ]+ ( [^ ]* ) )?  # optional flag
  71                         (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
  72                     /x)
  73         {
  74             die "Unexpected syntax at line $.: $_\n";
  75         }
  76
  77         my $name_or_cp = $1;
  78         my $flag = $2;
  79         my $desired_name = $3;
  80
  81         my $name;
  82         my $cp;
  83         my $U_cp;   # code point in Unicode (not-native) terms
  84
  85         if ($name_or_cp =~ /^U\+(.*)/) {
  86             $U_cp = hex $1;
  87             $name = charnames::viacode($name_or_cp);
  88             if (! defined $name) {
  89                 next if $flag =~ /skip_if_undef/;
  90                 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
  91                 $name = "";
  92             }
  93         }
  94         else {
  95             $name = $name_or_cp;
  96             die "Unknown name '$name' at line $.: $_\n" unless defined $name;
  97             $U_cp = charnames::vianame($name =~ s/_/ /gr);
  98         }
  99
 100         $cp = ($U_cp < 256)
 101             ? $a2n[$U_cp]
 102             : $U_cp;
 103
 104         $name = $desired_name if $name eq "" && $desired_name;
 105         $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes
 106
 107         my $str;
 108         my $suffix;
 109         if (defined $flag && $flag eq 'native') {
 110             die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
 111             $suffix = '_NATIVE';
 112             $str = sprintf "0x%02X", $cp;        # Is a numeric constant
 113         }
 114         else {
 115             $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
 116
 117             $suffix = '_UTF8';
 118             if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
 119                 $str = "\"$str\"";  # Will be a string constant
 120             } elsif ($flag eq 'tail') {
 121                     $str =~ s/\\x..//;  # Remove the first byte
 122                     $suffix .= '_TAIL';
 123                     $str = "\"$str\"";  # Will be a string constant
 124             }
 125             elsif ($flag eq 'first') {
 126                 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
 127                 $suffix .= '_FIRST_BYTE';
 128                 $str = "0x$str";        # Is a numeric constant
 129             }
 130             else {
 131                 die "Unknown flag at line $.: $_\n";
 132             }
 133         }
 134         printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
 135     }
 136
 137     my $max_PRINT_A = 0;
 138     for my $i (0x20 .. 0x7E) {
 139         $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
 140     }
 141     printf $out_fh "#   define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C   0x%02X   /* The max code point that isPRINT_A */\n", $max_PRINT_A;
 142
 143     print $out_fh "\n" . get_conditional_compile_line_end();
 144
 145 }
 146
 147 use Unicode::UCD 'prop_invlist';
 148
 149 my $count = 0;
 150 my @other_invlist = prop_invlist("Other");
 151 for (my $i = 0; $i < @other_invlist; $i += 2) {
 152     $count += ((defined $other_invlist[$i+1])
 153               ? $other_invlist[$i+1]
 154               : 0x110000)
 155               - $other_invlist[$i];
 156 }
 157 printf $out_fh "\n/* The number of code points not matching \\pC */\n"
 158              . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  %d\n",
 159             0x110000 - $count;
 160
 161 # If this release has both the CWCM and CWCF properties, find the highest code
 162 # point which changes under any case change.  We can use this to short-circuit
 163 # code
 164 my @cwcm = prop_invlist('CWCM');
 165 if (@cwcm) {
 166     my @cwcf = prop_invlist('CWCF');
 167     if (@cwcf) {
 168         my $max = ($cwcm[-1] < $cwcf[-1])
 169                   ? $cwcf[-1]
 170                   : $cwcm[-1];
 171         printf $out_fh "\n/* The highest code point that has any type of case change */\n"
 172              . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C  0x%X\n",
 173             $max - 1;
 174     }
 175 }
 176
 177 print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
 178
 179 read_only_bottom_close_and_rename($out_fh);
 180
 181 # DATA FORMAT
 182 #
 183 # A blank line is output as-is.
 184 # Comments (lines whose first non-blank is a '#') are converted to C-style,
 185 # though empty comments are converted to blank lines.  Otherwise, each line
 186 # represents one #define, and begins with either a Unicode character name with
 187 # the blanks and dashes in it squeezed out or replaced by underscores; or it
 188 # may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
 189 # case, the name will be looked-up to use as the name of the macro.  In either
 190 # case, the macro name will have suffixes as listed above, and all blanks and
 191 # dashes will be replaced by underscores.
 192 #
 193 # Each line may optionally have one of the following flags on it, separated by
 194 # white space from the initial token.
 195 #   string  indicates that the output is to be of the string form
 196 #           described in the comments above that are placed in the file.
 197 #   string_skip_ifundef  is the same as 'string', but instead of dying if the
 198 #           code point doesn't exist, the line is just skipped: no output is
 199 #           generated for it
 200 #   first   indicates that the output is to be of the FIRST_BYTE form.
 201 #   tail    indicates that the output is of the _TAIL form.
 202 #   native  indicates that the output is the code point, converted to the
 203 #           platform's native character set if applicable
 204 #
 205 # If the code point has no official name, the desired name may be appended
 206 # after the flag, which will be ignored if there is an official name.
 207 #
 208 # This program is used to make it convenient to create compile time constants
 209 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
 210 # having to figure things out.
 211
 212 __DATA__
 213 U+017F string
 214
 215 U+0300 string
 216
 217 U+0399 string
 218 U+03BC string
 219
 220 U+1E9E string_skip_if_undef
 221
 222 U+FB05 string
 223 U+FB06 string
 224 U+0130 string
 225 U+0131 string
 226
 227 U+2010 string
 228 BOM first
 229 BOM tail
 230
 231 NBSP native
 232 NBSP string
 233
 234 DEL native
 235 CR  native
 236 LF  native
 237 VT  native
 238 ESC native
 239 U+00DF native
 240 U+00E5 native
 241 U+00C5 native
 242 U+00FF native
 243 U+00B5 native