| 1 | use v5.16.0; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | require './regen/regen_lib.pl'; |
| 5 | require './regen/charset_translations.pl'; |
| 6 | use Unicode::UCD; |
| 7 | use charnames qw(:loose); |
| 8 | |
| 9 | my $out_fh = open_new('unicode_constants.h', '>', |
| 10 | {style => '*', by => $0, |
| 11 | from => "Unicode data"}); |
| 12 | |
| 13 | print $out_fh <<END; |
| 14 | |
| 15 | #ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */ |
| 16 | #define PERL_UNICODE_CONSTANTS_H_ 1 |
| 17 | |
| 18 | /* This file contains #defines for the version of Unicode being used and |
| 19 | * various Unicode code points. The values the code point macros expand to |
| 20 | * are the native Unicode code point, or all or portions of the UTF-8 encoding |
| 21 | * for the code point. In the former case, the macro name has the suffix |
| 22 | * "_NATIVE"; otherwise, the suffix "_UTF8". |
| 23 | * |
| 24 | * The macros that have the suffix "_UTF8" may have further suffixes, as |
| 25 | * follows: |
| 26 | * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 |
| 27 | * representation; the value will be a numeric constant. |
| 28 | * "_TAIL" if instead it represents all but the first byte. This, and |
| 29 | * with no additional suffix are both string constants */ |
| 30 | |
| 31 | /* |
| 32 | =head1 Unicode Support |
| 33 | |
| 34 | =for apidoc AmnU|const char *|BOM_UTF8 |
| 35 | |
| 36 | This is a macro that evaluates to a string constant of the UTF-8 bytes that |
| 37 | define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl |
| 38 | is compiled on. This allows code to use a mnemonic for this character that |
| 39 | works on both ASCII and EBCDIC platforms. |
| 40 | S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in |
| 41 | bytes. |
| 42 | |
| 43 | =for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8 |
| 44 | |
| 45 | This is a macro that evaluates to a string constant of the UTF-8 bytes that |
| 46 | define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl |
| 47 | is compiled on. This allows code to use a mnemonic for this character that |
| 48 | works on both ASCII and EBCDIC platforms. |
| 49 | S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in |
| 50 | bytes. |
| 51 | |
| 52 | =cut |
| 53 | */ |
| 54 | |
| 55 | END |
| 56 | |
| 57 | my $version = Unicode::UCD::UnicodeVersion(); |
| 58 | my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; |
| 59 | $dotdot = 0 unless defined $dotdot; |
| 60 | |
| 61 | print $out_fh <<END; |
| 62 | #define UNICODE_MAJOR_VERSION $major |
| 63 | #define UNICODE_DOT_VERSION $dot |
| 64 | #define UNICODE_DOT_DOT_VERSION $dotdot |
| 65 | |
| 66 | END |
| 67 | |
| 68 | # The data are at __DATA__ in this file. |
| 69 | |
| 70 | my @data = <DATA>; |
| 71 | |
| 72 | foreach my $charset (get_supported_code_pages()) { |
| 73 | print $out_fh "\n" . get_conditional_compile_line_start($charset); |
| 74 | |
| 75 | my @a2n = @{get_a2n($charset)}; |
| 76 | |
| 77 | for ( @data ) { |
| 78 | chomp; |
| 79 | |
| 80 | # Convert any '#' comments to /* ... */; empty lines and comments are |
| 81 | # output as blank lines |
| 82 | if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { |
| 83 | my $comment_body = $1 // ""; |
| 84 | if ($comment_body ne "") { |
| 85 | print $out_fh "/* $comment_body */\n"; |
| 86 | } |
| 87 | else { |
| 88 | print $out_fh "\n"; |
| 89 | } |
| 90 | next; |
| 91 | } |
| 92 | |
| 93 | unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token |
| 94 | (?: [\ ]+ ( [^ ]* ) )? # optional flag |
| 95 | (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required |
| 96 | /x) |
| 97 | { |
| 98 | die "Unexpected syntax at line $.: $_\n"; |
| 99 | } |
| 100 | |
| 101 | my $name_or_cp = $1; |
| 102 | my $flag = $2; |
| 103 | my $desired_name = $3; |
| 104 | |
| 105 | my $name; |
| 106 | my $cp; |
| 107 | my $U_cp; # code point in Unicode (not-native) terms |
| 108 | |
| 109 | if ($name_or_cp =~ /^U\+(.*)/) { |
| 110 | $U_cp = hex $1; |
| 111 | $name = charnames::viacode($name_or_cp); |
| 112 | if (! defined $name) { |
| 113 | next if $flag =~ /skip_if_undef/; |
| 114 | die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; |
| 115 | $name = ""; |
| 116 | } |
| 117 | } |
| 118 | else { |
| 119 | $name = $name_or_cp; |
| 120 | die "Unknown name '$name' at line $.: $_\n" unless defined $name; |
| 121 | $U_cp = charnames::vianame($name =~ s/_/ /gr); |
| 122 | } |
| 123 | |
| 124 | $cp = ($U_cp < 256) |
| 125 | ? $a2n[$U_cp] |
| 126 | : $U_cp; |
| 127 | |
| 128 | $name = $desired_name if $name eq "" && $desired_name; |
| 129 | $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes |
| 130 | |
| 131 | my $str; |
| 132 | my $suffix; |
| 133 | if (defined $flag && $flag eq 'native') { |
| 134 | die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; |
| 135 | $suffix = '_NATIVE'; |
| 136 | $str = sprintf "0x%02X", $cp; # Is a numeric constant |
| 137 | } |
| 138 | else { |
| 139 | $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset); |
| 140 | |
| 141 | $suffix = '_UTF8'; |
| 142 | if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { |
| 143 | $str = "\"$str\""; # Will be a string constant |
| 144 | } elsif ($flag eq 'tail') { |
| 145 | $str =~ s/\\x..//; # Remove the first byte |
| 146 | $suffix .= '_TAIL'; |
| 147 | $str = "\"$str\""; # Will be a string constant |
| 148 | } |
| 149 | elsif ($flag eq 'first') { |
| 150 | $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte |
| 151 | $suffix .= '_FIRST_BYTE'; |
| 152 | $str = "0x$str"; # Is a numeric constant |
| 153 | } |
| 154 | else { |
| 155 | die "Unknown flag at line $.: $_\n"; |
| 156 | } |
| 157 | } |
| 158 | printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; |
| 159 | } |
| 160 | |
| 161 | my $max_PRINT_A = 0; |
| 162 | for my $i (0x20 .. 0x7E) { |
| 163 | $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; |
| 164 | } |
| 165 | printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A; |
| 166 | |
| 167 | print $out_fh "\n" . get_conditional_compile_line_end(); |
| 168 | |
| 169 | } |
| 170 | |
| 171 | use Unicode::UCD 'prop_invlist'; |
| 172 | |
| 173 | my $count = 0; |
| 174 | my @other_invlist = prop_invlist("Other"); |
| 175 | for (my $i = 0; $i < @other_invlist; $i += 2) { |
| 176 | $count += ((defined $other_invlist[$i+1]) |
| 177 | ? $other_invlist[$i+1] |
| 178 | : 0x110000) |
| 179 | - $other_invlist[$i]; |
| 180 | } |
| 181 | printf $out_fh "\n/* The number of code points not matching \\pC */\n" |
| 182 | . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n", |
| 183 | 0x110000 - $count; |
| 184 | |
| 185 | # If this release has both the CWCM and CWCF properties, find the highest code |
| 186 | # point which changes under any case change. We can use this to short-circuit |
| 187 | # code |
| 188 | my @cwcm = prop_invlist('CWCM'); |
| 189 | if (@cwcm) { |
| 190 | my @cwcf = prop_invlist('CWCF'); |
| 191 | if (@cwcf) { |
| 192 | my $max = ($cwcm[-1] < $cwcf[-1]) |
| 193 | ? $cwcf[-1] |
| 194 | : $cwcm[-1]; |
| 195 | printf $out_fh "\n/* The highest code point that has any type of case change */\n" |
| 196 | . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n", |
| 197 | $max - 1; |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n"; |
| 202 | |
| 203 | read_only_bottom_close_and_rename($out_fh); |
| 204 | |
| 205 | # DATA FORMAT |
| 206 | # |
| 207 | # Note that any apidoc comments you want in the file need to be added to one |
| 208 | # of the prints above |
| 209 | # |
| 210 | # A blank line is output as-is. |
| 211 | # Comments (lines whose first non-blank is a '#') are converted to C-style, |
| 212 | # though empty comments are converted to blank lines. Otherwise, each line |
| 213 | # represents one #define, and begins with either a Unicode character name with |
| 214 | # the blanks and dashes in it squeezed out or replaced by underscores; or it |
| 215 | # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter |
| 216 | # case, the name will be looked-up to use as the name of the macro. In either |
| 217 | # case, the macro name will have suffixes as listed above, and all blanks and |
| 218 | # dashes will be replaced by underscores. |
| 219 | # |
| 220 | # Each line may optionally have one of the following flags on it, separated by |
| 221 | # white space from the initial token. |
| 222 | # string indicates that the output is to be of the string form |
| 223 | # described in the comments above that are placed in the file. |
| 224 | # string_skip_ifundef is the same as 'string', but instead of dying if the |
| 225 | # code point doesn't exist, the line is just skipped: no output is |
| 226 | # generated for it |
| 227 | # first indicates that the output is to be of the FIRST_BYTE form. |
| 228 | # tail indicates that the output is of the _TAIL form. |
| 229 | # native indicates that the output is the code point, converted to the |
| 230 | # platform's native character set if applicable |
| 231 | # |
| 232 | # If the code point has no official name, the desired name may be appended |
| 233 | # after the flag, which will be ignored if there is an official name. |
| 234 | # |
| 235 | # This program is used to make it convenient to create compile time constants |
| 236 | # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually |
| 237 | # having to figure things out. |
| 238 | |
| 239 | __DATA__ |
| 240 | U+017F string |
| 241 | |
| 242 | U+0300 string |
| 243 | U+0307 string |
| 244 | |
| 245 | U+1E9E string_skip_if_undef |
| 246 | |
| 247 | U+FB05 string |
| 248 | U+FB06 string |
| 249 | U+0130 string |
| 250 | U+0131 string |
| 251 | |
| 252 | U+2010 string |
| 253 | BOM first |
| 254 | BOM tail |
| 255 | |
| 256 | BOM string |
| 257 | |
| 258 | U+FFFD string |
| 259 | |
| 260 | U+10FFFF string MAX_UNICODE |
| 261 | |
| 262 | NBSP native |
| 263 | NBSP string |
| 264 | |
| 265 | DEL native |
| 266 | CR native |
| 267 | LF native |
| 268 | VT native |
| 269 | ESC native |
| 270 | U+00DF native |
| 271 | U+00E5 native |
| 272 | U+00C5 native |
| 273 | U+00FF native |
| 274 | U+00B5 native |