X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/45fdf108c5f4d5c719b8d2f7389c81e54795bad7..39eb7305b21af40d48ea23a892ee64a72c37418c:/regen/unicode_constants.pl diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 56e5349..baf25f1 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -2,10 +2,12 @@ use v5.16.0; use strict; use warnings; require 'regen/regen_lib.pl'; +require 'regen/charset_translations.pl'; +use Unicode::UCD; use charnames qw(:loose); my $out_fh = open_new('unicode_constants.h', '>', - {style => '*', by => $0, + {style => '*', by => $0, from => "Unicode data"}); print $out_fh <; + +foreach my $charset (get_supported_code_pages()) { + print $out_fh "\n" . get_conditional_compile_line_start($charset); + + my @a2n = @{get_a2n($charset)}; + + for ( @data ) { + chomp; + + # Convert any '#' comments to /* ... */; empty lines and comments are + # output as blank lines + if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { + my $comment_body = $1 // ""; + if ($comment_body ne "") { + print $out_fh "/* $comment_body */\n"; + } + else { + print $out_fh "\n"; + } + next; + } + + unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token + (?: [\ ]+ ( [^ ]* ) )? # optional flag + (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required + /x) + { + die "Unexpected syntax at line $.: $_\n"; + } + + my $name_or_cp = $1; + my $flag = $2; + my $desired_name = $3; + + my $name; + my $cp; + my $U_cp; # code point in Unicode (not-native) terms + + if ($name_or_cp =~ /^U\+(.*)/) { + $U_cp = hex $1; + $name = charnames::viacode($name_or_cp); + if (! defined $name) { + next if $flag =~ /skip_if_undef/; + die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; + $name = ""; + } + } + else { + $name = $name_or_cp; + die "Unknown name '$name' at line $.: $_\n" unless defined $name; + $U_cp = charnames::vianame($name =~ s/_/ /gr); + } + + $cp = ($U_cp < 256) + ? $a2n[$U_cp] + : $U_cp; + + $name = $desired_name if $name eq "" && $desired_name; + $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes + + my $str; + my $suffix; + if (defined $flag && $flag eq 'native') { + die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; + $suffix = '_NATIVE'; + $str = sprintf "0x%02X", $cp; # Is a numeric constant + } + else { + $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset); + + $suffix = '_UTF8'; + if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { + $str = "\"$str\""; # Will be a string constant + } elsif ($flag eq 'tail') { + $str =~ s/\\x..//; # Remove the first byte + $suffix .= '_TAIL'; + $str = "\"$str\""; # Will be a string constant + } + elsif ($flag eq 'first') { + $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte + $suffix .= '_FIRST_BYTE'; + $str = "0x$str"; # Is a numeric constant + } + else { + die "Unknown flag at line $.: $_\n"; + } + } + printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; + } + + my $max_PRINT_A = 0; + for my $i (0x20 .. 0x7E) { + $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; + } + printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A; + + print $out_fh "\n" . get_conditional_compile_line_end(); + +} + +use Unicode::UCD 'prop_invlist'; + +my $count = 0; +my @other_invlist = prop_invlist("Other"); +for (my $i = 0; $i < @other_invlist; $i += 2) { + $count += ((defined $other_invlist[$i+1]) + ? $other_invlist[$i+1] + : 0x110000) + - $other_invlist[$i]; +} +printf $out_fh "\n/* The number of code points not matching \\pC */\n" + . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n", + 0x110000 - $count; + +# If this release has both the CWCM and CWCF properties, find the highest code +# point which changes under any case change. We can use this to short-circuit +# code +my @cwcm = prop_invlist('CWCM'); +if (@cwcm) { + my @cwcf = prop_invlist('CWCF'); + if (@cwcf) { + my $max = ($cwcm[-1] < $cwcf[-1]) + ? $cwcf[-1] + : $cwcm[-1]; + printf $out_fh "\n/* The highest code point that has any type of case change */\n" + . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n", + $max - 1; + } +} + +print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; + +read_only_bottom_close_and_rename($out_fh); + +# DATA FORMAT +# +# A blank line is output as-is. +# Comments (lines whose first non-blank is a '#') are converted to C-style, +# though empty comments are converted to blank lines. Otherwise, each line +# represents one #define, and begins with either a Unicode character name with +# the blanks and dashes in it squeezed out or replaced by underscores; or it +# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter # case, the name will be looked-up to use as the name of the macro. In either -# case, the macro name will have suffixes as listed above, and all blanks will -# be replaced by underscores. +# case, the macro name will have suffixes as listed above, and all blanks and +# dashes will be replaced by underscores. # # Each line may optionally have one of the following flags on it, separated by # white space from the initial token. # string indicates that the output is to be of the string form # described in the comments above that are placed in the file. +# string_skip_ifundef is the same as 'string', but instead of dying if the +# code point doesn't exist, the line is just skipped: no output is +# generated for it # first indicates that the output is to be of the FIRST_BYTE form. # tail indicates that the output is of the _TAIL form. # native indicates that the output is the code point, converted to the # platform's native character set if applicable # +# If the code point has no official name, the desired name may be appended +# after the flag, which will be ignored if there is an official name. +# # This program is used to make it convenient to create compile time constants # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually # having to figure things out. -while ( ) { - if ($_ !~ /\S/) { - print $out_fh "\n"; - next; - } - - chomp; - unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token - (?: [\ ]+ ( .* ) )? # optional flag - /x) - { - die "Unexpected syntax at line $.: $_\n"; - } - - my $name_or_cp = $1; - my $flag = $2; - - my $name; - my $cp; - - if ($name_or_cp =~ /[^[:xdigit:]]/) { - - # Anything that isn't a hex value must be a name. - $name = $name_or_cp; - $cp = charnames::vianame($name =~ s/_/ /gr); - die "Unknown name '$name' at line $.: $_\n" unless defined $name; - } - else { - $cp = $name_or_cp; - $name = charnames::viacode("0$cp"); # viacode requires a leading zero - # to be sure that the argument is hex - die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp; - } - - $name =~ s/ /_/g; # The macro name can have no blanks in it - - my $str = join "", map { sprintf "\\x%02X", $_ } - unpack("U0C*", pack("U", hex $cp)); - - my $suffix = '_UTF8'; - if (! defined $flag || $flag eq 'string') { - $str = "\"$str\""; # Will be a string constant - } elsif ($flag eq 'tail') { - $str =~ s/\\x..//; # Remove the first byte - $suffix .= '_TAIL'; - $str = "\"$str\""; # Will be a string constant - } - elsif ($flag eq 'first') { - $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte - $suffix .= '_FIRST_BYTE'; - $str = "0x$str"; # Is a numeric constant - } - elsif ($flag eq 'native') { - die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff; - $suffix = '_NATIVE'; - $str = utf8::unicode_to_native(hex $cp); - $str = "0x$cp"; # Is a numeric constant - } - else { - die "Unknown flag at line $.: $_\n"; - } - print $out_fh "#define ${name}$suffix $str /* U+$cp */\n"; -} +__DATA__ +U+017F string -print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; +U+0300 string -read_only_bottom_close_and_rename($out_fh); +U+0399 string +U+03BC string -__DATA__ -0300 string -0301 string -0308 string +U+1E9E string_skip_if_undef -03B9 first -03B9 tail +U+FB05 string +U+FB06 string +U+0130 string +U+0131 string -03C5 first -03C5 tail +U+2010 string +BOM first +BOM tail -2010 string +NBSP native +NBSP string -007F native -00DF native -00E5 native -00C5 native -00FF native -00B5 native -0085 native +DEL native +CR native +LF native +VT native +ESC native +U+00DF native +U+00E5 native +U+00C5 native +U+00FF native +U+00B5 native