[perl5.git] / regen / unicode_constants.pl

use v5.16.0;
use strict;
use warnings;
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
use Unicode::UCD;
use charnames qw(:loose);

my $out_fh = open_new('unicode_constants.h', '>',
        {style => '*', by => $0,
                      from => "Unicode data"});

print $out_fh <<END;

#ifndef PERL_UNICODE_CONSTANTS_H_   /* Guard against nested #includes */
#define PERL_UNICODE_CONSTANTS_H_   1

/* This file contains #defines for the version of Unicode being used and
 * various Unicode code points.  The values the code point macros expand to
 * are the native Unicode code point, or all or portions of the UTF-8 encoding
 * for the code point.  In the former case, the macro name has the suffix
 * "_NATIVE"; otherwise, the suffix "_UTF8".
 *
 * The macros that have the suffix "_UTF8" may have further suffixes, as
 * follows:
 *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
 *                representation; the value will be a numeric constant.
 *  "_TAIL"       if instead it represents all but the first byte.  This, and
 *                with no additional suffix are both string constants */

/*
=for apidoc_section \$unicode

=for apidoc AmnU|const char *|BOM_UTF8

This is a macro that evaluates to a string constant of the  UTF-8 bytes that
define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
is compiled on.  This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
bytes.

=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8

This is a macro that evaluates to a string constant of the  UTF-8 bytes that
define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
is compiled on.  This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
bytes.

=cut
*/

END

sub backslash_x_form($$;$) {
    # Output the code point represented by the byte string $bytes as a
    # sequence of \x{} constants.  $bytes should be the UTF-8 for the code
    # point if the final parameter is absent or empty.  Otherwise it should be
    # the Latin1 code point itself.
    #
    # The output is translated into the character set '$charset'.

    my ($bytes, $charset, $non_utf8) = @_;
    if ($non_utf8) {
        die "Must be utf8 if above 255" if $bytes > 255;
        my $a2n = get_a2n($charset);
        return sprintf "\\x%02X", $a2n->[$bytes];
    }
    else {
        return join "", map { sprintf "\\x%02X", ord $_ }
                        split //, cp_2_utfbytes($bytes, $charset);
    }
}

my $version = Unicode::UCD::UnicodeVersion();
my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
$dotdot = 0 unless defined $dotdot;

print $out_fh <<END;
#define UNICODE_MAJOR_VERSION   $major
#define UNICODE_DOT_VERSION     $dot
#define UNICODE_DOT_DOT_VERSION $dotdot

END

# The data are at __DATA__  in this file.

my @data = <DATA>;

foreach my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    my @a2n = @{get_a2n($charset)};

    for ( @data ) {
        chomp;

        # Convert any '#' comments to /* ... */; empty lines and comments are
        # output as blank lines
        if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
            my $comment_body = $1 // "";
            if ($comment_body ne "") {
                print $out_fh "/* $comment_body */\n";
            }
            else {
                print $out_fh "\n";
            }
            next;
        }

        unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
                        (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                        (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
                    /x)
        {
            die "Unexpected syntax at line $.: $_\n";
        }

        my $name_or_cp = $1;
        my $flag = $2;
        my $desired_name = $3;

        my $name;
        my $cp;
        my $U_cp;   # code point in Unicode (not-native) terms

        if ($name_or_cp =~ /^U\+(.*)/) {
            $U_cp = hex $1;
            $name = charnames::viacode($name_or_cp);
            if (! defined $name) {
                next if $flag =~ /skip_if_undef/;
                die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
                $name = "";
            }
        }
        else {
            $name = $name_or_cp;
            die "Unknown name '$name' at line $.: $_\n" unless defined $name;
            $U_cp = charnames::vianame($name =~ s/_/ /gr);
        }

        $cp = ($U_cp < 256)
            ? $a2n[$U_cp]
            : $U_cp;

        $name = $desired_name if $name eq "" && $desired_name;
        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes

        my $str;
        my $suffix;
        if (defined $flag && $flag eq 'native') {
            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
            $suffix = '_NATIVE';
            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
        }
        else {
            $str = backslash_x_form($U_cp, $charset);

            $suffix = '_UTF8';
            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
                $str = "\"$str\"";  # Will be a string constant
            } elsif ($flag eq 'tail') {
                    $str =~ s/\\x..//;  # Remove the first byte
                    $suffix .= '_TAIL';
                    $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'first') {
                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
                $suffix .= '_FIRST_BYTE';
                $str = "0x$str";        # Is a numeric constant
            }
            else {
                die "Unknown flag at line $.: $_\n";
            }
        }
        printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
    }

    my $max_PRINT_A = 0;
    for my $i (0x20 .. 0x7E) {
        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
    }
    $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
    print $out_fh <<"EOT";

#   ifdef PERL_IN_REGCOMP_C
#     define MAX_PRINT_A  $max_PRINT_A   /* The max code point that isPRINT_A */
#   endif
EOT

    print $out_fh get_conditional_compile_line_end();

}

use Unicode::UCD 'prop_invlist';

my $count = 0;
my @other_invlist = prop_invlist("Other");
for (my $i = 0; $i < @other_invlist; $i += 2) {
    $count += ((defined $other_invlist[$i+1])
              ? $other_invlist[$i+1]
              : 0x110000)
              - $other_invlist[$i];
}
$count = 0x110000 - $count;
print $out_fh <<~"EOT";

    /* The number of code points not matching \\pC */
    #ifdef PERL_IN_REGCOMP_C
    #  define NON_OTHER_COUNT  $count
    #endif
    EOT

# If this release has both the CWCM and CWCF properties, find the highest code
# point which changes under any case change.  We can use this to short-circuit
# code
my @cwcm = prop_invlist('CWCM');
if (@cwcm) {
    my @cwcf = prop_invlist('CWCF');
    if (@cwcf) {
        my $max = ($cwcm[-1] < $cwcf[-1])
                  ? $cwcf[-1]
                  : $cwcm[-1];
        $max = sprintf "0x%X", $max - 1;
        print $out_fh <<~"EOS";

            /* The highest code point that has any type of case change */
            #ifdef PERL_IN_UTF8_C
            #  define HIGHEST_CASE_CHANGING_CP  $max
            #endif
            EOS
    }
}

print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";

read_only_bottom_close_and_rename($out_fh);

# DATA FORMAT
#
# Note that any apidoc comments you want in the file need to be added to one
# of the prints above
#
# A blank line is output as-is.
# Comments (lines whose first non-blank is a '#') are converted to C-style,
# though empty comments are converted to blank lines.  Otherwise, each line
# represents one #define, and begins with either a Unicode character name with
# the blanks and dashes in it squeezed out or replaced by underscores; or it
# may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
# case, the name will be looked-up to use as the name of the macro.  In either
# case, the macro name will have suffixes as listed above, and all blanks and
# dashes will be replaced by underscores.
#
# Each line may optionally have one of the following flags on it, separated by
# white space from the initial token.
#   string  indicates that the output is to be of the string form
#           described in the comments above that are placed in the file.
#   string_skip_ifundef  is the same as 'string', but instead of dying if the
#           code point doesn't exist, the line is just skipped: no output is
#           generated for it
#   first   indicates that the output is to be of the FIRST_BYTE form.
#   tail    indicates that the output is of the _TAIL form.
#   native  indicates that the output is the code point, converted to the
#           platform's native character set if applicable
#
# If the code point has no official name, the desired name may be appended
# after the flag, which will be ignored if there is an official name.
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
# having to figure things out.

__DATA__
U+017F string

U+0300 string
U+0307 string

U+1E9E string_skip_if_undef

U+FB05 string
U+FB06 string
U+0130 string
U+0131 string

U+2010 string
BOM first
BOM tail

BOM string

U+FFFD string

U+10FFFF string MAX_UNICODE

NBSP native
NBSP string

DEL native
CR  native
LF  native
VT  native
ESC native
U+00DF native
U+00DF string
U+00E5 native
U+00C5 native
U+00FF native
U+00B5 native
U+00B5 string
Commit	Line	Data
61dad979 KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
3d7c117d MB	4	require './regen/regen_lib.pl';
3d7c117d MB	5	require './regen/charset_translations.pl';
4b4853d1	6	use Unicode::UCD;
61dad979 KW	7	use charnames qw(:loose);
61dad979 KW	8
1b0f46bf	9	my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb	10	{style => '*', by => $0,
61dad979 KW	11	from => "Unicode data"});
	12
	13	print $out_fh <<END;
d10c72f2	14
6a5bc5ac KW	15	#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
6a5bc5ac KW	16	#define PERL_UNICODE_CONSTANTS_H_ 1
d10c72f2	17
4b4853d1 KW	18	/* This file contains #defines for the version of Unicode being used and
	19	* various Unicode code points. The values the code point macros expand to
	20	* are the native Unicode code point, or all or portions of the UTF-8 encoding
	21	* for the code point. In the former case, the macro name has the suffix
	22	* "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979	23	*
525b6419 KW	24	* The macros that have the suffix "_UTF8" may have further suffixes, as
	25	* follows:
	26	* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
	27	* representation; the value will be a numeric constant.
	28	* "_TAIL" if instead it represents all but the first byte. This, and
	29	* with no additional suffix are both string constants */
61dad979	30
69bc4c1f	31	/*
3f620621	32	=for apidoc_section \$unicode
69bc4c1f	33
78342678	34	=for apidoc AmnU\|const char *\|BOM_UTF8
69bc4c1f KW	35
	36	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	37	define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
	38	is compiled on. This allows code to use a mnemonic for this character that
	39	works on both ASCII and EBCDIC platforms.
	40	S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
	41	bytes.
	42
78342678	43	=for apidoc AmnU\|const char *\|REPLACEMENT_CHARACTER_UTF8
69bc4c1f KW	44
	45	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	46	define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
	47	is compiled on. This allows code to use a mnemonic for this character that
	48	works on both ASCII and EBCDIC platforms.
	49	S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
	50	bytes.
	51
	52	=cut
	53	*/
	54
61dad979 KW	55	END
61dad979 KW	56
63cd44e4 KW	57	sub backslash_x_form($$;$) {
	58	# Output the code point represented by the byte string $bytes as a
	59	# sequence of \x{} constants. $bytes should be the UTF-8 for the code
	60	# point if the final parameter is absent or empty. Otherwise it should be
	61	# the Latin1 code point itself.
	62	#
	63	# The output is translated into the character set '$charset'.
	64
	65	my ($bytes, $charset, $non_utf8) = @_;
	66	if ($non_utf8) {
	67	die "Must be utf8 if above 255" if $bytes > 255;
	68	my $a2n = get_a2n($charset);
	69	return sprintf "\\x%02X", $a2n->[$bytes];
	70	}
	71	else {
	72	return join "", map { sprintf "\\x%02X", ord $_ }
	73	split //, cp_2_utfbytes($bytes, $charset);
	74	}
	75	}
	76
4b4853d1 KW	77	my $version = Unicode::UCD::UnicodeVersion();
	78	my ($major, $dot, $dotdot) = $version =~ / (.?) \. (.?) (?: \. (.*) )? $ /x;
	79	$dotdot = 0 unless defined $dotdot;
	80
	81	print $out_fh <<END;
	82	#define UNICODE_MAJOR_VERSION $major
	83	#define UNICODE_DOT_VERSION $dot
	84	#define UNICODE_DOT_DOT_VERSION $dotdot
	85
	86	END
	87
9d8e3074	88	# The data are at __DATA__ in this file.
61dad979	89
ad88cddb KW	90	my @data = <DATA>;
	91
	92	foreach my $charset (get_supported_code_pages()) {
	93	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	94
c30a0cf2	95	my @a2n = @{get_a2n($charset)};
ad88cddb	96
4a4b1311 KW	97	for ( @data ) {
	98	chomp;
	99
	100	# Convert any '#' comments to /* ... */; empty lines and comments are
	101	# output as blank lines
	102	if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
	103	my $comment_body = $1 // "";
	104	if ($comment_body ne "") {
	105	print $out_fh "/* $comment_body */\n";
	106	}
	107	else {
	108	print $out_fh "\n";
	109	}
	110	next;
5a731a17	111	}
76837d21	112
4a4b1311 KW	113	unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
	114	(?: [\ ]+ ( [^ ]* ) )? # optional flag
	115	(?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
	116	/x)
	117	{
	118	die "Unexpected syntax at line $.: $_\n";
	119	}
61dad979	120
4a4b1311 KW	121	my $name_or_cp = $1;
	122	my $flag = $2;
	123	my $desired_name = $3;
	124
	125	my $name;
	126	my $cp;
	127	my $U_cp; # code point in Unicode (not-native) terms
4a4b1311 KW	128
	129	if ($name_or_cp =~ /^U\+(.*)/) {
	130	$U_cp = hex $1;
	131	$name = charnames::viacode($name_or_cp);
	132	if (! defined $name) {
280ac755 KW	133	next if $flag =~ /skip_if_undef/;
280ac755 KW	134	die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
4a4b1311 KW	135	$name = "";
	136	}
	137	}
	138	else {
	139	$name = $name_or_cp;
	140	die "Unknown name '$name' at line $.: $_\n" unless defined $name;
	141	$U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80	142	}
61dad979	143
4a4b1311 KW	144	$cp = ($U_cp < 256)
	145	? $a2n[$U_cp]
	146	: $U_cp;
ad88cddb	147
4a4b1311 KW	148	$name = $desired_name if $name eq "" && $desired_name;
4a4b1311 KW	149	$name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979	150
4a4b1311 KW	151	my $str;
	152	my $suffix;
	153	if (defined $flag && $flag eq 'native') {
	154	die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
	155	$suffix = '_NATIVE';
	156	$str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f KW	157	}
81a2a11f KW	158	else {
63cd44e4	159	$str = backslash_x_form($U_cp, $charset);
4a4b1311 KW	160
	161	$suffix = '_UTF8';
	162	if (! defined $flag \|\| $flag =~ /^ string (_skip_if_undef)? $/x) {
	163	$str = "\"$str\""; # Will be a string constant
	164	} elsif ($flag eq 'tail') {
	165	$str =~ s/\\x..//; # Remove the first byte
	166	$suffix .= '_TAIL';
	167	$str = "\"$str\""; # Will be a string constant
	168	}
	169	elsif ($flag eq 'first') {
	170	$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
	171	$suffix .= '_FIRST_BYTE';
	172	$str = "0x$str"; # Is a numeric constant
	173	}
	174	else {
	175	die "Unknown flag at line $.: $_\n";
	176	}
81a2a11f	177	}
4a4b1311	178	printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b	179	}
09cc440d KW	180
	181	my $max_PRINT_A = 0;
	182	for my $i (0x20 .. 0x7E) {
	183	$max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
	184	}
c62fdeb7 KW	185	$max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
c62fdeb7 KW	186	print $out_fh <<"EOT";
09cc440d	187
e80ffeda KW	188	# ifdef PERL_IN_REGCOMP_C
	189	# define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */
	190	# endif
c62fdeb7 KW	191	EOT
	192
	193	print $out_fh get_conditional_compile_line_end();
b35552de KW	194
	195	}
	196
	197	use Unicode::UCD 'prop_invlist';
	198
	199	my $count = 0;
	200	my @other_invlist = prop_invlist("Other");
	201	for (my $i = 0; $i < @other_invlist; $i += 2) {
	202	$count += ((defined $other_invlist[$i+1])
	203	? $other_invlist[$i+1]
	204	: 0x110000)
	205	- $other_invlist[$i];
61dad979	206	}
c62fdeb7 KW	207	$count = 0x110000 - $count;
	208	print $out_fh <<~"EOT";
	209
	210	/* The number of code points not matching \\pC */
	211	#ifdef PERL_IN_REGCOMP_C
	212	# define NON_OTHER_COUNT $count
	213	#endif
	214	EOT
61dad979	215
3bfc1e70 KW	216	# If this release has both the CWCM and CWCF properties, find the highest code
	217	# point which changes under any case change. We can use this to short-circuit
	218	# code
	219	my @cwcm = prop_invlist('CWCM');
	220	if (@cwcm) {
	221	my @cwcf = prop_invlist('CWCF');
	222	if (@cwcf) {
	223	my $max = ($cwcm[-1] < $cwcf[-1])
	224	? $cwcf[-1]
	225	: $cwcm[-1];
c62fdeb7 KW	226	$max = sprintf "0x%X", $max - 1;
	227	print $out_fh <<~"EOS";
	228
	229	/* The highest code point that has any type of case change */
	230	#ifdef PERL_IN_UTF8_C
	231	# define HIGHEST_CASE_CHANGING_CP $max
	232	#endif
	233	EOS
3bfc1e70 KW	234	}
	235	}
	236
6a5bc5ac	237	print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";
d10c72f2	238
61dad979 KW	239	read_only_bottom_close_and_rename($out_fh);
61dad979 KW	240
9d8e3074 KW	241	# DATA FORMAT
9d8e3074 KW	242	#
69bc4c1f KW	243	# Note that any apidoc comments you want in the file need to be added to one
	244	# of the prints above
	245	#
9d8e3074 KW	246	# A blank line is output as-is.
	247	# Comments (lines whose first non-blank is a '#') are converted to C-style,
	248	# though empty comments are converted to blank lines. Otherwise, each line
	249	# represents one #define, and begins with either a Unicode character name with
	250	# the blanks and dashes in it squeezed out or replaced by underscores; or it
	251	# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
	252	# case, the name will be looked-up to use as the name of the macro. In either
	253	# case, the macro name will have suffixes as listed above, and all blanks and
	254	# dashes will be replaced by underscores.
	255	#
	256	# Each line may optionally have one of the following flags on it, separated by
	257	# white space from the initial token.
	258	# string indicates that the output is to be of the string form
	259	# described in the comments above that are placed in the file.
	260	# string_skip_ifundef is the same as 'string', but instead of dying if the
	261	# code point doesn't exist, the line is just skipped: no output is
	262	# generated for it
	263	# first indicates that the output is to be of the FIRST_BYTE form.
	264	# tail indicates that the output is of the _TAIL form.
	265	# native indicates that the output is the code point, converted to the
	266	# platform's native character set if applicable
	267	#
	268	# If the code point has no official name, the desired name may be appended
	269	# after the flag, which will be ignored if there is an official name.
	270	#
	271	# This program is used to make it convenient to create compile time constants
	272	# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
	273	# having to figure things out.
	274
61dad979	275	__DATA__
f2e06375	276	U+017F string
76837d21	277
1dfa4f52	278	U+0300 string
2a614cdc	279	U+0307 string
a78bc3c6	280
8f57fa7d	281	U+1E9E string_skip_if_undef
f2e06375	282
a9f50d33 KW	283	U+FB05 string
a9f50d33 KW	284	U+FB06 string
a0ffb25e KW	285	U+0130 string
a0ffb25e KW	286	U+0131 string
a9f50d33	287
1dfa4f52	288	U+2010 string
5f0aa340 KW	289	BOM first
5f0aa340 KW	290	BOM tail
525b6419	291
69bc4c1f KW	292	BOM string
	293
	294	U+FFFD string
	295
566efd88 KW	296	U+10FFFF string MAX_UNICODE
566efd88 KW	297
df758df2 KW	298	NBSP native
	299	NBSP string
	300
05016631	301	DEL native
c5eda08a KW	302	CR native
c5eda08a KW	303	LF native
d804860b KW	304	VT native
d804860b KW	305	ESC native
1dfa4f52	306	U+00DF native
69ffc8e3	307	U+00DF string
1dfa4f52 KW	308	U+00E5 native
	309	U+00C5 native
	310	U+00FF native
	311	U+00B5 native
69ffc8e3	312	U+00B5 string