This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/utf8_strings.pl: Add ability to get native charset
[perl5.git] / regen / utf8_strings.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
7my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
d10c72f2
KW
12
13#ifndef H_UTF8_STRINGS /* Guard against nested #includes */
14#define H_UTF8_STRINGS 1
15
61dad979 16/* This file contains #defines for various Unicode code points. The values
525b6419
KW
17 * the macros expand to are the native Unicode code point, or all or portions
18 * of the UTF-8 encoding for the code point. In the former case, the macro
19 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 20 *
525b6419
KW
21 * The macros that have the suffix "_UTF8" may have further suffixes, as
22 * follows:
23 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
24 * representation; the value will be a numeric constant.
25 * "_TAIL" if instead it represents all but the first byte. This, and
26 * with no additional suffix are both string constants */
61dad979
KW
27
28END
29
76837d21
KW
30# The data are at the end of this file. A blank line is output as-is.
31# Otherwise, each line represents one #define, and begins with either a
32# Unicode character name with the blanks in it squeezed out or replaced by
33# underscores; or it may be a hexadecimal Unicode code point. In the latter
34# case, the name will be looked-up to use as the name of the macro. In either
35# case, the macro name will have suffixes as listed above, and all blanks will
36# be replaced by underscores.
61dad979
KW
37#
38# Each line may optionally have one of the following flags on it, separated by
39# white space from the initial token.
5f1720e9 40# string indicates that the output is to be of the string form
61dad979 41# described in the comments above that are placed in the file.
5f1720e9 42# first indicates that the output is to be of the FIRST_BYTE form.
61dad979 43# tail indicates that the output is of the _TAIL form.
525b6419
KW
44# native indicates that the output is the code point, converted to the
45# platform's native character set if applicable
61dad979
KW
46#
47# This program is used to make it convenient to create compile time constants
48# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
49# having to figure things out.
50
51while ( <DATA> ) {
76837d21
KW
52 if ($_ !~ /\S/) {
53 print $out_fh "\n";
54 next;
55 }
56
61dad979
KW
57 chomp;
58 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
59 (?: [\ ]+ ( .* ) )? # optional flag
60 /x)
61 {
62 die "Unexpected syntax at line $.: $_\n";
63 }
64
65 my $name_or_cp = $1;
66 my $flag = $2;
67
68 my $name;
69 my $cp;
70
71 if ($name_or_cp =~ /[^[:xdigit:]]/) {
72
73 # Anything that isn't a hex value must be a name.
74 $name = $name_or_cp;
75 $cp = charnames::vianame($name =~ s/_/ /gr);
76 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
77 }
78 else {
79 $cp = $name_or_cp;
80 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
81 # to be sure that the argument is hex
82 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
83 }
84
85 $name =~ s/ /_/g; # The macro name can have no blanks in it
86
87 my $str = join "", map { sprintf "\\x%02X", $_ }
88 unpack("U0C*", pack("U", hex $cp));
89
90 my $suffix = '_UTF8';
5f1720e9 91 if (! defined $flag || $flag eq 'string') {
61dad979
KW
92 $str = "\"$str\""; # Will be a string constant
93 } elsif ($flag eq 'tail') {
94 $str =~ s/\\x..//; # Remove the first byte
95 $suffix .= '_TAIL';
96 $str = "\"$str\""; # Will be a string constant
97 }
98 elsif ($flag eq 'first') {
99 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
100 $suffix .= '_FIRST_BYTE';
101 $str = "0x$str"; # Is a numeric constant
102 }
525b6419
KW
103 elsif ($flag eq 'native') {
104 die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff;
105 $suffix = '_NATIVE';
106 $str = utf8::unicode_to_native(hex $cp);
107 $str = "0x$cp"; # Is a numeric constant
108 }
61dad979
KW
109 else {
110 die "Unknown flag at line $.: $_\n";
111 }
112 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
113}
114
d10c72f2
KW
115print $out_fh "\n#endif /* H_UTF8_STRINGS */\n";
116
61dad979
KW
117read_only_bottom_close_and_rename($out_fh);
118
119__DATA__
5f1720e9
KW
1200300 string
1210301 string
1220308 string
76837d21 123
61dad979 12403B9 first
76837d21
KW
12503B9 tail
126
61dad979 12703C5 first
76837d21
KW
12803C5 tail
129
61dad979
KW
1301100
1311160
13211A8
5f1720e9 1332010 string
525b6419
KW
134
135007F native
13600DF native
13700E5 native
13800C5 native
13900FF native
14000B5 native
1410085 native