4 require 'regen/regen_lib.pl';
5 use charnames qw(:loose);
7 my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
13 #ifndef H_UTF8_STRINGS /* Guard against nested #includes */
14 #define H_UTF8_STRINGS 1
16 /* This file contains #defines for various Unicode code points. The values
17 * for the macros are all or portions of the UTF-8 encoding for the code
18 * point. Note that the names all have the suffix "_UTF8".
20 * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
21 * the first byte of the UTF-8 representation; the value will be a numeric
24 * The suffix "_TAIL" is appened if instead it represents all but the first
25 * byte. This, and with no suffix are both string constants */
29 # The data are at the end of this file. A blank line is output as-is.
30 # Otherwise, each line represents one #define, and begins with either a
31 # Unicode character name with the blanks in it squeezed out or replaced by
32 # underscores; or it may be a hexadecimal Unicode code point. In the latter
33 # case, the name will be looked-up to use as the name of the macro. In either
34 # case, the macro name will have suffixes as listed above, and all blanks will
35 # be replaced by underscores.
37 # Each line may optionally have one of the following flags on it, separated by
38 # white space from the initial token.
39 # first indicates that the output is to be of the FIRST_BYTE form
40 # described in the comments above that are placed in the file.
41 # tail indicates that the output is of the _TAIL form.
43 # This program is used to make it convenient to create compile time constants
44 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
45 # having to figure things out.
54 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
55 (?: [\ ]+ ( .* ) )? # optional flag
58 die "Unexpected syntax at line $.: $_\n";
67 if ($name_or_cp =~ /[^[:xdigit:]]/) {
69 # Anything that isn't a hex value must be a name.
71 $cp = charnames::vianame($name =~ s/_/ /gr);
72 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
76 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
77 # to be sure that the argument is hex
78 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
81 $name =~ s/ /_/g; # The macro name can have no blanks in it
83 my $str = join "", map { sprintf "\\x%02X", $_ }
84 unpack("U0C*", pack("U", hex $cp));
87 if (! defined $flag) {
88 $str = "\"$str\""; # Will be a string constant
89 } elsif ($flag eq 'tail') {
90 $str =~ s/\\x..//; # Remove the first byte
92 $str = "\"$str\""; # Will be a string constant
94 elsif ($flag eq 'first') {
95 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
96 $suffix .= '_FIRST_BYTE';
97 $str = "0x$str"; # Is a numeric constant
100 die "Unknown flag at line $.: $_\n";
102 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
105 print $out_fh "\n#endif /* H_UTF8_STRINGS */\n";
107 read_only_bottom_close_and_rename($out_fh);