This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add utility and .h for character's UTF-8
[perl5.git] / regen / utf8_strings.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
7my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
12/* This file contains #defines for various Unicode code points. The values
13 * for the macros are all or portions of the UTF-8 encoding for the code
14 * point. Note that the names all have the suffix "_UTF8".
15 *
16 * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
17 * the first byte of the UTF-8 representation; the value will be a numeric
18 * constant.
19 *
20 * The suffix "_TAIL" is appened if instead it represents all but the first
21 * byte. This, and with no suffix are both string constants */
22
23END
24
25# The data are at the end of this file. Each line represents one #define.
26# Each line begins with either a Unicode character name with the blanks in it
27# squeezed out or replaced by underscores; or it may be a hexadecimal code
28# point. In the latter case, the name will be looked-up to use as the name
29# of the macro. In either case, the macro name will have suffixes as
30# listed above, and all blanks will be replaced by underscores.
31#
32# Each line may optionally have one of the following flags on it, separated by
33# white space from the initial token.
34# first indicates that the output is to be of the FIRST_BYTE form
35# described in the comments above that are placed in the file.
36# tail indicates that the output is of the _TAIL form.
37#
38# This program is used to make it convenient to create compile time constants
39# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
40# having to figure things out.
41
42while ( <DATA> ) {
43 chomp;
44 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
45 (?: [\ ]+ ( .* ) )? # optional flag
46 /x)
47 {
48 die "Unexpected syntax at line $.: $_\n";
49 }
50
51 my $name_or_cp = $1;
52 my $flag = $2;
53
54 my $name;
55 my $cp;
56
57 if ($name_or_cp =~ /[^[:xdigit:]]/) {
58
59 # Anything that isn't a hex value must be a name.
60 $name = $name_or_cp;
61 $cp = charnames::vianame($name =~ s/_/ /gr);
62 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
63 }
64 else {
65 $cp = $name_or_cp;
66 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
67 # to be sure that the argument is hex
68 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
69 }
70
71 $name =~ s/ /_/g; # The macro name can have no blanks in it
72
73 my $str = join "", map { sprintf "\\x%02X", $_ }
74 unpack("U0C*", pack("U", hex $cp));
75
76 my $suffix = '_UTF8';
77 if (! defined $flag) {
78 $str = "\"$str\""; # Will be a string constant
79 } elsif ($flag eq 'tail') {
80 $str =~ s/\\x..//; # Remove the first byte
81 $suffix .= '_TAIL';
82 $str = "\"$str\""; # Will be a string constant
83 }
84 elsif ($flag eq 'first') {
85 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
86 $suffix .= '_FIRST_BYTE';
87 $str = "0x$str"; # Is a numeric constant
88 }
89 else {
90 die "Unknown flag at line $.: $_\n";
91 }
92 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
93}
94
95read_only_bottom_close_and_rename($out_fh);
96
97__DATA__
980300
990301
1000308
10103B9 tail
10203C5 tail
10303B9 first
10403C5 first
1051100
1061160
10711A8
1082010