This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move 2 functions from utf8.c to regexec.c
[perl5.git] / regen / unicode_constants.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
1b0f46bf 7my $out_fh = open_new('unicode_constants.h', '>',
61dad979
KW
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
d10c72f2 12
1b0f46bf
KW
13#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
14#define H_UNICODE_CONSTANTS 1
d10c72f2 15
61dad979 16/* This file contains #defines for various Unicode code points. The values
525b6419
KW
17 * the macros expand to are the native Unicode code point, or all or portions
18 * of the UTF-8 encoding for the code point. In the former case, the macro
19 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 20 *
525b6419
KW
21 * The macros that have the suffix "_UTF8" may have further suffixes, as
22 * follows:
23 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
24 * representation; the value will be a numeric constant.
25 * "_TAIL" if instead it represents all but the first byte. This, and
26 * with no additional suffix are both string constants */
61dad979
KW
27
28END
29
76837d21
KW
30# The data are at the end of this file. A blank line is output as-is.
31# Otherwise, each line represents one #define, and begins with either a
32# Unicode character name with the blanks in it squeezed out or replaced by
33# underscores; or it may be a hexadecimal Unicode code point. In the latter
34# case, the name will be looked-up to use as the name of the macro. In either
35# case, the macro name will have suffixes as listed above, and all blanks will
36# be replaced by underscores.
61dad979
KW
37#
38# Each line may optionally have one of the following flags on it, separated by
39# white space from the initial token.
5f1720e9 40# string indicates that the output is to be of the string form
61dad979 41# described in the comments above that are placed in the file.
5f1720e9 42# first indicates that the output is to be of the FIRST_BYTE form.
61dad979 43# tail indicates that the output is of the _TAIL form.
525b6419
KW
44# native indicates that the output is the code point, converted to the
45# platform's native character set if applicable
61dad979
KW
46#
47# This program is used to make it convenient to create compile time constants
48# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
49# having to figure things out.
50
51while ( <DATA> ) {
76837d21
KW
52 if ($_ !~ /\S/) {
53 print $out_fh "\n";
54 next;
55 }
56
61dad979
KW
57 chomp;
58 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
59 (?: [\ ]+ ( .* ) )? # optional flag
60 /x)
61 {
62 die "Unexpected syntax at line $.: $_\n";
63 }
64
65 my $name_or_cp = $1;
66 my $flag = $2;
67
68 my $name;
69 my $cp;
70
71 if ($name_or_cp =~ /[^[:xdigit:]]/) {
72
73 # Anything that isn't a hex value must be a name.
74 $name = $name_or_cp;
75 $cp = charnames::vianame($name =~ s/_/ /gr);
76 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
77 }
78 else {
79 $cp = $name_or_cp;
80 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
81 # to be sure that the argument is hex
82 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
83 }
84
85 $name =~ s/ /_/g; # The macro name can have no blanks in it
86
87 my $str = join "", map { sprintf "\\x%02X", $_ }
88 unpack("U0C*", pack("U", hex $cp));
89
90 my $suffix = '_UTF8';
5f1720e9 91 if (! defined $flag || $flag eq 'string') {
61dad979
KW
92 $str = "\"$str\""; # Will be a string constant
93 } elsif ($flag eq 'tail') {
94 $str =~ s/\\x..//; # Remove the first byte
95 $suffix .= '_TAIL';
96 $str = "\"$str\""; # Will be a string constant
97 }
98 elsif ($flag eq 'first') {
99 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
100 $suffix .= '_FIRST_BYTE';
101 $str = "0x$str"; # Is a numeric constant
102 }
525b6419
KW
103 elsif ($flag eq 'native') {
104 die "Are you sure you want to run this on an above-Latin1 code point?" if hex $cp > 0xff;
105 $suffix = '_NATIVE';
106 $str = utf8::unicode_to_native(hex $cp);
107 $str = "0x$cp"; # Is a numeric constant
108 }
61dad979
KW
109 else {
110 die "Unknown flag at line $.: $_\n";
111 }
112 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
113}
114
1b0f46bf 115print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2 116
61dad979
KW
117read_only_bottom_close_and_rename($out_fh);
118
119__DATA__
5f1720e9
KW
1200300 string
1210301 string
1220308 string
76837d21 123
61dad979 12403B9 first
76837d21
KW
12503B9 tail
126
61dad979 12703C5 first
76837d21
KW
12803C5 tail
129
5f1720e9 1302010 string
525b6419
KW
131
132007F native
13300DF native
13400E5 native
13500C5 native
13600FF native
13700B5 native
1380085 native