4 no warnings 'experimental::regex_sets';
5 require './regen/regen_lib.pl';
6 require './regen/charset_translations.pl';
7 use Unicode::UCD qw(prop_invlist prop_invmap search_invlist);
8 use charnames qw(:loose);
9 binmode(STDERR, ":utf8");
11 # Set this to 1 temporarily to get on stderr the complete list of paired
12 # string delimiters this generates. This list is suitable for plugging into a
16 # Set this to 1 temporarily to get on stderr the complete list of punctuation
17 # marks and symbols that look to be directional but we didn't include for some
19 my $output_omitteds = 0;
21 my $out_fh = open_new('unicode_constants.h', '>',
22 {style => '*', by => $0,
23 from => "Unicode data"});
27 #ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
28 #define PERL_UNICODE_CONSTANTS_H_ 1
30 /* This file contains #defines for the version of Unicode being used and
31 * various Unicode code points. The values the code point macros expand to
32 * are the native Unicode code point, or all or portions of the UTF-8 encoding
33 * for the code point. In the former case, the macro name has the suffix
34 * "_NATIVE"; otherwise, the suffix "_UTF8".
36 * The macros that have the suffix "_UTF8" may have further suffixes, as
38 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
39 * representation; the value will be a numeric constant.
40 * "_TAIL" if instead it represents all but the first byte. This, and
41 * with no additional suffix are both string constants */
44 =for apidoc_section \$unicode
46 =for apidoc AmnU|const char *|BOM_UTF8
48 This is a macro that evaluates to a string constant of the UTF-8 bytes that
49 define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
50 is compiled on. This allows code to use a mnemonic for this character that
51 works on both ASCII and EBCDIC platforms.
52 S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
55 =for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8
57 This is a macro that evaluates to a string constant of the UTF-8 bytes that
58 define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
59 is compiled on. This allows code to use a mnemonic for this character that
60 works on both ASCII and EBCDIC platforms.
61 S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
69 sub backslash_x_form($$;$) {
70 # Output the code point represented by the byte string $bytes as a
71 # sequence of \x{} constants. $bytes should be the UTF-8 for the code
72 # point if the final parameter is absent or empty. Otherwise it should be
73 # the Latin1 code point itself.
75 # The output is translated into the character set '$charset'.
77 my ($bytes, $charset, $non_utf8) = @_;
79 die "Must be utf8 if above 255" if $bytes > 255;
80 my $a2n = get_a2n($charset);
81 return sprintf "\\x%02X", $a2n->[$bytes];
84 return join "", map { sprintf "\\x%02X", ord $_ }
85 split //, cp_2_utfbytes($bytes, $charset);
90 my %opposite_of = ( LEFT => 'RIGHT', RIGHT =>'LEFT' );
92 my $directional_re = qr/\b(LEFT|RIGHT)\b/; # Make sure to capture $1
94 sub format_pairs_line($;$) {
97 # Format a line containing a character singleton or pair in preparation
98 # for output, suitable for pod.
100 my $lhs_name = charnames::viacode($from);
101 my $lhs_hex = sprintf "%04X", $from;
104 my $name = $lhs_name;
106 my $hanging_indent = 26;
108 # Treat a trivial pair as a singleton
109 undef $to if defined $to && $to == $from;
112 my $rhs_name = charnames::viacode($to);
113 $rhs_hex = sprintf "%04X", $to;
115 # Most of the names differ only in LEFT vs RIGHT; some in
116 # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to
117 # understand if they are displayed combined.
118 if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) {
119 $name =~ s,$directional_re,$1/$opposite_of{$1},g;
121 else { # Otherwise, display them sequentially
122 $name .= ", " . $rhs_name;
126 # Handle double-width characters, based on the East Asian Width property.
127 # Add an extra space to non-wide ones so things stay vertically aligned.
129 my $output_line = " " # Indent in case output being used for verbatim
132 if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
133 $extra++; # The length() will be shorter than the displayed
140 $output_line .= " " . chr $to;
141 if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
152 $output_line .= " U+$lhs_hex";
153 $output_line .= ", U+$rhs_hex" if defined $to;;
154 my $cur_len = $extra + length $output_line;
155 $output_line .= " " x ($hanging_indent - $cur_len);
157 my $max_len = 74; # Pod formatter will indent 4 spaces
158 $cur_len = length $output_line;
160 if ($cur_len + length $name <= $max_len) {
161 $output_line .= $name; # It will fit
163 else { # It won't fit. Append a segment that is unbreakable until would
164 # exceed the available width; then start on a new line
165 # Doesn't handle the case where the whole segment doesn't fit;
166 # this just doesn't come up with the input data.
167 while ($name =~ / ( .+? ) \b{lb} /xg) {
169 my $added_length = length $segment;
170 if ($cur_len + $added_length > $max_len) {
171 $output_line =~ s/ +$//;
172 $output_line .= "\n" . " " x $hanging_indent;
173 $cur_len = $hanging_indent;
176 $output_line .= $segment;
177 $cur_len += $added_length;
181 return $output_line . "\n";
184 my $version = Unicode::UCD::UnicodeVersion();
185 my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
186 $dotdot = 0 unless defined $dotdot;
189 #define UNICODE_MAJOR_VERSION $major
190 #define UNICODE_DOT_VERSION $dot
191 #define UNICODE_DOT_DOT_VERSION $dotdot
195 # Gather the characters in Unicode that have left/right symmetry suitable for
196 # paired string delimiters
197 my %paireds = ( ord '<' => ord '>' ); # We don't normally use math ones, but
198 # this is traditionally included
200 # So don't have to grep an array to determine if have already dealt with the
201 # characters that are the keys
202 my %inverted_paireds;
204 # This property is the universe of all characters in Unicode which
205 # are of some import to the Bidirectional Algorithm, and for which there is
206 # another Unicode character that is a mirror of it.
207 my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
208 prop_invmap("Bidi_Mirroring_Glyph");
210 # Keep track of the characters we don't use, and why not.
212 my $non_directional = 'No perceived horizontal direction';
213 my $not_considered_directional_because = "Not considered directional because";
214 my $unpaired = "Didn't find a mirror";
215 my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror";
217 # The current list of characters that Perl considers to be paired
218 # opening/closing delimiters is quite conservative, consisting of those
219 # from the above property that other Unicode properties classify as
221 foreach my $list (qw(PI PF PS PE)) {
222 my @invlist = prop_invlist($list);
223 die "Empty list $list" unless @invlist;
225 # Convert from an inversion list to an array containing everything that
226 # matches. (This uses the recipe given in Unicode::UCD.)
228 for (my $i = 0; $i < @invlist; $i += 2) {
229 my $upper = ($i + 1) < @invlist
230 ? $invlist[$i+1] - 1 # In range
231 : $Unicode::UCD::MAX_CP; # To infinity.
232 for my $j ($invlist[$i] .. $upper) {
238 foreach my $code_point (@full_list) {
239 #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n";
240 my $chr = chr $code_point;
242 # Don't reexamine something we've already determined. This happens
243 # when its mate was earlier processed and found this one.
244 foreach my $hash_ref (\%paireds, \%inverted_paireds) {
245 next CODE_POINT if exists $hash_ref->{$code_point}
248 my $name = charnames::viacode($code_point);
250 my $mirror_code_point;
252 # If Unicode considers this to have a mirror, we don't have to go
254 if ($chr =~ /\p{Bidi_Mirrored}/) {
255 my $i = search_invlist($bmg_invlist, $code_point);
256 $mirror_code_point = $bmg_invmap->[$i];
257 if ( $mirror_code_point eq $bmg_default) {
258 $discards{$code_point} = { reason => $no_encoded_mate,
264 # Certain Unicode properties classify some mirrored characters as
265 # opening (left) vs closing (right). Skip the closing ones this
266 # iteration; they will be handled later when the opening mate
268 if ($chr =~ /(?[ \p{BPT=Close}
269 | \p{Gc=Close_Punctuation}
272 next; # Get this when its opening mirror comes up.
274 elsif ($chr =~ /(?[ \p{BPT=Open}
275 | \p{Gc=Open_Punctuation}
276 | \p{Gc=Initial_Punctuation}
277 | \p{Gc=Final_Punctuation}
280 # Here, it's a left delimiter. (The ones in Final Punctuation
281 # can be opening ones in some languages.)
282 $paireds{$code_point} = $mirror_code_point;
283 $inverted_paireds{$mirror_code_point} = $code_point;
285 # If the delimiter can be used on either side, add its
287 if ($chr =~ /(?[ \p{Gc=Initial_Punctuation}
288 | \p{Gc=Final_Punctuation}
291 $paireds{$mirror_code_point} = $code_point;
292 $inverted_paireds{$code_point} = $mirror_code_point;
300 else { # Here is not involved with the bidirectional algorithm.
302 # Get the mirror (if any) from reversing the directions in the
303 # name, and looking that up
305 $mirror =~ s/$directional_re/$opposite_of{$1}/g;
306 $mirror_code_point = charnames::vianame($mirror);
309 # There are several hundred characters other characters that clearly
310 # should be mirrors of each other, like LEFTWARDS ARROW and RIGHTWARDS
311 # ARROW. Unicode did not bother to classify them as mirrors mostly
312 # because they aren't of import in the Bidirectional Algorithm. Most
313 # of them are symbols. These are not considered opening/closing by
316 # Certain names are always treated as non directional.
318 # The VERTICAL marks these as not actually
320 PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL
323 $discards{$code_point}
324 = { reason => "$not_considered_directional_because name"
326 mirror => $mirror_code_point
331 # If these are equal, it means the original had no horizontal
333 if ($name eq $mirror) {
334 $discards{$code_point} = { reason => $non_directional,
340 if (! defined $mirror_code_point) {
341 $discards{$code_point} = { reason => $unpaired,
347 if ($code_point == $mirror_code_point) {
348 $discards{$code_point} =
349 { reason => "$unpaired - Single character, multiple"
350 . " names; Unicode name correction",
351 mirror => $mirror_code_point
356 $paireds{$code_point} = $mirror_code_point;
357 $inverted_paireds{$mirror_code_point} = $code_point;
359 # Again, accept either one at either end for these ambiguous
360 # punctuation delimiters
361 if ($chr =~ /[\p{PI}\p{PF}]/x) {
362 $paireds{$mirror_code_point} = $code_point;
363 $inverted_paireds{$code_point} = $mirror_code_point;
365 } # End of loop through code points
366 } # End of loop through properties
368 # The rest of the data are at __DATA__ in this file.
372 foreach my $charset (get_supported_code_pages()) {
373 print $out_fh "\n" . get_conditional_compile_line_start($charset);
375 my @a2n = @{get_a2n($charset)};
380 # Convert any '#' comments to /* ... */; empty lines and comments are
381 # output as blank lines
382 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
383 my $comment_body = $1 // "";
384 if ($comment_body ne "") {
385 print $out_fh "/* $comment_body */\n";
393 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
394 (?: [\ ]+ ( [^ ]* ) )? # optional flag
395 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
398 die "Unexpected syntax at line $.: $_\n";
403 my $desired_name = $3;
407 my $U_cp; # code point in Unicode (not-native) terms
409 if ($name_or_cp =~ /^U\+(.*)/) {
411 $name = charnames::viacode($name_or_cp);
412 if (! defined $name) {
413 next if $flag =~ /skip_if_undef/;
414 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
420 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
421 $U_cp = charnames::vianame($name =~ s/_/ /gr);
428 $name = $desired_name if $name eq "" && $desired_name;
429 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
433 if (defined $flag && $flag eq 'native') {
434 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
436 $str = sprintf "0x%02X", $cp; # Is a numeric constant
439 $str = backslash_x_form($U_cp, $charset);
442 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
443 $str = "\"$str\""; # Will be a string constant
444 } elsif ($flag eq 'tail') {
445 $str =~ s/\\x..//; # Remove the first byte
447 $str = "\"$str\""; # Will be a string constant
449 elsif ($flag eq 'first') {
450 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
451 $suffix .= '_FIRST_BYTE';
452 $str = "0x$str"; # Is a numeric constant
455 die "Unknown flag at line $.: $_\n";
458 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
461 # Now output the strings of opening/closing delimiters. The Unicode
462 # values were earlier entered into %paireds
463 my $utf8_opening = "";
464 my $utf8_closing = "";
465 my $non_utf8_opening = "";
466 my $non_utf8_closing = "";
467 my $deprecated_if_not_mirrored = "";
468 my $non_utf8_deprecated_if_not_mirrored = "";
470 for my $from (sort { $a <=> $b } keys %paireds) {
471 my $to = $paireds{$from};
472 my $utf8_from_backslashed = backslash_x_form($from, $charset);
473 my $utf8_to_backslashed = backslash_x_form($to, $charset);
474 my $non_utf8_from_backslashed;
475 my $non_utf8_to_backslashed;
477 $utf8_opening .= $utf8_from_backslashed;
478 $utf8_closing .= $utf8_to_backslashed;
481 $non_utf8_from_backslashed =
482 backslash_x_form($from, $charset, 'not_utf8');
483 $non_utf8_to_backslashed =
484 backslash_x_form($to, $charset, 'not_utf8');
486 $non_utf8_opening .= $non_utf8_from_backslashed;
487 $non_utf8_closing .= $non_utf8_to_backslashed;
490 # Only the ASCII range paired delimiters have traditionally been
491 # accepted. Until the feature is considered standard, the non-ASCII
492 # opening ones must be deprecated when the feature isn't in effect, so
493 # as to warn about behavior that is planned to change.
495 $deprecated_if_not_mirrored .= $utf8_from_backslashed;
496 $non_utf8_deprecated_if_not_mirrored .=
497 $non_utf8_from_backslashed if $from < 256;
499 # We deprecate using any of these strongly directional characters
500 # at either end of the string, in part so we could allow them to
502 $deprecated_if_not_mirrored .= $utf8_to_backslashed
503 if index ($deprecated_if_not_mirrored,
504 $utf8_to_backslashed) < 0;
507 # The implementing code in toke.c assumes that the byte length of each
508 # opening delimiter is the same as its mirrored closing one. This
509 # makes sure of that by checking upon each iteration of the loop.
510 if (length $utf8_opening != length $utf8_closing) {
511 die "Byte length of representation of '"
512 . charnames::viacode($from)
513 . " differs from its mapping '"
514 . charnames::viacode($to)
518 print STDERR format_pairs_line($from, $to) if $output_lists;
520 $output_lists = 0; # Only output in first iteration
522 print $out_fh <<~"EOT";
524 # ifdef PERL_IN_TOKE_C
525 /* Paired characters for quote-like operators, in UTF-8 */
526 # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
527 # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"
529 /* And not in UTF-8 */
530 # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
531 # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"
533 /* And what's deprecated */
534 # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
535 # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
540 for my $i (0x20 .. 0x7E) {
541 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
543 $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
544 print $out_fh <<"EOT";
546 # ifdef PERL_IN_REGCOMP_C
547 # define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */
551 print $out_fh get_conditional_compile_line_end();
555 if ($output_omitteds) {
556 # We haven't bothered to delete things that later became used.
557 foreach my $which (\%paireds) {
558 foreach my $lhs (keys $which->%*) {
559 delete $discards{$lhs};
560 delete $discards{$which->{$lhs}};
564 # Invert %discards so that all the code points for a given reason are
565 # keyed by that reason.
566 my %inverted_discards;
567 foreach my $code_point (sort { $a <=> $b } keys %discards) {
568 my $type = $discards{$code_point}{reason};
569 push $inverted_discards{$type}->@*, [ $code_point,
570 $discards{$code_point}{mirror}
574 # Then output each list
575 foreach my $type (sort keys %inverted_discards) {
576 print STDERR "\n$type\n" if $type ne "";
577 foreach my $ref ($inverted_discards{$type}->@*) {
578 print STDERR format_pairs_line($ref->[0], $ref->[1]);
584 my @other_invlist = prop_invlist("Other");
585 for (my $i = 0; $i < @other_invlist; $i += 2) {
586 $count += ((defined $other_invlist[$i+1])
587 ? $other_invlist[$i+1]
589 - $other_invlist[$i];
591 $count = 0x110000 - $count;
592 print $out_fh <<~"EOT";
594 /* The number of code points not matching \\pC */
595 #ifdef PERL_IN_REGCOMP_C
596 # define NON_OTHER_COUNT $count
600 # If this release has both the CWCM and CWCF properties, find the highest code
601 # point which changes under any case change. We can use this to short-circuit
603 my @cwcm = prop_invlist('CWCM');
605 my @cwcf = prop_invlist('CWCF');
607 my $max = ($cwcm[-1] < $cwcf[-1])
610 $max = sprintf "0x%X", $max - 1;
611 print $out_fh <<~"EOS";
613 /* The highest code point that has any type of case change */
614 #ifdef PERL_IN_UTF8_C
615 # define HIGHEST_CASE_CHANGING_CP $max
621 print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";
623 read_only_bottom_close_and_rename($out_fh);
627 # Note that any apidoc comments you want in the file need to be added to one
628 # of the prints above
630 # A blank line is output as-is.
631 # Comments (lines whose first non-blank is a '#') are converted to C-style,
632 # though empty comments are converted to blank lines. Otherwise, each line
633 # represents one #define, and begins with either a Unicode character name with
634 # the blanks and dashes in it squeezed out or replaced by underscores; or it
635 # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
636 # case, the name will be looked-up to use as the name of the macro. In either
637 # case, the macro name will have suffixes as listed above, and all blanks and
638 # dashes will be replaced by underscores.
640 # Each line may optionally have one of the following flags on it, separated by
641 # white space from the initial token.
642 # string indicates that the output is to be of the string form
643 # described in the comments above that are placed in the file.
644 # string_skip_ifundef is the same as 'string', but instead of dying if the
645 # code point doesn't exist, the line is just skipped: no output is
647 # first indicates that the output is to be of the FIRST_BYTE form.
648 # tail indicates that the output is of the _TAIL form.
649 # native indicates that the output is the code point, converted to the
650 # platform's native character set if applicable
652 # If the code point has no official name, the desired name may be appended
653 # after the flag, which will be ignored if there is an official name.
655 # This program is used to make it convenient to create compile time constants
656 # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
657 # having to figure things out.
665 U+1E9E string_skip_if_undef
680 U+10FFFF string MAX_UNICODE