This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/mph.pl - Clean up diagnostics logic, allow DEBUG from env.
[perl5.git] / regen / unicode_constants.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
c7b32e72 4no warnings 'experimental::regex_sets';
3d7c117d
MB
5require './regen/regen_lib.pl';
6require './regen/charset_translations.pl';
1df06fae 7use Unicode::UCD qw(prop_invlist prop_invmap search_invlist);
61dad979 8use charnames qw(:loose);
dce1e563
KW
9binmode(STDERR, ":utf8");
10
11# Set this to 1 temporarily to get on stderr the complete list of paired
12# string delimiters this generates. This list is suitable for plugging into a
13# pod.
14my $output_lists = 0;
61dad979 15
9dfe4b75
KW
16# Set this to 1 temporarily to get on stderr the complete list of punctuation
17# marks and symbols that look to be directional but we didn't include for some
18# reason.
19my $output_omitteds = 0;
20
1b0f46bf 21my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb 22 {style => '*', by => $0,
61dad979
KW
23 from => "Unicode data"});
24
25print $out_fh <<END;
d10c72f2 26
6a5bc5ac
KW
27#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
28#define PERL_UNICODE_CONSTANTS_H_ 1
d10c72f2 29
4b4853d1
KW
30/* This file contains #defines for the version of Unicode being used and
31 * various Unicode code points. The values the code point macros expand to
32 * are the native Unicode code point, or all or portions of the UTF-8 encoding
33 * for the code point. In the former case, the macro name has the suffix
34 * "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 35 *
525b6419
KW
36 * The macros that have the suffix "_UTF8" may have further suffixes, as
37 * follows:
38 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
39 * representation; the value will be a numeric constant.
40 * "_TAIL" if instead it represents all but the first byte. This, and
41 * with no additional suffix are both string constants */
61dad979 42
69bc4c1f 43/*
3f620621 44=for apidoc_section \$unicode
69bc4c1f 45
78342678 46=for apidoc AmnU|const char *|BOM_UTF8
69bc4c1f
KW
47
48This is a macro that evaluates to a string constant of the UTF-8 bytes that
49define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
50is compiled on. This allows code to use a mnemonic for this character that
51works on both ASCII and EBCDIC platforms.
52S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
53bytes.
54
78342678 55=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8
69bc4c1f
KW
56
57This is a macro that evaluates to a string constant of the UTF-8 bytes that
58define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
59is compiled on. This allows code to use a mnemonic for this character that
60works on both ASCII and EBCDIC platforms.
61S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
62bytes.
63
64=cut
65*/
66
61dad979
KW
67END
68
63cd44e4
KW
69sub backslash_x_form($$;$) {
70 # Output the code point represented by the byte string $bytes as a
71 # sequence of \x{} constants. $bytes should be the UTF-8 for the code
72 # point if the final parameter is absent or empty. Otherwise it should be
73 # the Latin1 code point itself.
74 #
75 # The output is translated into the character set '$charset'.
76
77 my ($bytes, $charset, $non_utf8) = @_;
78 if ($non_utf8) {
79 die "Must be utf8 if above 255" if $bytes > 255;
80 my $a2n = get_a2n($charset);
81 return sprintf "\\x%02X", $a2n->[$bytes];
82 }
83 else {
84 return join "", map { sprintf "\\x%02X", ord $_ }
85 split //, cp_2_utfbytes($bytes, $charset);
86 }
87}
88
36327261
KW
89# The most complicated thing this program does is generate paired string
90# delimiters from the Unicode database. Some of these come from the
91# Unicode Bidirectional (bidi) algorithm.
92
93# These all visually look like left and right delimiters
7e4a71c6 94my @bidi_strong_lefts = ( 'LESS-THAN',
c1b67e77 95 'ELEMENT OF',
6763e244
KW
96 'PRECEDE',
97 'PRECEDES',
36327261 98 'SMALLER THAN',
0a4ec1c0 99 'SUBSET',
7e4a71c6
KW
100 );
101my @bidi_strong_rights = ( 'GREATER-THAN',
c1b67e77 102 'CONTAINS',
6763e244
KW
103 'SUCCEED',
104 'SUCCEEDS',
36327261 105 'LARGER THAN',
0a4ec1c0 106 'SUPERSET',
7e4a71c6
KW
107 );
108
109# Create an array of hashes for these, so as to translate between them, and
110# avoid recompiling patterns in the loop.
111my @bidi_strong_directionals;
112for (my $i = 0; $i < @bidi_strong_lefts; $i++) {
113 push @bidi_strong_directionals,
114 {
115 LHS => $bidi_strong_lefts[$i],
116 RHS => $bidi_strong_rights[$i],
117 L_pattern => qr/\b$bidi_strong_lefts[$i]\b/,
118 R_pattern => qr/\b$bidi_strong_rights[$i]\b/,
119 };
120}
121
50b17915 122my @ok_bidi_symbols = (
c7c4369b 123 'TACK',
84c1ecba 124 'TURNSTILE',
50b17915
KW
125 );
126my $ok_bidi_symbols_re = join '|', @ok_bidi_symbols;
127$ok_bidi_symbols_re = qr/\b($ok_bidi_symbols_re)\b/n;
128
129
130# Many characters have mirrors that Unicode hasn't included in their Bidi
131# algorithm. This program uses their names to find them. The next few
132# definitions are towards that end.
133
134# Most horizontal directionality is based on LEFT vs RIGHT. But it's
135# complicated:
136# 1) a barb on one or the other side of a harpoon doesn't indicate
137# directionality of the character. (A HARPOON is the word Unicode uses
138# to indicate an arrow with a one-sided tip.)
139my $no_barb_re = qr/(*nlb:BARB )/;
140
141# 2) RIGHT-SHADED doesn't signify anything about direction of the character
142# itself. These are the suffixes Unicode uses to indicate this. /aa is
143# needed because the wildcard names feature currently requires it for names.
144my $shaded_re = qr/ [- ] (SHADED | SHADOWED) /naax;
145
146# 3a) there are a few anomalies caught here. 'LEFT LUGGAGE' would have been
147# better named UNCLAIMED, and doesn't indicate directionality.
148my $real_LEFT_re = qr/ \b $no_barb_re LEFT (*nla: $shaded_re)
149 (*nla: [ ] LUGGAGE \b)
150 /nx;
151# 3b) And in most cases,a RIGHT TRIANGLE also doesn't refer to
152# directionality, but indicates it contains a 90 degree angle.
153my $real_RIGHT_re = qr/ \b $no_barb_re RIGHT (*nla: $shaded_re)
154 (*nla: [ ] (TRI)? ANGLE \b)
155 /nx;
156# More items could be added to these as needed
157
158# 4) something that is pointing R goes on the left, so is different than
159# the character on the R. For example, a RIGHT BRACKET would be
160# different from a RIGHT-FACING bracket. These patterns capture the
161# typical ways that Unicode character names indicate the latter meaning
162# as a suffix to RIGHT or LEFT
163my $pointing_suffix_re = qr/ ( WARDS # e.g., RIGHTWARDS
164 | [ ] ARROW # A R arrow points to the R
165 | [ -] FACING
166 | [ -] POINTING
167 | [ ] PENCIL # Implies a direction of its
168 # point
169 ) \b /nx;
170# And correspondingly for a prefix for LEFT RIGHT
171my $pointing_prefix_re = qr/ \b ( # e.g. UP RIGHT implies a direction
172 UP ( [ ] AND)?
173 | DOWN ( [ ] AND)?
174 | CONVERGING
175 | POINTING [ ] (DIRECTLY)?
176 | TO [ ] THE
177 )
178 [ ]
179 /nx;
180
7e4a71c6
KW
181my @other_directionals =
182 {
183 LHS => 'LEFT',
184 RHS => 'RIGHT',
185 L_pattern =>
50b17915
KW
186 # Something goes on the left if it contains LEFT and doesn't
187 # point left, or it contains RIGHT and does point right.
188 qr/ \b (*nlb: $pointing_prefix_re) $real_LEFT_re
189 (*nla: $pointing_suffix_re)
190 | \b (*plb: $pointing_prefix_re) $real_RIGHT_re \b
191 | \b $real_RIGHT_re (*pla: $pointing_suffix_re)
192 /nx,
7e4a71c6 193 R_pattern =>
50b17915
KW
194 qr/ \b (*nlb: $pointing_prefix_re) $real_RIGHT_re
195 (*nla: $pointing_suffix_re)
196 | \b (*plb: $pointing_prefix_re) $real_LEFT_re \b
197 | \b $real_LEFT_re (*pla: $pointing_suffix_re)
198 /nx,
7e4a71c6
KW
199 };
200
50b17915
KW
201# Some horizontal directionality is based on EAST vs WEST. These words are
202# almost always used by Unicode to indicate the direction pointing to, without
203# the general consistency in phrasing in L/R above. There are a handful of
204# possible exceptions, with only WEST WIND ever at all possibly an issue
205push @other_directionals,
206 {
207 LHS => 'EAST',
208 RHS => 'WEST',
209 L_pattern => qr/ \b ( EAST (*nla: [ ] WIND)
210 | WEST (*pla: [ ] WIND)) \b /x,
211 R_pattern => qr/ \b ( WEST (*nla: [ ] WIND)
212 | EAST (*pla: [ ] WIND)) \b /x,
213 };
214
215# The final way the Unicode signals mirroring is by using the words REVERSE or
216# REVERSED;
565fbe1b 217my $reverse_re = qr/ \b REVERSE D? [- ] /x;
dce1e563 218
7e4a71c6
KW
219# Create a mapping from each direction to its opposite one
220my %opposite_of;
221foreach my $directional (@bidi_strong_directionals, @other_directionals) {
222 $opposite_of{$directional->{LHS}} = $directional->{RHS};
223 $opposite_of{$directional->{RHS}} = $directional->{LHS};
224}
225
226# Join the two types of each direction as alternatives
227my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals,
228 @other_directionals;
229my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals,
230 @other_directionals;
231# And anything containing directionality will be either one of these two
232my $directional_re = join "|", $L_re, $R_re;
dce1e563 233
7e4a71c6
KW
234# Now compile the strings that result from above
235$L_re = qr/$L_re/;
236$R_re = qr/$R_re/;
237$directional_re = qr/($directional_re)/; # Make sure to capture $1
dce1e563 238
3e9e4fd8 239my @included_symbols = (
806eaaf9 240 0x2326, 0x232B, # ERASE
a29825fb
KW
241 0x23E9 .. 0x23EA, # DOUBLE TRIANGLE
242 0x23ED .. 0x23EE, # DOUBLE TRIANGLE with BAR
3e9e4fd8
KW
243 0x269E .. 0x269F, # THREE LINES CONVERGING
244 0x1D102 .. 0x1D103, # MUSIC STAVES
245 0x1D106 .. 0x1D107, # MUSIC STAVES
ad558b4e
KW
246 0x1F57B, # TELEPHONE RECEIVER
247 0x1F57D, # TELEPHONE RECEIVER
a863e562
KW
248 0x1F508 .. 0x1F50A, # LOUD SPEAKER
249 0x1F568 .. 0x1F56A, # LOUD SPEAKER
e73a9e14 250 0x1F5E6 .. 0x1F5E7, # THREE RAYS
3e9e4fd8
KW
251 );
252my %included_symbols;
253$included_symbols{$_} = 1 for @included_symbols;
254
9dfe4b75 255sub format_pairs_line($;$) {
dce1e563
KW
256 my ($from, $to) = @_;
257
9dfe4b75 258 # Format a line containing a character singleton or pair in preparation
dce1e563
KW
259 # for output, suitable for pod.
260
261 my $lhs_name = charnames::viacode($from);
262 my $lhs_hex = sprintf "%04X", $from;
263 my $rhs_name;
264 my $rhs_hex;
265 my $name = $lhs_name;
266
267 my $hanging_indent = 26;
268
9dfe4b75
KW
269 # Treat a trivial pair as a singleton
270 undef $to if defined $to && $to == $from;
271
dce1e563
KW
272 if (defined $to) {
273 my $rhs_name = charnames::viacode($to);
274 $rhs_hex = sprintf "%04X", $to;
275
276 # Most of the names differ only in LEFT vs RIGHT; some in
277 # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to
278 # understand if they are displayed combined.
279 if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) {
280 $name =~ s,$directional_re,$1/$opposite_of{$1},g;
281 }
282 else { # Otherwise, display them sequentially
283 $name .= ", " . $rhs_name;
284 }
285 }
286
287 # Handle double-width characters, based on the East Asian Width property.
288 # Add an extra space to non-wide ones so things stay vertically aligned.
289 my $extra = 0;
290 my $output_line = " " # Indent in case output being used for verbatim
291 # pod
292 . chr $from;
293 if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
294 $extra++; # The length() will be shorter than the displayed
295 # width
296 }
297 else {
298 $output_line .= " ";
299 }
300 if (defined $to) {
301 $output_line .= " " . chr $to;
302 if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
303 $extra++;
304 }
305 else {
306 $output_line .= " ";
307 }
308 }
309 else {
310 $output_line .= " ";
311 }
312
313 $output_line .= " U+$lhs_hex";
314 $output_line .= ", U+$rhs_hex" if defined $to;;
315 my $cur_len = $extra + length $output_line;
316 $output_line .= " " x ($hanging_indent - $cur_len);
317
318 my $max_len = 74; # Pod formatter will indent 4 spaces
319 $cur_len = length $output_line;
320
321 if ($cur_len + length $name <= $max_len) {
322 $output_line .= $name; # It will fit
323 }
324 else { # It won't fit. Append a segment that is unbreakable until would
325 # exceed the available width; then start on a new line
326 # Doesn't handle the case where the whole segment doesn't fit;
327 # this just doesn't come up with the input data.
328 while ($name =~ / ( .+? ) \b{lb} /xg) {
329 my $segment = $1;
330 my $added_length = length $segment;
331 if ($cur_len + $added_length > $max_len) {
332 $output_line =~ s/ +$//;
333 $output_line .= "\n" . " " x $hanging_indent;
334 $cur_len = $hanging_indent;
335 }
336
337 $output_line .= $segment;
338 $cur_len += $added_length;
339 }
340 }
341
342 return $output_line . "\n";
343}
344
4b4853d1
KW
345my $version = Unicode::UCD::UnicodeVersion();
346my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
347$dotdot = 0 unless defined $dotdot;
348
349print $out_fh <<END;
350#define UNICODE_MAJOR_VERSION $major
351#define UNICODE_DOT_VERSION $dot
352#define UNICODE_DOT_DOT_VERSION $dotdot
353
354END
355
c7b32e72
KW
356# Gather the characters in Unicode that have left/right symmetry suitable for
357# paired string delimiters
7e4a71c6 358my %paireds;
c7b32e72 359
1df06fae
KW
360# So don't have to grep an array to determine if have already dealt with the
361# characters that are the keys
362my %inverted_paireds;
363
c7b32e72
KW
364# This property is the universe of all characters in Unicode which
365# are of some import to the Bidirectional Algorithm, and for which there is
366# another Unicode character that is a mirror of it.
367my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
368 prop_invmap("Bidi_Mirroring_Glyph");
369
9dfe4b75
KW
370# Keep track of the characters we don't use, and why not.
371my %discards;
372my $non_directional = 'No perceived horizontal direction';
373my $not_considered_directional_because = "Not considered directional because";
50b17915 374my $trailing_up_down = 'Vertical direction after all L/R direction';
9dfe4b75 375my $unpaired = "Didn't find a mirror";
210ad843 376my $illegal = "Mirror illegal";
9dfe4b75 377my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror";
7e4a71c6 378my $bidirectional = "Bidirectional";
9dfe4b75 379
50b17915
KW
380my %unused_bidi_pairs;
381my %inverted_unused_bidi_pairs;
382my %unused_pairs; #
383my %inverted_unused_pairs;
384
385# Could be more explicit about allowing, e.g. ARROWS, ARROWHEAD, but this
386# suffices
387my $arrow_like_re = qr/\b(ARROW|HARPOON)/;
388
389# Go through the Unicode Punctuation and Symbol characters looking for ones
390# that have mirrors, suitable for being string delimiters. Some of these are
391# easily derivable from Unicode properties dealing with the bidirectional
392# algorithm. But the purpose of that algorithm isn't the same as ours, and
393# excludes many suitable ones. In particular, no arrows are included in it.
394# To find suitable ones, we also look at character names to see if there is a
395# character with that name, but the horizontal direction reversed. That will
396# almost certainly be a mirror.
210ad843 397foreach my $list (qw(Punctuation Symbol)) {
1df06fae
KW
398 my @invlist = prop_invlist($list);
399 die "Empty list $list" unless @invlist;
400
7e4a71c6
KW
401 my $is_Symbol = $list eq 'Symbol';
402
1df06fae
KW
403 # Convert from an inversion list to an array containing everything that
404 # matches. (This uses the recipe given in Unicode::UCD.)
405 my @full_list;
406 for (my $i = 0; $i < @invlist; $i += 2) {
407 my $upper = ($i + 1) < @invlist
408 ? $invlist[$i+1] - 1 # In range
409 : $Unicode::UCD::MAX_CP; # To infinity.
410 for my $j ($invlist[$i] .. $upper) {
411 push @full_list, $j;
412 }
413 }
414
415 CODE_POINT:
416 foreach my $code_point (@full_list) {
417 #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n";
418 my $chr = chr $code_point;
c7b32e72 419
1df06fae
KW
420 # Don't reexamine something we've already determined. This happens
421 # when its mate was earlier processed and found this one.
50b17915
KW
422 foreach my $hash_ref (\%paireds, \%inverted_paireds,
423 \%unused_bidi_pairs, \%inverted_unused_bidi_pairs,
424 \%unused_pairs, \%inverted_unused_pairs)
425 {
1df06fae
KW
426 next CODE_POINT if exists $hash_ref->{$code_point}
427 }
c7b32e72 428
1df06fae 429 my $name = charnames::viacode($code_point);
50b17915 430 my $original_had_REVERSE;
1df06fae
KW
431 my $mirror;
432 my $mirror_code_point;
433
434 # If Unicode considers this to have a mirror, we don't have to go
435 # looking
436 if ($chr =~ /\p{Bidi_Mirrored}/) {
437 my $i = search_invlist($bmg_invlist, $code_point);
438 $mirror_code_point = $bmg_invmap->[$i];
439 if ( $mirror_code_point eq $bmg_default) {
9dfe4b75
KW
440 $discards{$code_point} = { reason => $no_encoded_mate,
441 mirror => undef
442 };
1df06fae
KW
443 next;
444 }
c7b32e72 445
1df06fae
KW
446 # Certain Unicode properties classify some mirrored characters as
447 # opening (left) vs closing (right). Skip the closing ones this
448 # iteration; they will be handled later when the opening mate
449 # comes along.
450 if ($chr =~ /(?[ \p{BPT=Close}
451 | \p{Gc=Close_Punctuation}
452 ])/)
453 {
454 next; # Get this when its opening mirror comes up.
455 }
456 elsif ($chr =~ /(?[ \p{BPT=Open}
c7b32e72
KW
457 | \p{Gc=Open_Punctuation}
458 | \p{Gc=Initial_Punctuation}
1df06fae 459 | \p{Gc=Final_Punctuation}
c7b32e72 460 ])/)
1df06fae
KW
461 {
462 # Here, it's a left delimiter. (The ones in Final Punctuation
463 # can be opening ones in some languages.)
464 $paireds{$code_point} = $mirror_code_point;
465 $inverted_paireds{$mirror_code_point} = $code_point;
466
467 # If the delimiter can be used on either side, add its
468 # complement
469 if ($chr =~ /(?[ \p{Gc=Initial_Punctuation}
470 | \p{Gc=Final_Punctuation}
471 ])/)
472 {
473 $paireds{$mirror_code_point} = $code_point;
474 $inverted_paireds{$code_point} = $mirror_code_point;
475 }
476
477 next;
478 }
835f2666 479
7e4a71c6
KW
480 # Unicode doesn't consider '< >' to be brackets, but Perl does. There are
481 # lots of variants of these in Unicode; easiest to accept all of
482 # them that aren't bidirectional (which would be visually
483 # confusing).
484 for (my $i = 0; $i < @bidi_strong_directionals; $i++) {
485 my $hash_ref = $bidi_strong_directionals[$i];
486
487 next if $name !~ $hash_ref->{L_pattern};
488
489 if ($name =~ $hash_ref->{R_pattern}) {
490 $discards{$code_point} = { reason => $bidirectional,
491 mirror => $mirror_code_point
492 };
493 next CODE_POINT;
494 }
495
496 $paireds{$code_point} = $mirror_code_point;
497 $inverted_paireds{$mirror_code_point} = $code_point;
50b17915 498 $original_had_REVERSE = $name =~ /$reverse_re/;
7e4a71c6
KW
499 next CODE_POINT;
500 }
501
50b17915
KW
502 # The other paired symbols are more iffy as being desirable paired
503 # delimiters; we let the code below decide what to do with them.
504 $mirror = charnames::viacode($mirror_code_point);
1df06fae
KW
505 }
506 else { # Here is not involved with the bidirectional algorithm.
c7b32e72 507
1df06fae
KW
508 # Get the mirror (if any) from reversing the directions in the
509 # name, and looking that up
510 $mirror = $name;
511 $mirror =~ s/$directional_re/$opposite_of{$1}/g;
50b17915 512 $original_had_REVERSE = $mirror =~ s/$reverse_re//g;
1df06fae
KW
513 $mirror_code_point = charnames::vianame($mirror);
514 }
515
50b17915
KW
516 # Letter-like symbols don't really stand on their own and don't look
517 # like traditional delimiters.
518 if ($chr =~ /\p{Sk}/) {
519 $discards{$code_point}
520 = { reason => "Letter-like symbols are not eligible",
521 mirror => $mirror_code_point
522 };
523 next CODE_POINT;
524 }
1df06fae
KW
525
526 # Certain names are always treated as non directional.
50b17915
KW
527 if ($name =~ m{ \b ( WITH [ ] (?:LEFT|RIGHT) [ ] HALF [ ] BLACK
528 | BLOCK
529 | BOX [ ] DRAWINGS
530 | CIRCLE [ ] WITH
531 | EXTENSION
532 | (?: UPPER | LOWER ) [ ] HOOK
533
1df06fae
KW
534 # The VERTICAL marks these as not actually
535 # L/R mirrored.
50b17915
KW
536 | PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL
537 | QUADRANT
538 | SHADE
539 | SQUARE [ ] WITH
1df06fae
KW
540 ) \b }x)
541 {
9dfe4b75
KW
542 $discards{$code_point}
543 = { reason => "$not_considered_directional_because name"
544 . " contains '$1'",
545 mirror => $mirror_code_point
546 };
1df06fae
KW
547 next CODE_POINT;
548 }
549
550 # If these are equal, it means the original had no horizontal
551 # directioning
552 if ($name eq $mirror) {
9dfe4b75
KW
553 $discards{$code_point} = { reason => $non_directional,
554 mirror => undef
555 };
1df06fae
KW
556 next CODE_POINT;
557 }
558
50b17915
KW
559 # If the name has both left and right directions, it is bidirectional,
560 # so not suited to be a paired delimiter.
561 if ($name =~ $L_re && $name =~ $R_re) {
562 $discards{$code_point} = { reason => $bidirectional,
563 mirror => $mirror_code_point
564 };
565 next CODE_POINT;
566 }
567
9153861e
KW
568 # If no mate was found, it could be that it's like the case of
569 # SPEAKER vs RIGHT SPEAKER (which probably means the mirror was added
570 # in a later version than the original. Check by removing all
571 # directionality and trying to see if there is a character with that
572 # name.
1df06fae 573 if (! defined $mirror_code_point) {
9153861e
KW
574 $mirror =~ s/$directional_re //;
575 $mirror_code_point = charnames::vianame($mirror);
576 if (! defined $mirror_code_point) {
577
578 # Still no mate.
9dfe4b75
KW
579 $discards{$code_point} = { reason => $unpaired,
580 mirror => undef
581 };
1df06fae 582 next;
9153861e 583 }
1df06fae
KW
584 }
585
586 if ($code_point == $mirror_code_point) {
9dfe4b75
KW
587 $discards{$code_point} =
588 { reason => "$unpaired - Single character, multiple"
589 . " names; Unicode name correction",
590 mirror => $mirror_code_point
591 };
1df06fae
KW
592 next;
593 }
594
50b17915
KW
595 if ($is_Symbol) {
596
597 # Skip if the the direction is followed by a vertical motion
598 # (which defeats the left-right directionality).
599 if ( $name =~ / ^ .* $no_barb_re
600 \b (UP|DOWN|NORTH|SOUTH) /gx
601 and not $name =~ /$directional_re/g)
602 {
603 $discards{$code_point} = { reason => $trailing_up_down,
604 mirror => $mirror_code_point
605 };
606 next;
607 }
608 }
609
210ad843
KW
610 # There are a few characters like REVERSED SEMICOLON that are mirrors,
611 # but have always commonly been used unmirrored. There is also the
612 # PILCROW SIGN and its mirror which might be considered to be
613 # legitimate mirrors, but maybe not. Additionally the current
614 # algorithm for finding the mirror depends on each member of a pair
615 # being respresented by the same number of bytes as its mate. By
616 # skipping these, we solve both problems
617 if ($code_point < 256 != $mirror_code_point < 256) {
618 $discards{$code_point} = { reason => $illegal,
619 mirror => $mirror_code_point
620 };
621 next;
622 }
623
624 # And '/' and '\' are mirrors that we don't accept
625 if ( $name =~ /SOLIDUS/
626 && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r
627 eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r)
628 {
629 $discards{$code_point} = { reason => $illegal,
630 mirror => $mirror_code_point
631 };
632 next;
633 }
634
50b17915
KW
635 # We enter the pair with the original code point on the left; if it
636 # should instead be on the R, swap. Most Symbols that contain the
637 # word REVERSE go on the rhs, except those whose names explicitly
638 # indicate lhs. FINAL in the name indicates stays on the rhs.
639 if ($name =~ $R_re || ( $original_had_REVERSE
640 && $is_Symbol
641 && $name !~ $L_re
642 && $name !~ /\bFINAL\b/
643 ))
644 {
645 my $temp = $code_point;
646 $code_point = $mirror_code_point;
647 $mirror_code_point = $temp;
648 }
649
4a4b7455
KW
650 # Only a few symbols are currently used, determined by inspection, but
651 # all the (few) remaining paired punctuations.
50b17915 652 if ( ! $is_Symbol
3e9e4fd8 653 || defined $included_symbols{$code_point}
50b17915
KW
654 || ( $chr =~ /\p{BidiMirrored}/
655 && ( $name =~ $ok_bidi_symbols_re
656 || $mirror =~ $ok_bidi_symbols_re))
4a4b7455 657 || $name =~ /\bINDEX\b/ # index FINGER pointing
0b6e3da1
KW
658
659 # Also accept most arrows that don't have N/S in their
660 # names. (Those are almost all currently pointing at an
661 # angle, like SW anyway.)
662 || ( $name !~ /\bNORTH|SOUTH\b/
663 && $name =~ $arrow_like_re
664
665 # Arguably bi-directional
666 && $name !~ /U-SHAPED/)
50b17915 667 ) {
1df06fae
KW
668 $paireds{$code_point} = $mirror_code_point;
669 $inverted_paireds{$mirror_code_point} = $code_point;
670
671 # Again, accept either one at either end for these ambiguous
672 # punctuation delimiters
673 if ($chr =~ /[\p{PI}\p{PF}]/x) {
674 $paireds{$mirror_code_point} = $code_point;
675 $inverted_paireds{$code_point} = $mirror_code_point;
676 }
50b17915
KW
677 }
678 elsif ( $chr =~ /\p{BidiMirrored}/
679 && ! exists $inverted_unused_bidi_pairs{$code_point}
680 && ! defined $inverted_unused_bidi_pairs{$code_point})
681 {
682 $unused_bidi_pairs{$code_point} = $mirror_code_point;
683 $inverted_unused_bidi_pairs{$mirror_code_point} = $code_point;
684 }
685 elsif ( ! exists $inverted_unused_pairs{$code_point}
686 && ! defined $inverted_unused_pairs{$code_point})
687 { # A pair that we don't currently accept
688 $unused_pairs{$code_point} = $mirror_code_point;
689 $inverted_unused_pairs{$mirror_code_point} = $code_point;
690 }
1df06fae
KW
691 } # End of loop through code points
692} # End of loop through properties
c7b32e72
KW
693
694# The rest of the data are at __DATA__ in this file.
61dad979 695
ad88cddb
KW
696my @data = <DATA>;
697
698foreach my $charset (get_supported_code_pages()) {
699 print $out_fh "\n" . get_conditional_compile_line_start($charset);
700
c30a0cf2 701 my @a2n = @{get_a2n($charset)};
ad88cddb 702
4a4b1311
KW
703 for ( @data ) {
704 chomp;
705
706 # Convert any '#' comments to /* ... */; empty lines and comments are
707 # output as blank lines
708 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
709 my $comment_body = $1 // "";
710 if ($comment_body ne "") {
711 print $out_fh "/* $comment_body */\n";
712 }
713 else {
714 print $out_fh "\n";
715 }
716 next;
5a731a17 717 }
76837d21 718
4a4b1311
KW
719 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
720 (?: [\ ]+ ( [^ ]* ) )? # optional flag
721 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
722 /x)
723 {
724 die "Unexpected syntax at line $.: $_\n";
725 }
61dad979 726
4a4b1311
KW
727 my $name_or_cp = $1;
728 my $flag = $2;
729 my $desired_name = $3;
730
731 my $name;
732 my $cp;
733 my $U_cp; # code point in Unicode (not-native) terms
4a4b1311
KW
734
735 if ($name_or_cp =~ /^U\+(.*)/) {
736 $U_cp = hex $1;
737 $name = charnames::viacode($name_or_cp);
738 if (! defined $name) {
280ac755
KW
739 next if $flag =~ /skip_if_undef/;
740 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
4a4b1311
KW
741 $name = "";
742 }
743 }
744 else {
745 $name = $name_or_cp;
746 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
747 $U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80 748 }
61dad979 749
4a4b1311
KW
750 $cp = ($U_cp < 256)
751 ? $a2n[$U_cp]
752 : $U_cp;
ad88cddb 753
4a4b1311
KW
754 $name = $desired_name if $name eq "" && $desired_name;
755 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979 756
4a4b1311
KW
757 my $str;
758 my $suffix;
759 if (defined $flag && $flag eq 'native') {
760 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
761 $suffix = '_NATIVE';
762 $str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f
KW
763 }
764 else {
63cd44e4 765 $str = backslash_x_form($U_cp, $charset);
4a4b1311
KW
766
767 $suffix = '_UTF8';
768 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
769 $str = "\"$str\""; # Will be a string constant
770 } elsif ($flag eq 'tail') {
771 $str =~ s/\\x..//; # Remove the first byte
772 $suffix .= '_TAIL';
773 $str = "\"$str\""; # Will be a string constant
774 }
775 elsif ($flag eq 'first') {
776 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
777 $suffix .= '_FIRST_BYTE';
778 $str = "0x$str"; # Is a numeric constant
779 }
780 else {
781 die "Unknown flag at line $.: $_\n";
782 }
81a2a11f 783 }
4a4b1311 784 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b 785 }
09cc440d 786
c7b32e72
KW
787 # Now output the strings of opening/closing delimiters. The Unicode
788 # values were earlier entered into %paireds
789 my $utf8_opening = "";
790 my $utf8_closing = "";
791 my $non_utf8_opening = "";
792 my $non_utf8_closing = "";
793 my $deprecated_if_not_mirrored = "";
794 my $non_utf8_deprecated_if_not_mirrored = "";
795
796 for my $from (sort { $a <=> $b } keys %paireds) {
797 my $to = $paireds{$from};
798 my $utf8_from_backslashed = backslash_x_form($from, $charset);
799 my $utf8_to_backslashed = backslash_x_form($to, $charset);
800 my $non_utf8_from_backslashed;
801 my $non_utf8_to_backslashed;
802
803 $utf8_opening .= $utf8_from_backslashed;
804 $utf8_closing .= $utf8_to_backslashed;
805
806 if ($from < 256) {
807 $non_utf8_from_backslashed =
808 backslash_x_form($from, $charset, 'not_utf8');
809 $non_utf8_to_backslashed =
810 backslash_x_form($to, $charset, 'not_utf8');
811
812 $non_utf8_opening .= $non_utf8_from_backslashed;
813 $non_utf8_closing .= $non_utf8_to_backslashed;
814 }
815
816 # Only the ASCII range paired delimiters have traditionally been
817 # accepted. Until the feature is considered standard, the non-ASCII
818 # opening ones must be deprecated when the feature isn't in effect, so
819 # as to warn about behavior that is planned to change.
820 if ($from > 127) {
821 $deprecated_if_not_mirrored .= $utf8_from_backslashed;
822 $non_utf8_deprecated_if_not_mirrored .=
823 $non_utf8_from_backslashed if $from < 256;
835f2666
KW
824
825 # We deprecate using any of these strongly directional characters
826 # at either end of the string, in part so we could allow them to
827 # be reversed.
828 $deprecated_if_not_mirrored .= $utf8_to_backslashed
829 if index ($deprecated_if_not_mirrored,
830 $utf8_to_backslashed) < 0;
c7b32e72
KW
831 }
832
833 # The implementing code in toke.c assumes that the byte length of each
834 # opening delimiter is the same as its mirrored closing one. This
835 # makes sure of that by checking upon each iteration of the loop.
836 if (length $utf8_opening != length $utf8_closing) {
837 die "Byte length of representation of '"
838 . charnames::viacode($from)
839 . " differs from its mapping '"
840 . charnames::viacode($to)
841 . "'";
842 }
dce1e563
KW
843
844 print STDERR format_pairs_line($from, $to) if $output_lists;
c7b32e72 845 }
dce1e563 846 $output_lists = 0; # Only output in first iteration
c7b32e72
KW
847
848 print $out_fh <<~"EOT";
849
850 # ifdef PERL_IN_TOKE_C
851 /* Paired characters for quote-like operators, in UTF-8 */
852 # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
853 # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"
854
855 /* And not in UTF-8 */
856 # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
857 # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"
858
859 /* And what's deprecated */
860 # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
861 # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
862 # endif
863 EOT
864
09cc440d
KW
865 my $max_PRINT_A = 0;
866 for my $i (0x20 .. 0x7E) {
867 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
868 }
c62fdeb7
KW
869 $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
870 print $out_fh <<"EOT";
09cc440d 871
e80ffeda
KW
872# ifdef PERL_IN_REGCOMP_C
873# define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */
874# endif
c62fdeb7
KW
875EOT
876
877 print $out_fh get_conditional_compile_line_end();
b35552de
KW
878
879}
880
9dfe4b75
KW
881if ($output_omitteds) {
882 # We haven't bothered to delete things that later became used.
50b17915
KW
883 foreach my $which (\%paireds,
884 \%unused_bidi_pairs,
885 \%unused_pairs)
886 {
9dfe4b75
KW
887 foreach my $lhs (keys $which->%*) {
888 delete $discards{$lhs};
889 delete $discards{$which->{$lhs}};
890 }
891 }
50b17915
KW
892
893 print STDERR "\nMirrored says Unicode, but not currently used as paired string delimiters\n";
894 foreach my $from (sort { $a <=> $b } keys %unused_bidi_pairs) {
895 print STDERR format_pairs_line($from, $unused_bidi_pairs{$from});
896 }
897
898 print STDERR "\nMirror found by name, but not currently used as paired string delimiters\n";
899 foreach my $from (sort { $a <=> $b } keys %unused_pairs) {
900 print STDERR format_pairs_line($from, $unused_pairs{$from});
901 }
9dfe4b75
KW
902
903 # Invert %discards so that all the code points for a given reason are
904 # keyed by that reason.
905 my %inverted_discards;
906 foreach my $code_point (sort { $a <=> $b } keys %discards) {
907 my $type = $discards{$code_point}{reason};
908 push $inverted_discards{$type}->@*, [ $code_point,
909 $discards{$code_point}{mirror}
910 ];
911 }
912
913 # Then output each list
914 foreach my $type (sort keys %inverted_discards) {
915 print STDERR "\n$type\n" if $type ne "";
916 foreach my $ref ($inverted_discards{$type}->@*) {
917 print STDERR format_pairs_line($ref->[0], $ref->[1]);
918 }
919 }
920}
921
b35552de
KW
922my $count = 0;
923my @other_invlist = prop_invlist("Other");
924for (my $i = 0; $i < @other_invlist; $i += 2) {
925 $count += ((defined $other_invlist[$i+1])
926 ? $other_invlist[$i+1]
927 : 0x110000)
928 - $other_invlist[$i];
61dad979 929}
c62fdeb7
KW
930$count = 0x110000 - $count;
931print $out_fh <<~"EOT";
932
933 /* The number of code points not matching \\pC */
934 #ifdef PERL_IN_REGCOMP_C
935 # define NON_OTHER_COUNT $count
936 #endif
937 EOT
61dad979 938
3bfc1e70
KW
939# If this release has both the CWCM and CWCF properties, find the highest code
940# point which changes under any case change. We can use this to short-circuit
941# code
942my @cwcm = prop_invlist('CWCM');
943if (@cwcm) {
944 my @cwcf = prop_invlist('CWCF');
945 if (@cwcf) {
946 my $max = ($cwcm[-1] < $cwcf[-1])
947 ? $cwcf[-1]
948 : $cwcm[-1];
c62fdeb7
KW
949 $max = sprintf "0x%X", $max - 1;
950 print $out_fh <<~"EOS";
951
952 /* The highest code point that has any type of case change */
953 #ifdef PERL_IN_UTF8_C
954 # define HIGHEST_CASE_CHANGING_CP $max
955 #endif
956 EOS
3bfc1e70
KW
957 }
958}
959
6a5bc5ac 960print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";
d10c72f2 961
61dad979
KW
962read_only_bottom_close_and_rename($out_fh);
963
9d8e3074
KW
964# DATA FORMAT
965#
69bc4c1f
KW
966# Note that any apidoc comments you want in the file need to be added to one
967# of the prints above
968#
9d8e3074
KW
969# A blank line is output as-is.
970# Comments (lines whose first non-blank is a '#') are converted to C-style,
971# though empty comments are converted to blank lines. Otherwise, each line
972# represents one #define, and begins with either a Unicode character name with
973# the blanks and dashes in it squeezed out or replaced by underscores; or it
974# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
975# case, the name will be looked-up to use as the name of the macro. In either
976# case, the macro name will have suffixes as listed above, and all blanks and
977# dashes will be replaced by underscores.
978#
979# Each line may optionally have one of the following flags on it, separated by
980# white space from the initial token.
981# string indicates that the output is to be of the string form
982# described in the comments above that are placed in the file.
983# string_skip_ifundef is the same as 'string', but instead of dying if the
984# code point doesn't exist, the line is just skipped: no output is
985# generated for it
986# first indicates that the output is to be of the FIRST_BYTE form.
987# tail indicates that the output is of the _TAIL form.
988# native indicates that the output is the code point, converted to the
989# platform's native character set if applicable
990#
991# If the code point has no official name, the desired name may be appended
992# after the flag, which will be ignored if there is an official name.
993#
994# This program is used to make it convenient to create compile time constants
995# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
996# having to figure things out.
997
61dad979 998__DATA__
f2e06375 999U+017F string
76837d21 1000
1dfa4f52 1001U+0300 string
2a614cdc 1002U+0307 string
a78bc3c6 1003
8f57fa7d 1004U+1E9E string_skip_if_undef
f2e06375 1005
a9f50d33
KW
1006U+FB05 string
1007U+FB06 string
a0ffb25e
KW
1008U+0130 string
1009U+0131 string
a9f50d33 1010
1dfa4f52 1011U+2010 string
5f0aa340
KW
1012BOM first
1013BOM tail
525b6419 1014
69bc4c1f
KW
1015BOM string
1016
1017U+FFFD string
1018
566efd88
KW
1019U+10FFFF string MAX_UNICODE
1020
df758df2
KW
1021NBSP native
1022NBSP string
1023
05016631 1024DEL native
c5eda08a
KW
1025CR native
1026LF native
d804860b
KW
1027VT native
1028ESC native
1dfa4f52 1029U+00DF native
69ffc8e3 1030U+00DF string
1dfa4f52
KW
1031U+00E5 native
1032U+00C5 native
1033U+00FF native
1034U+00B5 native
69ffc8e3 1035U+00B5 string