Commit | Line | Data |
---|---|---|
61dad979 KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
c7b32e72 | 4 | no warnings 'experimental::regex_sets'; |
3d7c117d MB |
5 | require './regen/regen_lib.pl'; |
6 | require './regen/charset_translations.pl'; | |
1df06fae | 7 | use Unicode::UCD qw(prop_invlist prop_invmap search_invlist); |
61dad979 | 8 | use charnames qw(:loose); |
dce1e563 KW |
9 | binmode(STDERR, ":utf8"); |
10 | ||
11 | # Set this to 1 temporarily to get on stderr the complete list of paired | |
12 | # string delimiters this generates. This list is suitable for plugging into a | |
13 | # pod. | |
14 | my $output_lists = 0; | |
61dad979 | 15 | |
9dfe4b75 KW |
16 | # Set this to 1 temporarily to get on stderr the complete list of punctuation |
17 | # marks and symbols that look to be directional but we didn't include for some | |
18 | # reason. | |
19 | my $output_omitteds = 0; | |
20 | ||
1b0f46bf | 21 | my $out_fh = open_new('unicode_constants.h', '>', |
ad88cddb | 22 | {style => '*', by => $0, |
61dad979 KW |
23 | from => "Unicode data"}); |
24 | ||
25 | print $out_fh <<END; | |
d10c72f2 | 26 | |
6a5bc5ac KW |
27 | #ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */ |
28 | #define PERL_UNICODE_CONSTANTS_H_ 1 | |
d10c72f2 | 29 | |
4b4853d1 KW |
30 | /* This file contains #defines for the version of Unicode being used and |
31 | * various Unicode code points. The values the code point macros expand to | |
32 | * are the native Unicode code point, or all or portions of the UTF-8 encoding | |
33 | * for the code point. In the former case, the macro name has the suffix | |
34 | * "_NATIVE"; otherwise, the suffix "_UTF8". | |
61dad979 | 35 | * |
525b6419 KW |
36 | * The macros that have the suffix "_UTF8" may have further suffixes, as |
37 | * follows: | |
38 | * "_FIRST_BYTE" if the value is just the first byte of the UTF-8 | |
39 | * representation; the value will be a numeric constant. | |
40 | * "_TAIL" if instead it represents all but the first byte. This, and | |
41 | * with no additional suffix are both string constants */ | |
61dad979 | 42 | |
69bc4c1f | 43 | /* |
3f620621 | 44 | =for apidoc_section \$unicode |
69bc4c1f | 45 | |
78342678 | 46 | =for apidoc AmnU|const char *|BOM_UTF8 |
69bc4c1f KW |
47 | |
48 | This is a macro that evaluates to a string constant of the UTF-8 bytes that | |
49 | define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl | |
50 | is compiled on. This allows code to use a mnemonic for this character that | |
51 | works on both ASCII and EBCDIC platforms. | |
52 | S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in | |
53 | bytes. | |
54 | ||
78342678 | 55 | =for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8 |
69bc4c1f KW |
56 | |
57 | This is a macro that evaluates to a string constant of the UTF-8 bytes that | |
58 | define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl | |
59 | is compiled on. This allows code to use a mnemonic for this character that | |
60 | works on both ASCII and EBCDIC platforms. | |
61 | S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in | |
62 | bytes. | |
63 | ||
64 | =cut | |
65 | */ | |
66 | ||
61dad979 KW |
67 | END |
68 | ||
63cd44e4 KW |
69 | sub backslash_x_form($$;$) { |
70 | # Output the code point represented by the byte string $bytes as a | |
71 | # sequence of \x{} constants. $bytes should be the UTF-8 for the code | |
72 | # point if the final parameter is absent or empty. Otherwise it should be | |
73 | # the Latin1 code point itself. | |
74 | # | |
75 | # The output is translated into the character set '$charset'. | |
76 | ||
77 | my ($bytes, $charset, $non_utf8) = @_; | |
78 | if ($non_utf8) { | |
79 | die "Must be utf8 if above 255" if $bytes > 255; | |
80 | my $a2n = get_a2n($charset); | |
81 | return sprintf "\\x%02X", $a2n->[$bytes]; | |
82 | } | |
83 | else { | |
84 | return join "", map { sprintf "\\x%02X", ord $_ } | |
85 | split //, cp_2_utfbytes($bytes, $charset); | |
86 | } | |
87 | } | |
88 | ||
36327261 KW |
89 | # The most complicated thing this program does is generate paired string |
90 | # delimiters from the Unicode database. Some of these come from the | |
91 | # Unicode Bidirectional (bidi) algorithm. | |
92 | ||
93 | # These all visually look like left and right delimiters | |
7e4a71c6 | 94 | my @bidi_strong_lefts = ( 'LESS-THAN', |
c1b67e77 | 95 | 'ELEMENT OF', |
6763e244 KW |
96 | 'PRECEDE', |
97 | 'PRECEDES', | |
36327261 | 98 | 'SMALLER THAN', |
0a4ec1c0 | 99 | 'SUBSET', |
7e4a71c6 KW |
100 | ); |
101 | my @bidi_strong_rights = ( 'GREATER-THAN', | |
c1b67e77 | 102 | 'CONTAINS', |
6763e244 KW |
103 | 'SUCCEED', |
104 | 'SUCCEEDS', | |
36327261 | 105 | 'LARGER THAN', |
0a4ec1c0 | 106 | 'SUPERSET', |
7e4a71c6 KW |
107 | ); |
108 | ||
109 | # Create an array of hashes for these, so as to translate between them, and | |
110 | # avoid recompiling patterns in the loop. | |
111 | my @bidi_strong_directionals; | |
112 | for (my $i = 0; $i < @bidi_strong_lefts; $i++) { | |
113 | push @bidi_strong_directionals, | |
114 | { | |
115 | LHS => $bidi_strong_lefts[$i], | |
116 | RHS => $bidi_strong_rights[$i], | |
117 | L_pattern => qr/\b$bidi_strong_lefts[$i]\b/, | |
118 | R_pattern => qr/\b$bidi_strong_rights[$i]\b/, | |
119 | }; | |
120 | } | |
121 | ||
50b17915 | 122 | my @ok_bidi_symbols = ( |
c7c4369b | 123 | 'TACK', |
84c1ecba | 124 | 'TURNSTILE', |
50b17915 KW |
125 | ); |
126 | my $ok_bidi_symbols_re = join '|', @ok_bidi_symbols; | |
127 | $ok_bidi_symbols_re = qr/\b($ok_bidi_symbols_re)\b/n; | |
128 | ||
129 | ||
130 | # Many characters have mirrors that Unicode hasn't included in their Bidi | |
131 | # algorithm. This program uses their names to find them. The next few | |
132 | # definitions are towards that end. | |
133 | ||
134 | # Most horizontal directionality is based on LEFT vs RIGHT. But it's | |
135 | # complicated: | |
136 | # 1) a barb on one or the other side of a harpoon doesn't indicate | |
137 | # directionality of the character. (A HARPOON is the word Unicode uses | |
138 | # to indicate an arrow with a one-sided tip.) | |
139 | my $no_barb_re = qr/(*nlb:BARB )/; | |
140 | ||
141 | # 2) RIGHT-SHADED doesn't signify anything about direction of the character | |
142 | # itself. These are the suffixes Unicode uses to indicate this. /aa is | |
143 | # needed because the wildcard names feature currently requires it for names. | |
144 | my $shaded_re = qr/ [- ] (SHADED | SHADOWED) /naax; | |
145 | ||
146 | # 3a) there are a few anomalies caught here. 'LEFT LUGGAGE' would have been | |
147 | # better named UNCLAIMED, and doesn't indicate directionality. | |
148 | my $real_LEFT_re = qr/ \b $no_barb_re LEFT (*nla: $shaded_re) | |
149 | (*nla: [ ] LUGGAGE \b) | |
150 | /nx; | |
151 | # 3b) And in most cases,a RIGHT TRIANGLE also doesn't refer to | |
152 | # directionality, but indicates it contains a 90 degree angle. | |
153 | my $real_RIGHT_re = qr/ \b $no_barb_re RIGHT (*nla: $shaded_re) | |
154 | (*nla: [ ] (TRI)? ANGLE \b) | |
155 | /nx; | |
156 | # More items could be added to these as needed | |
157 | ||
158 | # 4) something that is pointing R goes on the left, so is different than | |
159 | # the character on the R. For example, a RIGHT BRACKET would be | |
160 | # different from a RIGHT-FACING bracket. These patterns capture the | |
161 | # typical ways that Unicode character names indicate the latter meaning | |
162 | # as a suffix to RIGHT or LEFT | |
163 | my $pointing_suffix_re = qr/ ( WARDS # e.g., RIGHTWARDS | |
164 | | [ ] ARROW # A R arrow points to the R | |
165 | | [ -] FACING | |
166 | | [ -] POINTING | |
167 | | [ ] PENCIL # Implies a direction of its | |
168 | # point | |
169 | ) \b /nx; | |
170 | # And correspondingly for a prefix for LEFT RIGHT | |
171 | my $pointing_prefix_re = qr/ \b ( # e.g. UP RIGHT implies a direction | |
172 | UP ( [ ] AND)? | |
173 | | DOWN ( [ ] AND)? | |
174 | | CONVERGING | |
175 | | POINTING [ ] (DIRECTLY)? | |
176 | | TO [ ] THE | |
177 | ) | |
178 | [ ] | |
179 | /nx; | |
180 | ||
7e4a71c6 KW |
181 | my @other_directionals = |
182 | { | |
183 | LHS => 'LEFT', | |
184 | RHS => 'RIGHT', | |
185 | L_pattern => | |
50b17915 KW |
186 | # Something goes on the left if it contains LEFT and doesn't |
187 | # point left, or it contains RIGHT and does point right. | |
188 | qr/ \b (*nlb: $pointing_prefix_re) $real_LEFT_re | |
189 | (*nla: $pointing_suffix_re) | |
190 | | \b (*plb: $pointing_prefix_re) $real_RIGHT_re \b | |
191 | | \b $real_RIGHT_re (*pla: $pointing_suffix_re) | |
192 | /nx, | |
7e4a71c6 | 193 | R_pattern => |
50b17915 KW |
194 | qr/ \b (*nlb: $pointing_prefix_re) $real_RIGHT_re |
195 | (*nla: $pointing_suffix_re) | |
196 | | \b (*plb: $pointing_prefix_re) $real_LEFT_re \b | |
197 | | \b $real_LEFT_re (*pla: $pointing_suffix_re) | |
198 | /nx, | |
7e4a71c6 KW |
199 | }; |
200 | ||
50b17915 KW |
201 | # Some horizontal directionality is based on EAST vs WEST. These words are |
202 | # almost always used by Unicode to indicate the direction pointing to, without | |
203 | # the general consistency in phrasing in L/R above. There are a handful of | |
204 | # possible exceptions, with only WEST WIND ever at all possibly an issue | |
205 | push @other_directionals, | |
206 | { | |
207 | LHS => 'EAST', | |
208 | RHS => 'WEST', | |
209 | L_pattern => qr/ \b ( EAST (*nla: [ ] WIND) | |
210 | | WEST (*pla: [ ] WIND)) \b /x, | |
211 | R_pattern => qr/ \b ( WEST (*nla: [ ] WIND) | |
212 | | EAST (*pla: [ ] WIND)) \b /x, | |
213 | }; | |
214 | ||
215 | # The final way the Unicode signals mirroring is by using the words REVERSE or | |
216 | # REVERSED; | |
565fbe1b | 217 | my $reverse_re = qr/ \b REVERSE D? [- ] /x; |
dce1e563 | 218 | |
7e4a71c6 KW |
219 | # Create a mapping from each direction to its opposite one |
220 | my %opposite_of; | |
221 | foreach my $directional (@bidi_strong_directionals, @other_directionals) { | |
222 | $opposite_of{$directional->{LHS}} = $directional->{RHS}; | |
223 | $opposite_of{$directional->{RHS}} = $directional->{LHS}; | |
224 | } | |
225 | ||
226 | # Join the two types of each direction as alternatives | |
227 | my $L_re = join "|", map { $_->{L_pattern} } @bidi_strong_directionals, | |
228 | @other_directionals; | |
229 | my $R_re = join "|", map { $_->{R_pattern} } @bidi_strong_directionals, | |
230 | @other_directionals; | |
231 | # And anything containing directionality will be either one of these two | |
232 | my $directional_re = join "|", $L_re, $R_re; | |
dce1e563 | 233 | |
7e4a71c6 KW |
234 | # Now compile the strings that result from above |
235 | $L_re = qr/$L_re/; | |
236 | $R_re = qr/$R_re/; | |
237 | $directional_re = qr/($directional_re)/; # Make sure to capture $1 | |
dce1e563 | 238 | |
3e9e4fd8 | 239 | my @included_symbols = ( |
806eaaf9 | 240 | 0x2326, 0x232B, # ERASE |
a29825fb KW |
241 | 0x23E9 .. 0x23EA, # DOUBLE TRIANGLE |
242 | 0x23ED .. 0x23EE, # DOUBLE TRIANGLE with BAR | |
3e9e4fd8 KW |
243 | 0x269E .. 0x269F, # THREE LINES CONVERGING |
244 | 0x1D102 .. 0x1D103, # MUSIC STAVES | |
245 | 0x1D106 .. 0x1D107, # MUSIC STAVES | |
ad558b4e KW |
246 | 0x1F57B, # TELEPHONE RECEIVER |
247 | 0x1F57D, # TELEPHONE RECEIVER | |
a863e562 KW |
248 | 0x1F508 .. 0x1F50A, # LOUD SPEAKER |
249 | 0x1F568 .. 0x1F56A, # LOUD SPEAKER | |
e73a9e14 | 250 | 0x1F5E6 .. 0x1F5E7, # THREE RAYS |
3e9e4fd8 KW |
251 | ); |
252 | my %included_symbols; | |
253 | $included_symbols{$_} = 1 for @included_symbols; | |
254 | ||
9dfe4b75 | 255 | sub format_pairs_line($;$) { |
dce1e563 KW |
256 | my ($from, $to) = @_; |
257 | ||
9dfe4b75 | 258 | # Format a line containing a character singleton or pair in preparation |
dce1e563 KW |
259 | # for output, suitable for pod. |
260 | ||
261 | my $lhs_name = charnames::viacode($from); | |
262 | my $lhs_hex = sprintf "%04X", $from; | |
263 | my $rhs_name; | |
264 | my $rhs_hex; | |
265 | my $name = $lhs_name; | |
266 | ||
267 | my $hanging_indent = 26; | |
268 | ||
9dfe4b75 KW |
269 | # Treat a trivial pair as a singleton |
270 | undef $to if defined $to && $to == $from; | |
271 | ||
dce1e563 KW |
272 | if (defined $to) { |
273 | my $rhs_name = charnames::viacode($to); | |
274 | $rhs_hex = sprintf "%04X", $to; | |
275 | ||
276 | # Most of the names differ only in LEFT vs RIGHT; some in | |
277 | # LESS-THAN vs GREATER-THAN. It takes less space, and is easier to | |
278 | # understand if they are displayed combined. | |
279 | if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) { | |
280 | $name =~ s,$directional_re,$1/$opposite_of{$1},g; | |
281 | } | |
282 | else { # Otherwise, display them sequentially | |
283 | $name .= ", " . $rhs_name; | |
284 | } | |
285 | } | |
286 | ||
287 | # Handle double-width characters, based on the East Asian Width property. | |
288 | # Add an extra space to non-wide ones so things stay vertically aligned. | |
289 | my $extra = 0; | |
290 | my $output_line = " " # Indent in case output being used for verbatim | |
291 | # pod | |
292 | . chr $from; | |
293 | if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) { | |
294 | $extra++; # The length() will be shorter than the displayed | |
295 | # width | |
296 | } | |
297 | else { | |
298 | $output_line .= " "; | |
299 | } | |
300 | if (defined $to) { | |
301 | $output_line .= " " . chr $to; | |
302 | if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) { | |
303 | $extra++; | |
304 | } | |
305 | else { | |
306 | $output_line .= " "; | |
307 | } | |
308 | } | |
309 | else { | |
310 | $output_line .= " "; | |
311 | } | |
312 | ||
313 | $output_line .= " U+$lhs_hex"; | |
314 | $output_line .= ", U+$rhs_hex" if defined $to;; | |
315 | my $cur_len = $extra + length $output_line; | |
316 | $output_line .= " " x ($hanging_indent - $cur_len); | |
317 | ||
318 | my $max_len = 74; # Pod formatter will indent 4 spaces | |
319 | $cur_len = length $output_line; | |
320 | ||
321 | if ($cur_len + length $name <= $max_len) { | |
322 | $output_line .= $name; # It will fit | |
323 | } | |
324 | else { # It won't fit. Append a segment that is unbreakable until would | |
325 | # exceed the available width; then start on a new line | |
326 | # Doesn't handle the case where the whole segment doesn't fit; | |
327 | # this just doesn't come up with the input data. | |
328 | while ($name =~ / ( .+? ) \b{lb} /xg) { | |
329 | my $segment = $1; | |
330 | my $added_length = length $segment; | |
331 | if ($cur_len + $added_length > $max_len) { | |
332 | $output_line =~ s/ +$//; | |
333 | $output_line .= "\n" . " " x $hanging_indent; | |
334 | $cur_len = $hanging_indent; | |
335 | } | |
336 | ||
337 | $output_line .= $segment; | |
338 | $cur_len += $added_length; | |
339 | } | |
340 | } | |
341 | ||
342 | return $output_line . "\n"; | |
343 | } | |
344 | ||
4b4853d1 KW |
345 | my $version = Unicode::UCD::UnicodeVersion(); |
346 | my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x; | |
347 | $dotdot = 0 unless defined $dotdot; | |
348 | ||
349 | print $out_fh <<END; | |
350 | #define UNICODE_MAJOR_VERSION $major | |
351 | #define UNICODE_DOT_VERSION $dot | |
352 | #define UNICODE_DOT_DOT_VERSION $dotdot | |
353 | ||
354 | END | |
355 | ||
c7b32e72 KW |
356 | # Gather the characters in Unicode that have left/right symmetry suitable for |
357 | # paired string delimiters | |
7e4a71c6 | 358 | my %paireds; |
c7b32e72 | 359 | |
1df06fae KW |
360 | # So don't have to grep an array to determine if have already dealt with the |
361 | # characters that are the keys | |
362 | my %inverted_paireds; | |
363 | ||
c7b32e72 KW |
364 | # This property is the universe of all characters in Unicode which |
365 | # are of some import to the Bidirectional Algorithm, and for which there is | |
366 | # another Unicode character that is a mirror of it. | |
367 | my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) = | |
368 | prop_invmap("Bidi_Mirroring_Glyph"); | |
369 | ||
9dfe4b75 KW |
370 | # Keep track of the characters we don't use, and why not. |
371 | my %discards; | |
372 | my $non_directional = 'No perceived horizontal direction'; | |
373 | my $not_considered_directional_because = "Not considered directional because"; | |
50b17915 | 374 | my $trailing_up_down = 'Vertical direction after all L/R direction'; |
9dfe4b75 | 375 | my $unpaired = "Didn't find a mirror"; |
210ad843 | 376 | my $illegal = "Mirror illegal"; |
9dfe4b75 | 377 | my $no_encoded_mate = "Mirrored, but Unicode has no encoded mirror"; |
7e4a71c6 | 378 | my $bidirectional = "Bidirectional"; |
9dfe4b75 | 379 | |
50b17915 KW |
380 | my %unused_bidi_pairs; |
381 | my %inverted_unused_bidi_pairs; | |
382 | my %unused_pairs; # | |
383 | my %inverted_unused_pairs; | |
384 | ||
385 | # Could be more explicit about allowing, e.g. ARROWS, ARROWHEAD, but this | |
386 | # suffices | |
387 | my $arrow_like_re = qr/\b(ARROW|HARPOON)/; | |
388 | ||
389 | # Go through the Unicode Punctuation and Symbol characters looking for ones | |
390 | # that have mirrors, suitable for being string delimiters. Some of these are | |
391 | # easily derivable from Unicode properties dealing with the bidirectional | |
392 | # algorithm. But the purpose of that algorithm isn't the same as ours, and | |
393 | # excludes many suitable ones. In particular, no arrows are included in it. | |
394 | # To find suitable ones, we also look at character names to see if there is a | |
395 | # character with that name, but the horizontal direction reversed. That will | |
396 | # almost certainly be a mirror. | |
210ad843 | 397 | foreach my $list (qw(Punctuation Symbol)) { |
1df06fae KW |
398 | my @invlist = prop_invlist($list); |
399 | die "Empty list $list" unless @invlist; | |
400 | ||
7e4a71c6 KW |
401 | my $is_Symbol = $list eq 'Symbol'; |
402 | ||
1df06fae KW |
403 | # Convert from an inversion list to an array containing everything that |
404 | # matches. (This uses the recipe given in Unicode::UCD.) | |
405 | my @full_list; | |
406 | for (my $i = 0; $i < @invlist; $i += 2) { | |
407 | my $upper = ($i + 1) < @invlist | |
408 | ? $invlist[$i+1] - 1 # In range | |
409 | : $Unicode::UCD::MAX_CP; # To infinity. | |
410 | for my $j ($invlist[$i] .. $upper) { | |
411 | push @full_list, $j; | |
412 | } | |
413 | } | |
414 | ||
415 | CODE_POINT: | |
416 | foreach my $code_point (@full_list) { | |
417 | #print STDERR __FILE__, ": ", __LINE__, ": ", sprintf("%04x ", $code_point), charnames::viacode($code_point), "\n"; | |
418 | my $chr = chr $code_point; | |
c7b32e72 | 419 | |
1df06fae KW |
420 | # Don't reexamine something we've already determined. This happens |
421 | # when its mate was earlier processed and found this one. | |
50b17915 KW |
422 | foreach my $hash_ref (\%paireds, \%inverted_paireds, |
423 | \%unused_bidi_pairs, \%inverted_unused_bidi_pairs, | |
424 | \%unused_pairs, \%inverted_unused_pairs) | |
425 | { | |
1df06fae KW |
426 | next CODE_POINT if exists $hash_ref->{$code_point} |
427 | } | |
c7b32e72 | 428 | |
1df06fae | 429 | my $name = charnames::viacode($code_point); |
50b17915 | 430 | my $original_had_REVERSE; |
1df06fae KW |
431 | my $mirror; |
432 | my $mirror_code_point; | |
433 | ||
434 | # If Unicode considers this to have a mirror, we don't have to go | |
435 | # looking | |
436 | if ($chr =~ /\p{Bidi_Mirrored}/) { | |
437 | my $i = search_invlist($bmg_invlist, $code_point); | |
438 | $mirror_code_point = $bmg_invmap->[$i]; | |
439 | if ( $mirror_code_point eq $bmg_default) { | |
9dfe4b75 KW |
440 | $discards{$code_point} = { reason => $no_encoded_mate, |
441 | mirror => undef | |
442 | }; | |
1df06fae KW |
443 | next; |
444 | } | |
c7b32e72 | 445 | |
1df06fae KW |
446 | # Certain Unicode properties classify some mirrored characters as |
447 | # opening (left) vs closing (right). Skip the closing ones this | |
448 | # iteration; they will be handled later when the opening mate | |
449 | # comes along. | |
450 | if ($chr =~ /(?[ \p{BPT=Close} | |
451 | | \p{Gc=Close_Punctuation} | |
452 | ])/) | |
453 | { | |
454 | next; # Get this when its opening mirror comes up. | |
455 | } | |
456 | elsif ($chr =~ /(?[ \p{BPT=Open} | |
c7b32e72 KW |
457 | | \p{Gc=Open_Punctuation} |
458 | | \p{Gc=Initial_Punctuation} | |
1df06fae | 459 | | \p{Gc=Final_Punctuation} |
c7b32e72 | 460 | ])/) |
1df06fae KW |
461 | { |
462 | # Here, it's a left delimiter. (The ones in Final Punctuation | |
463 | # can be opening ones in some languages.) | |
464 | $paireds{$code_point} = $mirror_code_point; | |
465 | $inverted_paireds{$mirror_code_point} = $code_point; | |
466 | ||
467 | # If the delimiter can be used on either side, add its | |
468 | # complement | |
469 | if ($chr =~ /(?[ \p{Gc=Initial_Punctuation} | |
470 | | \p{Gc=Final_Punctuation} | |
471 | ])/) | |
472 | { | |
473 | $paireds{$mirror_code_point} = $code_point; | |
474 | $inverted_paireds{$code_point} = $mirror_code_point; | |
475 | } | |
476 | ||
477 | next; | |
478 | } | |
835f2666 | 479 | |
7e4a71c6 KW |
480 | # Unicode doesn't consider '< >' to be brackets, but Perl does. There are |
481 | # lots of variants of these in Unicode; easiest to accept all of | |
482 | # them that aren't bidirectional (which would be visually | |
483 | # confusing). | |
484 | for (my $i = 0; $i < @bidi_strong_directionals; $i++) { | |
485 | my $hash_ref = $bidi_strong_directionals[$i]; | |
486 | ||
487 | next if $name !~ $hash_ref->{L_pattern}; | |
488 | ||
489 | if ($name =~ $hash_ref->{R_pattern}) { | |
490 | $discards{$code_point} = { reason => $bidirectional, | |
491 | mirror => $mirror_code_point | |
492 | }; | |
493 | next CODE_POINT; | |
494 | } | |
495 | ||
496 | $paireds{$code_point} = $mirror_code_point; | |
497 | $inverted_paireds{$mirror_code_point} = $code_point; | |
50b17915 | 498 | $original_had_REVERSE = $name =~ /$reverse_re/; |
7e4a71c6 KW |
499 | next CODE_POINT; |
500 | } | |
501 | ||
50b17915 KW |
502 | # The other paired symbols are more iffy as being desirable paired |
503 | # delimiters; we let the code below decide what to do with them. | |
504 | $mirror = charnames::viacode($mirror_code_point); | |
1df06fae KW |
505 | } |
506 | else { # Here is not involved with the bidirectional algorithm. | |
c7b32e72 | 507 | |
1df06fae KW |
508 | # Get the mirror (if any) from reversing the directions in the |
509 | # name, and looking that up | |
510 | $mirror = $name; | |
511 | $mirror =~ s/$directional_re/$opposite_of{$1}/g; | |
50b17915 | 512 | $original_had_REVERSE = $mirror =~ s/$reverse_re//g; |
1df06fae KW |
513 | $mirror_code_point = charnames::vianame($mirror); |
514 | } | |
515 | ||
50b17915 KW |
516 | # Letter-like symbols don't really stand on their own and don't look |
517 | # like traditional delimiters. | |
518 | if ($chr =~ /\p{Sk}/) { | |
519 | $discards{$code_point} | |
520 | = { reason => "Letter-like symbols are not eligible", | |
521 | mirror => $mirror_code_point | |
522 | }; | |
523 | next CODE_POINT; | |
524 | } | |
1df06fae KW |
525 | |
526 | # Certain names are always treated as non directional. | |
50b17915 KW |
527 | if ($name =~ m{ \b ( WITH [ ] (?:LEFT|RIGHT) [ ] HALF [ ] BLACK |
528 | | BLOCK | |
529 | | BOX [ ] DRAWINGS | |
530 | | CIRCLE [ ] WITH | |
531 | | EXTENSION | |
532 | | (?: UPPER | LOWER ) [ ] HOOK | |
533 | ||
1df06fae KW |
534 | # The VERTICAL marks these as not actually |
535 | # L/R mirrored. | |
50b17915 KW |
536 | | PRESENTATION [ ] FORM [ ] FOR [ ] VERTICAL |
537 | | QUADRANT | |
538 | | SHADE | |
539 | | SQUARE [ ] WITH | |
1df06fae KW |
540 | ) \b }x) |
541 | { | |
9dfe4b75 KW |
542 | $discards{$code_point} |
543 | = { reason => "$not_considered_directional_because name" | |
544 | . " contains '$1'", | |
545 | mirror => $mirror_code_point | |
546 | }; | |
1df06fae KW |
547 | next CODE_POINT; |
548 | } | |
549 | ||
550 | # If these are equal, it means the original had no horizontal | |
551 | # directioning | |
552 | if ($name eq $mirror) { | |
9dfe4b75 KW |
553 | $discards{$code_point} = { reason => $non_directional, |
554 | mirror => undef | |
555 | }; | |
1df06fae KW |
556 | next CODE_POINT; |
557 | } | |
558 | ||
50b17915 KW |
559 | # If the name has both left and right directions, it is bidirectional, |
560 | # so not suited to be a paired delimiter. | |
561 | if ($name =~ $L_re && $name =~ $R_re) { | |
562 | $discards{$code_point} = { reason => $bidirectional, | |
563 | mirror => $mirror_code_point | |
564 | }; | |
565 | next CODE_POINT; | |
566 | } | |
567 | ||
9153861e KW |
568 | # If no mate was found, it could be that it's like the case of |
569 | # SPEAKER vs RIGHT SPEAKER (which probably means the mirror was added | |
570 | # in a later version than the original. Check by removing all | |
571 | # directionality and trying to see if there is a character with that | |
572 | # name. | |
1df06fae | 573 | if (! defined $mirror_code_point) { |
9153861e KW |
574 | $mirror =~ s/$directional_re //; |
575 | $mirror_code_point = charnames::vianame($mirror); | |
576 | if (! defined $mirror_code_point) { | |
577 | ||
578 | # Still no mate. | |
9dfe4b75 KW |
579 | $discards{$code_point} = { reason => $unpaired, |
580 | mirror => undef | |
581 | }; | |
1df06fae | 582 | next; |
9153861e | 583 | } |
1df06fae KW |
584 | } |
585 | ||
586 | if ($code_point == $mirror_code_point) { | |
9dfe4b75 KW |
587 | $discards{$code_point} = |
588 | { reason => "$unpaired - Single character, multiple" | |
589 | . " names; Unicode name correction", | |
590 | mirror => $mirror_code_point | |
591 | }; | |
1df06fae KW |
592 | next; |
593 | } | |
594 | ||
50b17915 KW |
595 | if ($is_Symbol) { |
596 | ||
597 | # Skip if the the direction is followed by a vertical motion | |
598 | # (which defeats the left-right directionality). | |
599 | if ( $name =~ / ^ .* $no_barb_re | |
600 | \b (UP|DOWN|NORTH|SOUTH) /gx | |
601 | and not $name =~ /$directional_re/g) | |
602 | { | |
603 | $discards{$code_point} = { reason => $trailing_up_down, | |
604 | mirror => $mirror_code_point | |
605 | }; | |
606 | next; | |
607 | } | |
608 | } | |
609 | ||
210ad843 KW |
610 | # There are a few characters like REVERSED SEMICOLON that are mirrors, |
611 | # but have always commonly been used unmirrored. There is also the | |
612 | # PILCROW SIGN and its mirror which might be considered to be | |
613 | # legitimate mirrors, but maybe not. Additionally the current | |
614 | # algorithm for finding the mirror depends on each member of a pair | |
615 | # being respresented by the same number of bytes as its mate. By | |
616 | # skipping these, we solve both problems | |
617 | if ($code_point < 256 != $mirror_code_point < 256) { | |
618 | $discards{$code_point} = { reason => $illegal, | |
619 | mirror => $mirror_code_point | |
620 | }; | |
621 | next; | |
622 | } | |
623 | ||
624 | # And '/' and '\' are mirrors that we don't accept | |
625 | if ( $name =~ /SOLIDUS/ | |
626 | && $name =~ s/REVERSE SOLIDUS/SOLIDUS/r | |
627 | eq $mirror =~ s/REVERSE SOLIDUS/SOLIDUS/r) | |
628 | { | |
629 | $discards{$code_point} = { reason => $illegal, | |
630 | mirror => $mirror_code_point | |
631 | }; | |
632 | next; | |
633 | } | |
634 | ||
50b17915 KW |
635 | # We enter the pair with the original code point on the left; if it |
636 | # should instead be on the R, swap. Most Symbols that contain the | |
637 | # word REVERSE go on the rhs, except those whose names explicitly | |
638 | # indicate lhs. FINAL in the name indicates stays on the rhs. | |
639 | if ($name =~ $R_re || ( $original_had_REVERSE | |
640 | && $is_Symbol | |
641 | && $name !~ $L_re | |
642 | && $name !~ /\bFINAL\b/ | |
643 | )) | |
644 | { | |
645 | my $temp = $code_point; | |
646 | $code_point = $mirror_code_point; | |
647 | $mirror_code_point = $temp; | |
648 | } | |
649 | ||
4a4b7455 KW |
650 | # Only a few symbols are currently used, determined by inspection, but |
651 | # all the (few) remaining paired punctuations. | |
50b17915 | 652 | if ( ! $is_Symbol |
3e9e4fd8 | 653 | || defined $included_symbols{$code_point} |
50b17915 KW |
654 | || ( $chr =~ /\p{BidiMirrored}/ |
655 | && ( $name =~ $ok_bidi_symbols_re | |
656 | || $mirror =~ $ok_bidi_symbols_re)) | |
4a4b7455 | 657 | || $name =~ /\bINDEX\b/ # index FINGER pointing |
0b6e3da1 KW |
658 | |
659 | # Also accept most arrows that don't have N/S in their | |
660 | # names. (Those are almost all currently pointing at an | |
661 | # angle, like SW anyway.) | |
662 | || ( $name !~ /\bNORTH|SOUTH\b/ | |
663 | && $name =~ $arrow_like_re | |
664 | ||
665 | # Arguably bi-directional | |
666 | && $name !~ /U-SHAPED/) | |
50b17915 | 667 | ) { |
1df06fae KW |
668 | $paireds{$code_point} = $mirror_code_point; |
669 | $inverted_paireds{$mirror_code_point} = $code_point; | |
670 | ||
671 | # Again, accept either one at either end for these ambiguous | |
672 | # punctuation delimiters | |
673 | if ($chr =~ /[\p{PI}\p{PF}]/x) { | |
674 | $paireds{$mirror_code_point} = $code_point; | |
675 | $inverted_paireds{$code_point} = $mirror_code_point; | |
676 | } | |
50b17915 KW |
677 | } |
678 | elsif ( $chr =~ /\p{BidiMirrored}/ | |
679 | && ! exists $inverted_unused_bidi_pairs{$code_point} | |
680 | && ! defined $inverted_unused_bidi_pairs{$code_point}) | |
681 | { | |
682 | $unused_bidi_pairs{$code_point} = $mirror_code_point; | |
683 | $inverted_unused_bidi_pairs{$mirror_code_point} = $code_point; | |
684 | } | |
685 | elsif ( ! exists $inverted_unused_pairs{$code_point} | |
686 | && ! defined $inverted_unused_pairs{$code_point}) | |
687 | { # A pair that we don't currently accept | |
688 | $unused_pairs{$code_point} = $mirror_code_point; | |
689 | $inverted_unused_pairs{$mirror_code_point} = $code_point; | |
690 | } | |
1df06fae KW |
691 | } # End of loop through code points |
692 | } # End of loop through properties | |
c7b32e72 KW |
693 | |
694 | # The rest of the data are at __DATA__ in this file. | |
61dad979 | 695 | |
ad88cddb KW |
696 | my @data = <DATA>; |
697 | ||
698 | foreach my $charset (get_supported_code_pages()) { | |
699 | print $out_fh "\n" . get_conditional_compile_line_start($charset); | |
700 | ||
c30a0cf2 | 701 | my @a2n = @{get_a2n($charset)}; |
ad88cddb | 702 | |
4a4b1311 KW |
703 | for ( @data ) { |
704 | chomp; | |
705 | ||
706 | # Convert any '#' comments to /* ... */; empty lines and comments are | |
707 | # output as blank lines | |
708 | if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) { | |
709 | my $comment_body = $1 // ""; | |
710 | if ($comment_body ne "") { | |
711 | print $out_fh "/* $comment_body */\n"; | |
712 | } | |
713 | else { | |
714 | print $out_fh "\n"; | |
715 | } | |
716 | next; | |
5a731a17 | 717 | } |
76837d21 | 718 | |
4a4b1311 KW |
719 | unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token |
720 | (?: [\ ]+ ( [^ ]* ) )? # optional flag | |
721 | (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required | |
722 | /x) | |
723 | { | |
724 | die "Unexpected syntax at line $.: $_\n"; | |
725 | } | |
61dad979 | 726 | |
4a4b1311 KW |
727 | my $name_or_cp = $1; |
728 | my $flag = $2; | |
729 | my $desired_name = $3; | |
730 | ||
731 | my $name; | |
732 | my $cp; | |
733 | my $U_cp; # code point in Unicode (not-native) terms | |
4a4b1311 KW |
734 | |
735 | if ($name_or_cp =~ /^U\+(.*)/) { | |
736 | $U_cp = hex $1; | |
737 | $name = charnames::viacode($name_or_cp); | |
738 | if (! defined $name) { | |
280ac755 KW |
739 | next if $flag =~ /skip_if_undef/; |
740 | die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name; | |
4a4b1311 KW |
741 | $name = ""; |
742 | } | |
743 | } | |
744 | else { | |
745 | $name = $name_or_cp; | |
746 | die "Unknown name '$name' at line $.: $_\n" unless defined $name; | |
747 | $U_cp = charnames::vianame($name =~ s/_/ /gr); | |
632c9f80 | 748 | } |
61dad979 | 749 | |
4a4b1311 KW |
750 | $cp = ($U_cp < 256) |
751 | ? $a2n[$U_cp] | |
752 | : $U_cp; | |
ad88cddb | 753 | |
4a4b1311 KW |
754 | $name = $desired_name if $name eq "" && $desired_name; |
755 | $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes | |
61dad979 | 756 | |
4a4b1311 KW |
757 | my $str; |
758 | my $suffix; | |
759 | if (defined $flag && $flag eq 'native') { | |
760 | die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; | |
761 | $suffix = '_NATIVE'; | |
762 | $str = sprintf "0x%02X", $cp; # Is a numeric constant | |
81a2a11f KW |
763 | } |
764 | else { | |
63cd44e4 | 765 | $str = backslash_x_form($U_cp, $charset); |
4a4b1311 KW |
766 | |
767 | $suffix = '_UTF8'; | |
768 | if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { | |
769 | $str = "\"$str\""; # Will be a string constant | |
770 | } elsif ($flag eq 'tail') { | |
771 | $str =~ s/\\x..//; # Remove the first byte | |
772 | $suffix .= '_TAIL'; | |
773 | $str = "\"$str\""; # Will be a string constant | |
774 | } | |
775 | elsif ($flag eq 'first') { | |
776 | $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte | |
777 | $suffix .= '_FIRST_BYTE'; | |
778 | $str = "0x$str"; # Is a numeric constant | |
779 | } | |
780 | else { | |
781 | die "Unknown flag at line $.: $_\n"; | |
782 | } | |
81a2a11f | 783 | } |
4a4b1311 | 784 | printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; |
a1beba5b | 785 | } |
09cc440d | 786 | |
c7b32e72 KW |
787 | # Now output the strings of opening/closing delimiters. The Unicode |
788 | # values were earlier entered into %paireds | |
789 | my $utf8_opening = ""; | |
790 | my $utf8_closing = ""; | |
791 | my $non_utf8_opening = ""; | |
792 | my $non_utf8_closing = ""; | |
793 | my $deprecated_if_not_mirrored = ""; | |
794 | my $non_utf8_deprecated_if_not_mirrored = ""; | |
795 | ||
796 | for my $from (sort { $a <=> $b } keys %paireds) { | |
797 | my $to = $paireds{$from}; | |
798 | my $utf8_from_backslashed = backslash_x_form($from, $charset); | |
799 | my $utf8_to_backslashed = backslash_x_form($to, $charset); | |
800 | my $non_utf8_from_backslashed; | |
801 | my $non_utf8_to_backslashed; | |
802 | ||
803 | $utf8_opening .= $utf8_from_backslashed; | |
804 | $utf8_closing .= $utf8_to_backslashed; | |
805 | ||
806 | if ($from < 256) { | |
807 | $non_utf8_from_backslashed = | |
808 | backslash_x_form($from, $charset, 'not_utf8'); | |
809 | $non_utf8_to_backslashed = | |
810 | backslash_x_form($to, $charset, 'not_utf8'); | |
811 | ||
812 | $non_utf8_opening .= $non_utf8_from_backslashed; | |
813 | $non_utf8_closing .= $non_utf8_to_backslashed; | |
814 | } | |
815 | ||
816 | # Only the ASCII range paired delimiters have traditionally been | |
817 | # accepted. Until the feature is considered standard, the non-ASCII | |
818 | # opening ones must be deprecated when the feature isn't in effect, so | |
819 | # as to warn about behavior that is planned to change. | |
820 | if ($from > 127) { | |
821 | $deprecated_if_not_mirrored .= $utf8_from_backslashed; | |
822 | $non_utf8_deprecated_if_not_mirrored .= | |
823 | $non_utf8_from_backslashed if $from < 256; | |
835f2666 KW |
824 | |
825 | # We deprecate using any of these strongly directional characters | |
826 | # at either end of the string, in part so we could allow them to | |
827 | # be reversed. | |
828 | $deprecated_if_not_mirrored .= $utf8_to_backslashed | |
829 | if index ($deprecated_if_not_mirrored, | |
830 | $utf8_to_backslashed) < 0; | |
c7b32e72 KW |
831 | } |
832 | ||
833 | # The implementing code in toke.c assumes that the byte length of each | |
834 | # opening delimiter is the same as its mirrored closing one. This | |
835 | # makes sure of that by checking upon each iteration of the loop. | |
836 | if (length $utf8_opening != length $utf8_closing) { | |
837 | die "Byte length of representation of '" | |
838 | . charnames::viacode($from) | |
839 | . " differs from its mapping '" | |
840 | . charnames::viacode($to) | |
841 | . "'"; | |
842 | } | |
dce1e563 KW |
843 | |
844 | print STDERR format_pairs_line($from, $to) if $output_lists; | |
c7b32e72 | 845 | } |
dce1e563 | 846 | $output_lists = 0; # Only output in first iteration |
c7b32e72 KW |
847 | |
848 | print $out_fh <<~"EOT"; | |
849 | ||
850 | # ifdef PERL_IN_TOKE_C | |
851 | /* Paired characters for quote-like operators, in UTF-8 */ | |
852 | # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening" | |
853 | # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing" | |
854 | ||
855 | /* And not in UTF-8 */ | |
856 | # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening" | |
857 | # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing" | |
858 | ||
859 | /* And what's deprecated */ | |
860 | # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored" | |
861 | # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored" | |
862 | # endif | |
863 | EOT | |
864 | ||
09cc440d KW |
865 | my $max_PRINT_A = 0; |
866 | for my $i (0x20 .. 0x7E) { | |
867 | $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A; | |
868 | } | |
c62fdeb7 KW |
869 | $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A; |
870 | print $out_fh <<"EOT"; | |
09cc440d | 871 | |
e80ffeda KW |
872 | # ifdef PERL_IN_REGCOMP_C |
873 | # define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */ | |
874 | # endif | |
c62fdeb7 KW |
875 | EOT |
876 | ||
877 | print $out_fh get_conditional_compile_line_end(); | |
b35552de KW |
878 | |
879 | } | |
880 | ||
9dfe4b75 KW |
881 | if ($output_omitteds) { |
882 | # We haven't bothered to delete things that later became used. | |
50b17915 KW |
883 | foreach my $which (\%paireds, |
884 | \%unused_bidi_pairs, | |
885 | \%unused_pairs) | |
886 | { | |
9dfe4b75 KW |
887 | foreach my $lhs (keys $which->%*) { |
888 | delete $discards{$lhs}; | |
889 | delete $discards{$which->{$lhs}}; | |
890 | } | |
891 | } | |
50b17915 KW |
892 | |
893 | print STDERR "\nMirrored says Unicode, but not currently used as paired string delimiters\n"; | |
894 | foreach my $from (sort { $a <=> $b } keys %unused_bidi_pairs) { | |
895 | print STDERR format_pairs_line($from, $unused_bidi_pairs{$from}); | |
896 | } | |
897 | ||
898 | print STDERR "\nMirror found by name, but not currently used as paired string delimiters\n"; | |
899 | foreach my $from (sort { $a <=> $b } keys %unused_pairs) { | |
900 | print STDERR format_pairs_line($from, $unused_pairs{$from}); | |
901 | } | |
9dfe4b75 KW |
902 | |
903 | # Invert %discards so that all the code points for a given reason are | |
904 | # keyed by that reason. | |
905 | my %inverted_discards; | |
906 | foreach my $code_point (sort { $a <=> $b } keys %discards) { | |
907 | my $type = $discards{$code_point}{reason}; | |
908 | push $inverted_discards{$type}->@*, [ $code_point, | |
909 | $discards{$code_point}{mirror} | |
910 | ]; | |
911 | } | |
912 | ||
913 | # Then output each list | |
914 | foreach my $type (sort keys %inverted_discards) { | |
915 | print STDERR "\n$type\n" if $type ne ""; | |
916 | foreach my $ref ($inverted_discards{$type}->@*) { | |
917 | print STDERR format_pairs_line($ref->[0], $ref->[1]); | |
918 | } | |
919 | } | |
920 | } | |
921 | ||
b35552de KW |
922 | my $count = 0; |
923 | my @other_invlist = prop_invlist("Other"); | |
924 | for (my $i = 0; $i < @other_invlist; $i += 2) { | |
925 | $count += ((defined $other_invlist[$i+1]) | |
926 | ? $other_invlist[$i+1] | |
927 | : 0x110000) | |
928 | - $other_invlist[$i]; | |
61dad979 | 929 | } |
c62fdeb7 KW |
930 | $count = 0x110000 - $count; |
931 | print $out_fh <<~"EOT"; | |
932 | ||
933 | /* The number of code points not matching \\pC */ | |
934 | #ifdef PERL_IN_REGCOMP_C | |
935 | # define NON_OTHER_COUNT $count | |
936 | #endif | |
937 | EOT | |
61dad979 | 938 | |
3bfc1e70 KW |
939 | # If this release has both the CWCM and CWCF properties, find the highest code |
940 | # point which changes under any case change. We can use this to short-circuit | |
941 | # code | |
942 | my @cwcm = prop_invlist('CWCM'); | |
943 | if (@cwcm) { | |
944 | my @cwcf = prop_invlist('CWCF'); | |
945 | if (@cwcf) { | |
946 | my $max = ($cwcm[-1] < $cwcf[-1]) | |
947 | ? $cwcf[-1] | |
948 | : $cwcm[-1]; | |
c62fdeb7 KW |
949 | $max = sprintf "0x%X", $max - 1; |
950 | print $out_fh <<~"EOS"; | |
951 | ||
952 | /* The highest code point that has any type of case change */ | |
953 | #ifdef PERL_IN_UTF8_C | |
954 | # define HIGHEST_CASE_CHANGING_CP $max | |
955 | #endif | |
956 | EOS | |
3bfc1e70 KW |
957 | } |
958 | } | |
959 | ||
6a5bc5ac | 960 | print $out_fh "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n"; |
d10c72f2 | 961 | |
61dad979 KW |
962 | read_only_bottom_close_and_rename($out_fh); |
963 | ||
9d8e3074 KW |
964 | # DATA FORMAT |
965 | # | |
69bc4c1f KW |
966 | # Note that any apidoc comments you want in the file need to be added to one |
967 | # of the prints above | |
968 | # | |
9d8e3074 KW |
969 | # A blank line is output as-is. |
970 | # Comments (lines whose first non-blank is a '#') are converted to C-style, | |
971 | # though empty comments are converted to blank lines. Otherwise, each line | |
972 | # represents one #define, and begins with either a Unicode character name with | |
973 | # the blanks and dashes in it squeezed out or replaced by underscores; or it | |
974 | # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter | |
975 | # case, the name will be looked-up to use as the name of the macro. In either | |
976 | # case, the macro name will have suffixes as listed above, and all blanks and | |
977 | # dashes will be replaced by underscores. | |
978 | # | |
979 | # Each line may optionally have one of the following flags on it, separated by | |
980 | # white space from the initial token. | |
981 | # string indicates that the output is to be of the string form | |
982 | # described in the comments above that are placed in the file. | |
983 | # string_skip_ifundef is the same as 'string', but instead of dying if the | |
984 | # code point doesn't exist, the line is just skipped: no output is | |
985 | # generated for it | |
986 | # first indicates that the output is to be of the FIRST_BYTE form. | |
987 | # tail indicates that the output is of the _TAIL form. | |
988 | # native indicates that the output is the code point, converted to the | |
989 | # platform's native character set if applicable | |
990 | # | |
991 | # If the code point has no official name, the desired name may be appended | |
992 | # after the flag, which will be ignored if there is an official name. | |
993 | # | |
994 | # This program is used to make it convenient to create compile time constants | |
995 | # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually | |
996 | # having to figure things out. | |
997 | ||
61dad979 | 998 | __DATA__ |
f2e06375 | 999 | U+017F string |
76837d21 | 1000 | |
1dfa4f52 | 1001 | U+0300 string |
2a614cdc | 1002 | U+0307 string |
a78bc3c6 | 1003 | |
8f57fa7d | 1004 | U+1E9E string_skip_if_undef |
f2e06375 | 1005 | |
a9f50d33 KW |
1006 | U+FB05 string |
1007 | U+FB06 string | |
a0ffb25e KW |
1008 | U+0130 string |
1009 | U+0131 string | |
a9f50d33 | 1010 | |
1dfa4f52 | 1011 | U+2010 string |
5f0aa340 KW |
1012 | BOM first |
1013 | BOM tail | |
525b6419 | 1014 | |
69bc4c1f KW |
1015 | BOM string |
1016 | ||
1017 | U+FFFD string | |
1018 | ||
566efd88 KW |
1019 | U+10FFFF string MAX_UNICODE |
1020 | ||
df758df2 KW |
1021 | NBSP native |
1022 | NBSP string | |
1023 | ||
05016631 | 1024 | DEL native |
c5eda08a KW |
1025 | CR native |
1026 | LF native | |
d804860b KW |
1027 | VT native |
1028 | ESC native | |
1dfa4f52 | 1029 | U+00DF native |
69ffc8e3 | 1030 | U+00DF string |
1dfa4f52 KW |
1031 | U+00E5 native |
1032 | U+00C5 native | |
1033 | U+00FF native | |
1034 | U+00B5 native | |
69ffc8e3 | 1035 | U+00B5 string |