Commit | Line | Data |
---|---|---|
61dad979 KW |
1 | use v5.16.0; |
2 | use strict; | |
3 | use warnings; | |
c7b32e72 | 4 | no warnings 'experimental::regex_sets'; |
3d7c117d MB |
5 | require './regen/regen_lib.pl'; |
6 | require './regen/charset_translations.pl'; | |
c7b32e72 | 7 | use Unicode::UCD qw(prop_invlist prop_invmap); |
61dad979 | 8 | use charnames qw(:loose); |
dce1e563 KW |
# The optional delimiter listing written to STDERR contains the delimiter
# characters themselves, so that handle must emit UTF-8.
binmode STDERR, ":utf8";

# Debugging aid: set this to 1 temporarily to get on stderr the complete list
# of paired string delimiters this program generates.  That list is suitable
# for plugging into a pod.
my $output_lists = 0;

# Open the generated header.  open_new() is not defined in this file;
# presumably it comes from ./regen/regen_lib.pl.
my %header_info = (
    style => '*',
    by    => $0,
    from  => "Unicode data",
);
my $out_fh = open_new('unicode_constants.h', '>', \%header_info);

# Fixed preamble of the header: the include guard, a description of the macro
# naming scheme, and the apidoc entries for the documented macros.
print {$out_fh} <<"END";

#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
#define PERL_UNICODE_CONSTANTS_H_ 1

/* This file contains #defines for the version of Unicode being used and
* various Unicode code points. The values the code point macros expand to
* are the native Unicode code point, or all or portions of the UTF-8 encoding
* for the code point. In the former case, the macro name has the suffix
* "_NATIVE"; otherwise, the suffix "_UTF8".
*
* The macros that have the suffix "_UTF8" may have further suffixes, as
* follows:
* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
* representation; the value will be a numeric constant.
* "_TAIL" if instead it represents all but the first byte. This, and
* with no additional suffix are both string constants */

/*
=for apidoc_section \$unicode

=for apidoc AmnU|const char *|BOM_UTF8

This is a macro that evaluates to a string constant of the UTF-8 bytes that
define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
is compiled on. This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
bytes.

=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8

This is a macro that evaluates to a string constant of the UTF-8 bytes that
define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
is compiled on. This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
bytes.

=cut
*/

END
63 | ||
63cd44e4 KW |
sub backslash_x_form($$;$) {
    # Return a code point rendered as a run of "\xHH" escapes, expressed in
    # the character set named by the second argument.
    #
    # Arguments:
    #   1) the code point
    #   2) the name of the target character set
    #   3) optional; when true, the result is the single escape for the
    #      code point's entry in the table get_a2n() returns for the
    #      character set, and the code point must not exceed 255.  When
    #      absent or false, the result has one escape per byte of what
    #      cp_2_utfbytes() returns for the code point.

    my ($code_point, $charset_name, $want_single_byte) = @_;

    unless ($want_single_byte) {
        my $encoded = cp_2_utfbytes($code_point, $charset_name);
        return join "", map { sprintf "\\x%02X", ord($_) } split //, $encoded;
    }

    die "Must be utf8 if above 255" if $code_point > 255;
    return sprintf "\\x%02X", get_a2n($charset_name)->[$code_point];
}
83 | ||
dce1e563 KW |
84 | |
# Maps each directional word to its mirror image.
my %opposite_of = ( LEFT => 'RIGHT', RIGHT => 'LEFT' );

# Matches a stand-alone directional word.  Users rely on it capturing $1.
my $directional_re = qr/\b(LEFT|RIGHT)\b/;

sub format_pairs_line($$) {
    # Format a line describing a character, or a pair of characters, in
    # preparation for output, suitable for verbatim pod.
    #
    # Arguments:
    #   $from   code point of the first (or only) character
    #   $to     code point of its partner, or undef if there is none
    #
    # Returns the text, always ending in a newline.  A name too long for the
    # available width is continued on further lines, indented to the name
    # column.

    my ($from, $to) = @_;

    my $name    = charnames::viacode($from);
    my $lhs_hex = sprintf "%04X", $from;
    my $rhs_hex;

    my $hanging_indent = 26;    # Column at which the name text starts

    if (defined $to) {
        my $rhs_name = charnames::viacode($to);
        $rhs_hex = sprintf "%04X", $to;

        # When swapping LEFT and RIGHT in the first name yields the second
        # name, display them combined as LEFT/RIGHT: it takes less space and
        # is easier to understand.
        if (($name =~ s/$directional_re/$opposite_of{$1}/gr) eq $rhs_name) {
            $name =~ s,$directional_re,$1/$opposite_of{$1},g;
        }
        else {  # Otherwise, display them sequentially
            $name .= ", " . $rhs_name;
        }
    }

    # Handle double-width characters, based on the East Asian Width property.
    # Add an extra space to non-wide ones so things stay vertically aligned.
    my $extra = 0;
    my $output_line = " "   # Indent in case output being used for verbatim
                            # pod
                    . chr $from;
    if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
        $extra++;       # The length() will be shorter than the displayed
                        # width
    }
    else {
        $output_line .= " ";
    }
    if (defined $to) {
        $output_line .= " " . chr $to;
        if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
            $extra++;
        }
        else {
            $output_line .= " ";
        }
    }
    else {
        $output_line .= " ";
    }

    $output_line .= " U+$lhs_hex";
    $output_line .= ", U+$rhs_hex" if defined $to;

    # Pad out to the name column, counting wide characters as two columns
    my $cur_len = $extra + length $output_line;
    $output_line .= " " x ($hanging_indent - $cur_len);

    my $max_len = 74;   # Pod formatter will indent 4 spaces

    # NOTE(review): from here on the running width is the character count, so
    # $extra is no longer included -- confirm that is intended.
    $cur_len = length $output_line;

    if ($cur_len + length $name <= $max_len) {
        $output_line .= $name;  # It will fit
    }
    else {  # It won't fit.  Append one unbreakable segment at a time (split
            # at \b{lb} line-break opportunities); when the next one would
            # exceed the available width, start a new line first.  Doesn't
            # handle the case where a whole segment doesn't fit; this just
            # doesn't come up with the input data.
        while ($name =~ / ( .+? ) \b{lb} /xg) {
            my $segment = $1;
            my $added_length = length $segment;
            if ($cur_len + $added_length > $max_len) {
                $output_line =~ s/ +$//;    # No trailing blanks before wrap
                $output_line .= "\n" . " " x $hanging_indent;
                $cur_len = $hanging_indent;
            }

            $output_line .= $segment;
            $cur_len += $added_length;
        }
    }

    return $output_line . "\n";
}
175 | ||
4b4853d1 KW |
# Split the version string reported by Unicode::UCD into its components.  The
# third component is optional, and is treated as 0 when absent.
my $version = Unicode::UCD::UnicodeVersion();
my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
$dotdot //= 0;

# Emit one #define per component, followed by a blank line.
print {$out_fh} "#define UNICODE_MAJOR_VERSION $major\n",
                "#define UNICODE_DOT_VERSION $dot\n",
                "#define UNICODE_DOT_DOT_VERSION $dotdot\n",
                "\n";
186 | ||
c7b32e72 KW |
# Gather the characters in Unicode that have left/right symmetry suitable for
# paired string delimiters.  Each key is an opening code point; its value is
# the corresponding closing one.  Math symbols aren't normally used, but
# '<' and '>' are traditionally included.
my %paireds = ( ord('<') => ord('>') );

# Bidi_Mirroring_Glyph is the universe of all characters in Unicode which are
# of some import to the Bidirectional Algorithm, and for which there is
# another Unicode character that is a mirror of it.
my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
                                        prop_invmap("Bidi_Mirroring_Glyph");

# The current list of characters that Perl considers to be paired
# opening/closing delimiters is quite conservative: only those entries from
# the above property that other Unicode properties classify as opening ones
# are accepted.
foreach my $index (0 .. $#{$bmg_invlist}) {
    my $mirror = $bmg_invmap->[$index];
    next if $mirror eq $bmg_default;    # Doesn't map to a character.

    my $candidate = $bmg_invlist->[$index];
    my $char = chr $candidate;

    # Bidi_Paired_Bracket_Type=Open and General_Category=Open_Punctuation are
    # definitely opening.  General_Category=Initial_Punctuation is accepted
    # here too, even though whether its members are opening or closing is
    # language-dependent.
    if ($char =~ /(?[ \p{BPT=Open}
                    | \p{Gc=Open_Punctuation}
                    | \p{Gc=Initial_Punctuation}
                 ])/)
    {
        $paireds{$candidate} = $mirror;
    }

    # Because of that language dependence, either member of an
    # Initial_Punctuation pair is allowed to be at the front, so register the
    # reverse direction as well.
    if ($char =~ /\p{Gc=Initial_Punctuation}/) {
        $paireds{$mirror} = $candidate;
    }
}
226 | ||
227 | # There are several hundred other characters that clearly should be | |
228 | # mirrors of each other, like LEFTWARDS ARROW and RIGHTWARDS ARROW. Unicode | |
229 | # did not bother to classify them as mirrors mostly because they aren't of | |
230 | # import in the Bidirectional Algorithm. Most of them are symbols. These | |
231 | # are not considered opening/closing by Perl for now. | |
232 | ||
233 | # The rest of the data are at __DATA__ in this file. | |
61dad979 | 234 | |
ad88cddb KW |
# Read every definition that follows __DATA__ once, without line endings, so
# that the same list can be processed for each character set.
chomp(my @data = <DATA>);

# Emit one conditionally-compiled section per supported code page.  Each
# section holds the macros requested in the DATA lines, the paired-delimiter
# strings built from %paireds, and MAX_PRINT_A.
foreach my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # Table indexed by Unicode code point (0..255), giving the corresponding
    # code point in $charset
    my @a2n = @{get_a2n($charset)};

    for my $data_index (0 .. $#data) {
        my $line = $data[$data_index];

        # All of DATA was read before this loop started, so $. would give the
        # number of the final line for every entry.  Use the 1-based position
        # within the DATA section for diagnostics instead.
        my $line_number = $data_index + 1;

        # Convert any '#' comments to /* ... */; empty lines and empty
        # comments are output as blank lines
        if ($line =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
            my $comment_body = $1 // "";
            if ($comment_body ne "") {
                print $out_fh "/* $comment_body */\n";
            }
            else {
                print $out_fh "\n";
            }
            next;
        }

        unless ($line =~ m/ ^ ( [^\ ]* )           # Name or code point token
                           (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                           (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is
                                                # required
                         /x)
        {
            die "Unexpected syntax at line $line_number: $line\n";
        }

        my $name_or_cp = $1;
        my $flag = $2;
        my $desired_name = $3;

        my $name;
        my $cp;     # code point in the terms of $charset
        my $U_cp;   # code point in Unicode (not-native) terms

        if ($name_or_cp =~ /^U\+(.*)/) {
            $U_cp = hex $1;
            $name = charnames::viacode($name_or_cp);
            if (! defined $name) {

                # The flag is optional, so guard against it being absent
                next if defined $flag && $flag =~ /skip_if_undef/;
                die "Unknown code point '$name_or_cp' at line $line_number: $line\n"
                                                        unless $desired_name;
                $name = "";
            }
        }
        else {
            $name = $name_or_cp;
            $U_cp = charnames::vianame($name =~ s/_/ /gr);

            # It is the lookup result that tells us whether the name is
            # known; $name itself is always defined here.
            die "Unknown name '$name' at line $line_number: $line\n"
                                                        unless defined $U_cp;
        }

        $cp = ($U_cp < 256)
              ? $a2n[$U_cp]
              : $U_cp;

        $name = $desired_name if $name eq "" && $desired_name;
        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes

        my $str;
        my $suffix;
        if (defined $flag && $flag eq 'native') {
            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
            $suffix = '_NATIVE';
            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
        }
        else {
            $str = backslash_x_form($U_cp, $charset);

            $suffix = '_UTF8';
            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'tail') {
                $str =~ s/\\x..//;  # Remove the first byte
                $suffix .= '_TAIL';
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'first') {
                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the
                                                # 1st byte
                $suffix .= '_FIRST_BYTE';
                $str = "0x$str";        # Is a numeric constant
            }
            else {
                die "Unknown flag at line $line_number: $line\n";
            }
        }
        printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
    }

    # Now output the strings of opening/closing delimiters.  The Unicode
    # values were earlier entered into %paireds
    my $utf8_opening = "";
    my $utf8_closing = "";
    my $non_utf8_opening = "";
    my $non_utf8_closing = "";
    my $deprecated_if_not_mirrored = "";
    my $non_utf8_deprecated_if_not_mirrored = "";

    for my $from (sort { $a <=> $b } keys %paireds) {
        my $to = $paireds{$from};
        my $utf8_from_backslashed = backslash_x_form($from, $charset);
        my $utf8_to_backslashed   = backslash_x_form($to, $charset);
        my $non_utf8_from_backslashed;
        my $non_utf8_to_backslashed;

        $utf8_opening .= $utf8_from_backslashed;
        $utf8_closing .= $utf8_to_backslashed;

        if ($from < 256) {
            $non_utf8_from_backslashed =
                              backslash_x_form($from, $charset, 'not_utf8');
            $non_utf8_to_backslashed =
                              backslash_x_form($to, $charset, 'not_utf8');

            $non_utf8_opening .= $non_utf8_from_backslashed;
            $non_utf8_closing .= $non_utf8_to_backslashed;
        }

        # Only the ASCII range paired delimiters have traditionally been
        # accepted.  Until the feature is considered standard, the non-ASCII
        # opening ones must be deprecated when the feature isn't in effect, so
        # as to warn about behavior that is planned to change.
        if ($from > 127) {
            $deprecated_if_not_mirrored .= $utf8_from_backslashed;
            $non_utf8_deprecated_if_not_mirrored .=
                                    $non_utf8_from_backslashed if $from < 256;

            # We deprecate using any of these strongly directional characters
            # at either end of the string, in part so we could allow them to
            # be reversed.
            $deprecated_if_not_mirrored .= $utf8_to_backslashed
                               if index ($deprecated_if_not_mirrored,
                                         $utf8_to_backslashed) < 0;
        }

        # The implementing code in toke.c assumes that the byte length of each
        # opening delimiter is the same as its mirrored closing one.  This
        # makes sure of that by checking upon each iteration of the loop.
        if (length $utf8_opening != length $utf8_closing) {
            die "Byte length of representation of '"
              .  charnames::viacode($from)
              . "' differs from its mapping '"
              .  charnames::viacode($to)
              .  "'";
        }

        print STDERR format_pairs_line($from, $to) if $output_lists;
    }
    $output_lists = 0;  # Only output in first iteration

    print $out_fh <<~"EOT";

    # ifdef PERL_IN_TOKE_C
    /* Paired characters for quote-like operators, in UTF-8 */
    # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
    # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"

    /* And not in UTF-8 */
    # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
    # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"

    /* And what's deprecated */
    # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
    # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
    # endif
    EOT

    # Find the largest value in $charset among the code points corresponding
    # to U+0020 through U+007E
    my $max_PRINT_A = 0;
    for my $i (0x20 .. 0x7E) {
        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
    }
    $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
    print $out_fh <<~"EOT";

    # ifdef PERL_IN_REGCOMP_C
    # define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */
    # endif
    EOT

    print $out_fh get_conditional_compile_line_end();

}
419 | ||
b35552de KW |
# Count the code points that do not match \pC.  The inversion list holds
# alternating range boundaries: each even-indexed element starts a range that
# matches, and the element after it is the first code point past that range.
# A range with no terminating element runs to the end of the code space,
# 0x110000.  Start from the size of the whole code space and subtract the
# length of each matching range.
my @other_invlist = prop_invlist("Other");
my $count = 0x110000;
my @pending = @other_invlist;
while (@pending) {
    my ($range_start, $range_beyond) = splice @pending, 0, 2;
    $count -= ($range_beyond // 0x110000) - $range_start;
}

print {$out_fh} <<~"EOT";

/* The number of code points not matching \\pC */
#ifdef PERL_IN_REGCOMP_C
# define NON_OTHER_COUNT $count
#endif
EOT
61dad979 | 436 | |
3bfc1e70 KW |
# If this release has both the CWCM and CWCF properties, find the highest code
# point which changes under any case change.  Other code can use this to
# short-circuit.  CWCF is looked up only when CWCM yielded something.
my @cwcm = prop_invlist('CWCM');
if (@cwcm and my @cwcf = prop_invlist('CWCF')) {

    # Take the larger of the two lists' final elements.  Presumably that
    # element is the first code point past the last match, hence the - 1.
    my ($upper_bound) = sort { $b <=> $a } $cwcm[-1], $cwcf[-1];
    my $max = sprintf "0x%X", $upper_bound - 1;

    print {$out_fh} <<~"EOS";

    /* The highest code point that has any type of case change */
    #ifdef PERL_IN_UTF8_C
    # define HIGHEST_CASE_CHANGING_CP $max
    #endif
    EOS
}

# Close the include guard, then hand the file to the finishing routine.
print {$out_fh} "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";

read_only_bottom_close_and_rename($out_fh);
461 | ||
9d8e3074 KW |
462 | # DATA FORMAT |
463 | # | |
69bc4c1f KW |
464 | # Note that any apidoc comments you want in the file need to be added to one |
465 | # of the prints above | |
466 | # | |
9d8e3074 KW |
467 | # A blank line is output as-is. |
468 | # Comments (lines whose first non-blank is a '#') are converted to C-style, | |
469 | # though empty comments are converted to blank lines. Otherwise, each line | |
470 | # represents one #define, and begins with either a Unicode character name with | |
471 | # the blanks and dashes in it squeezed out or replaced by underscores; or it | |
472 | # may be a hexadecimal Unicode code point of the form U+xxxx. In the latter | |
473 | # case, the name will be looked-up to use as the name of the macro. In either | |
474 | # case, the macro name will have suffixes as listed above, and all blanks and | |
475 | # dashes will be replaced by underscores. | |
476 | # | |
477 | # Each line may optionally have one of the following flags on it, separated by | |
478 | # white space from the initial token. | |
479 | # string indicates that the output is to be of the string form | |
480 | # described in the comments above that are placed in the file. | |
481 | # string_skip_if_undef is the same as 'string', but instead of dying if the | |
482 | # code point doesn't exist, the line is just skipped: no output is | |
483 | # generated for it | |
484 | # first indicates that the output is to be of the FIRST_BYTE form. | |
485 | # tail indicates that the output is of the _TAIL form. | |
486 | # native indicates that the output is the code point, converted to the | |
487 | # platform's native character set if applicable | |
488 | # | |
489 | # If the code point has no official name, the desired name may be appended | |
490 | # after the flag, which will be ignored if there is an official name. | |
491 | # | |
492 | # This program is used to make it convenient to create compile time constants | |
493 | # of UTF-8, and to generate proper EBCDIC as well as ASCII without manually | |
494 | # having to figure things out. | |
495 | ||
61dad979 | 496 | __DATA__ |
f2e06375 | 497 | U+017F string |
76837d21 | 498 | |
1dfa4f52 | 499 | U+0300 string |
2a614cdc | 500 | U+0307 string |
a78bc3c6 | 501 | |
8f57fa7d | 502 | U+1E9E string_skip_if_undef |
f2e06375 | 503 | |
a9f50d33 KW |
504 | U+FB05 string |
505 | U+FB06 string | |
a0ffb25e KW |
506 | U+0130 string |
507 | U+0131 string | |
a9f50d33 | 508 | |
1dfa4f52 | 509 | U+2010 string |
5f0aa340 KW |
510 | BOM first |
511 | BOM tail | |
525b6419 | 512 | |
69bc4c1f KW |
513 | BOM string |
514 | ||
515 | U+FFFD string | |
516 | ||
566efd88 KW |
517 | U+10FFFF string MAX_UNICODE |
518 | ||
df758df2 KW |
519 | NBSP native |
520 | NBSP string | |
521 | ||
05016631 | 522 | DEL native |
c5eda08a KW |
523 | CR native |
524 | LF native | |
d804860b KW |
525 | VT native |
526 | ESC native | |
1dfa4f52 | 527 | U+00DF native |
69ffc8e3 | 528 | U+00DF string |
1dfa4f52 KW |
529 | U+00E5 native |
530 | U+00C5 native | |
531 | U+00FF native | |
532 | U+00B5 native | |
69ffc8e3 | 533 | U+00B5 string |