Commit | Line | Data |
---|---|---|
423cee85 | 1 | package charnames; |
b177ca84 JF |
2 | use strict; |
3 | use warnings; | |
4 | use Carp; | |
51cf30b6 | 5 | use File::Spec; |
b8effcb5 | 6 | our $VERSION = '1.03'; |
b75c8c73 | 7 | |
d5448623 | 8 | use bytes (); # for $bytes::hint_bits |
9cfe5470 | 9 | $charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH |
423cee85 | 10 | |
52ea3e69 JH |
11 | my %alias1 = ( |
12 | # Icky 3.2 names with parentheses. | |
13 | 'LINE FEED' => 'LINE FEED (LF)', | |
14 | 'FORM FEED' => 'FORM FEED (FF)', | |
15 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', | |
16 | 'NEXT LINE' => 'NEXT LINE (NEL)', | |
17 | # Convenience. | |
18 | 'LF' => 'LINE FEED (LF)', | |
19 | 'FF' => 'FORM FEED (FF)', | |
eb380778 | 20 | 'CR' => 'CARRIAGE RETURN (CR)', |
51e9e896 | 21 | 'NEL' => 'NEXT LINE (NEL)', |
24b5d5cc JH |
22 | # More convenience. For futher convencience, |
23 | # it is suggested some way using using the NamesList | |
24 | # aliases is implemented. | |
25 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER', | |
26 | 'ZWJ' => 'ZERO WIDTH JOINER', | |
52ea3e69 JH |
27 | 'BOM' => 'BYTE ORDER MARK', |
28 | ); | |
29 | ||
30 | my %alias2 = ( | |
31 | # Pre-3.2 compatibility (only for the first 256 characters). | |
32 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', | |
33 | 'VERTICAL TABULATION' => 'LINE TABULATION', | |
34 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', | |
35 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', | |
36 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', | |
37 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', | |
38 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', | |
39 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', | |
40 | ); | |
41 | ||
35c0985d MB |
42 | my %alias3 = ( |
43 | # User defined aliasses. Even more convenient :) | |
44 | ); | |
423cee85 JH |
45 | my $txt; |
46 | ||
35c0985d MB |
47 | sub alias (@) |
48 | { | |
49 | @_ or return %alias3; | |
50 | my $alias = ref $_[0] ? $_[0] : { @_ }; | |
51 | @alias3{keys %$alias} = values %$alias; | |
52 | } # alias | |
53 | ||
54 | sub alias_file ($) | |
55 | { | |
51cf30b6 MB |
56 | my ($arg, $file) = @_; |
57 | if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { | |
58 | $file = $arg; | |
59 | } | |
60 | elsif ($arg =~ m/^\w+$/) { | |
61 | $file = "unicore/${arg}_alias.pl"; | |
62 | } | |
63 | else { | |
64 | croak "Charnames alias files can only have identifier characters"; | |
65 | } | |
35c0985d | 66 | if (my @alias = do $file) { |
51cf30b6 MB |
67 | @alias == 1 && !defined $alias[0] and |
68 | croak "$file cannot be used as alias file for charnames"; | |
69 | @alias % 2 and | |
70 | croak "$file did not return a (valid) list of alias pairs"; | |
35c0985d MB |
71 | alias (@alias); |
72 | return (1); | |
73 | } | |
74 | 0; | |
75 | } # alias_file | |
76 | ||
423cee85 | 77 | # This is not optimized in any way yet |
b177ca84 JF |
78 | sub charnames |
79 | { | |
80 | my $name = shift; | |
81 | ||
52ea3e69 | 82 | if (exists $alias1{$name}) { |
35c0985d | 83 | $name = $alias1{$name}; |
52ea3e69 | 84 | } |
35c0985d MB |
85 | elsif (exists $alias2{$name}) { |
86 | require warnings; | |
87 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); | |
88 | $name = $alias2{$name}; | |
89 | } | |
90 | elsif (exists $alias3{$name}) { | |
91 | $name = $alias3{$name}; | |
52ea3e69 | 92 | } |
b177ca84 | 93 | |
52ea3e69 | 94 | my $ord; |
423cee85 | 95 | my @off; |
52ea3e69 JH |
96 | my $fname; |
97 | ||
98 | if ($name eq "BYTE ORDER MARK") { | |
35c0985d MB |
99 | $fname = $name; |
100 | $ord = 0xFEFF; | |
52ea3e69 | 101 | } else { |
35c0985d MB |
102 | ## Suck in the code/name list as a big string. |
103 | ## Lines look like: | |
104 | ## "0052\t\tLATIN CAPITAL LETTER R\n" | |
105 | $txt = do "unicore/Name.pl" unless $txt; | |
106 | ||
107 | ## @off will hold the index into the code/name string of the start and | |
108 | ## end of the name as we find it. | |
109 | ||
a6d05634 | 110 | ## If :full, look for the name exactly |
35c0985d MB |
111 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { |
112 | @off = ($-[0], $+[0]); | |
113 | } | |
114 | ||
115 | ## If we didn't get above, and :short allowed, look for the short name. | |
116 | ## The short name is like "greek:Sigma" | |
117 | unless (@off) { | |
118 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { | |
119 | my ($script, $cname) = ($1, $2); | |
120 | my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; | |
121 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { | |
52ea3e69 | 122 | @off = ($-[0], $+[0]); |
35c0985d | 123 | } |
423cee85 | 124 | } |
35c0985d | 125 | } |
b177ca84 | 126 | |
35c0985d MB |
127 | ## If we still don't have it, check for the name among the loaded |
128 | ## scripts. | |
129 | if (not @off) { | |
130 | my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; | |
131 | for my $script (@{$^H{charnames_scripts}}) { | |
132 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { | |
133 | @off = ($-[0], $+[0]); | |
134 | last; | |
135 | } | |
52ea3e69 | 136 | } |
35c0985d MB |
137 | } |
138 | ||
139 | ## If we don't have it by now, give up. | |
140 | unless (@off) { | |
141 | carp "Unknown charname '$name'"; | |
142 | return "\x{FFFD}"; | |
143 | } | |
144 | ||
145 | ## | |
146 | ## Now know where in the string the name starts. | |
147 | ## The code, in hex, is before that. | |
148 | ## | |
149 | ## The code can be 4-6 characters long, so we've got to sort of | |
150 | ## go look for it, just after the newline that comes before $off[0]. | |
151 | ## | |
152 | ## This would be much easier if unicore/Name.pl had info in | |
153 | ## a name/code order, instead of code/name order. | |
154 | ## | |
155 | ## The +1 after the rindex() is to skip past the newline we're finding, | |
156 | ## or, if the rindex() fails, to put us to an offset of zero. | |
157 | ## | |
158 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; | |
159 | ||
160 | ## we know where it starts, so turn into number - | |
161 | ## the ordinal for the char. | |
162 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); | |
423cee85 | 163 | } |
b177ca84 | 164 | |
d5448623 | 165 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
8058d7ab | 166 | use bytes; |
d41ff1b8 | 167 | return chr $ord if $ord <= 255; |
f0175764 | 168 | my $hex = sprintf "%04x", $ord; |
52ea3e69 | 169 | if (not defined $fname) { |
35c0985d | 170 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; |
52ea3e69 | 171 | } |
f0175764 | 172 | croak "Character 0x$hex with name '$fname' is above 0xFF"; |
423cee85 | 173 | } |
f0175764 | 174 | |
52ea3e69 | 175 | no warnings 'utf8'; # allow even illegal characters |
bfa383d6 | 176 | return pack "U", $ord; |
35c0985d | 177 | } # charnames |
423cee85 | 178 | |
b177ca84 JF |
179 | sub import |
180 | { | |
181 | shift; ## ignore class name | |
182 | ||
35c0985d MB |
183 | if (not @_) { |
184 | carp("`use charnames' needs explicit imports list"); | |
b177ca84 | 185 | } |
d5448623 | 186 | $^H |= $charnames::hint_bits; |
423cee85 | 187 | $^H{charnames} = \&charnames ; |
b177ca84 JF |
188 | |
189 | ## | |
190 | ## fill %h keys with our @_ args. | |
191 | ## | |
35c0985d | 192 | my ($promote, %h, @args) = (0); |
e5c3f898 MG |
193 | while (my $arg = shift) { |
194 | if ($arg eq ":alias") { | |
51cf30b6 MB |
195 | @_ or |
196 | croak ":alias needs an argument in charnames"; | |
35c0985d MB |
197 | my $alias = shift; |
198 | if (ref $alias) { | |
199 | ref $alias eq "HASH" or | |
51cf30b6 | 200 | croak "Only HASH reference supported as argument to :alias"; |
35c0985d MB |
201 | alias ($alias); |
202 | next; | |
203 | } | |
51cf30b6 MB |
204 | if ($alias =~ m{:(\w+)$}) { |
205 | $1 eq "full" || $1 eq "short" and | |
206 | croak ":alias cannot use existing pragma :$1 (reversed order?)"; | |
207 | alias_file ($1) and $promote = 1; | |
208 | next; | |
35c0985d | 209 | } |
51cf30b6 MB |
210 | alias_file ($alias); |
211 | next; | |
212 | } | |
e5c3f898 MG |
213 | if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) { |
214 | warn "unsupported special '$arg' in charnames"; | |
51cf30b6 | 215 | next; |
35c0985d | 216 | } |
e5c3f898 | 217 | push @args, $arg; |
35c0985d MB |
218 | } |
219 | @args == 0 && $promote and @args = (":full"); | |
220 | @h{@args} = (1) x @args; | |
b177ca84 | 221 | |
423cee85 JH |
222 | $^H{charnames_full} = delete $h{':full'}; |
223 | $^H{charnames_short} = delete $h{':short'}; | |
224 | $^H{charnames_scripts} = [map uc, keys %h]; | |
b177ca84 JF |
225 | |
226 | ## | |
227 | ## If utf8? warnings are enabled, and some scripts were given, | |
228 | ## see if at least we can find one letter of each script. | |
229 | ## | |
35c0985d MB |
230 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) { |
231 | $txt = do "unicore/Name.pl" unless $txt; | |
232 | ||
233 | for my $script (@{$^H{charnames_scripts}}) { | |
234 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { | |
235 | warnings::warn('utf8', "No such script: '$script'"); | |
b177ca84 | 236 | } |
35c0985d | 237 | } |
bd62941a | 238 | } |
35c0985d | 239 | } # import |
423cee85 | 240 | |
b8effcb5 YST |
241 | # this comes actually from Unicode::UCD, but it avoids the |
242 | # overhead of loading it | |
243 | sub _getcode { | |
244 | my $arg = shift; | |
245 | ||
246 | if ($arg =~ /^[1-9]\d*$/) { | |
247 | return $arg; | |
248 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { | |
249 | return hex($1); | |
250 | } | |
251 | ||
252 | return; | |
253 | } | |
f0175764 | 254 | |
4e2cda5d JH |
255 | my %viacode; |
256 | ||
b177ca84 JF |
257 | sub viacode |
258 | { | |
35c0985d MB |
259 | if (@_ != 1) { |
260 | carp "charnames::viacode() expects one argument"; | |
261 | return () | |
262 | } | |
f0175764 | 263 | |
35c0985d | 264 | my $arg = shift; |
b8effcb5 | 265 | my $code = _getcode($arg); |
b177ca84 | 266 | |
35c0985d | 267 | my $hex; |
f0175764 | 268 | |
35c0985d MB |
269 | if (defined $code) { |
270 | $hex = sprintf "%04X", $arg; | |
271 | } else { | |
272 | carp("unexpected arg \"$arg\" to charnames::viacode()"); | |
273 | return; | |
274 | } | |
b177ca84 | 275 | |
35c0985d MB |
276 | if ($code > 0x10FFFF) { |
277 | carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex; | |
278 | return; | |
279 | } | |
f0175764 | 280 | |
35c0985d | 281 | return $viacode{$hex} if exists $viacode{$hex}; |
4e2cda5d | 282 | |
35c0985d | 283 | $txt = do "unicore/Name.pl" unless $txt; |
b177ca84 | 284 | |
35c0985d MB |
285 | if ($txt =~ m/^$hex\t\t(.+)/m) { |
286 | return $viacode{$hex} = $1; | |
287 | } else { | |
288 | return; | |
289 | } | |
290 | } # viacode | |
daf0d493 | 291 | |
4e2cda5d JH |
292 | my %vianame; |
293 | ||
daf0d493 JH |
294 | sub vianame |
295 | { | |
35c0985d MB |
296 | if (@_ != 1) { |
297 | carp "charnames::vianame() expects one name argument"; | |
298 | return () | |
299 | } | |
daf0d493 | 300 | |
35c0985d | 301 | my $arg = shift; |
daf0d493 | 302 | |
35c0985d | 303 | return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; |
dbc0d4f2 | 304 | |
35c0985d | 305 | return $vianame{$arg} if exists $vianame{$arg}; |
4e2cda5d | 306 | |
35c0985d | 307 | $txt = do "unicore/Name.pl" unless $txt; |
daf0d493 | 308 | |
35c0985d MB |
309 | my $pos = index $txt, "\t\t$arg\n"; |
310 | if ($[ <= $pos) { | |
311 | my $posLF = rindex $txt, "\n", $pos; | |
312 | (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; | |
313 | return $vianame{$arg} = hex $code; | |
314 | ||
315 | # If $pos is at the 1st line, $posLF must be $[ - 1 (not found); | |
316 | # then $posLF + 1 equals to $[ (at the beginning of $txt). | |
317 | # Otherwise $posLF is the position of "\n"; | |
318 | # then $posLF + 1 must be the position of the next to "\n" | |
319 | # (the beginning of the line). | |
320 | # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t", | |
321 | # "10300\t", "100000", etc. So we can get the code via removing TAB. | |
322 | } else { | |
323 | return; | |
324 | } | |
325 | } # vianame | |
b177ca84 | 326 | |
423cee85 JH |
327 | |
328 | 1; | |
329 | __END__ | |
330 | ||
331 | =head1 NAME | |
332 | ||
274085e3 | 333 | charnames - define character names for C<\N{named}> string literal escapes |
423cee85 JH |
334 | |
335 | =head1 SYNOPSIS | |
336 | ||
337 | use charnames ':full'; | |
4a2d328f | 338 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
423cee85 JH |
339 | |
340 | use charnames ':short'; | |
4a2d328f | 341 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
423cee85 JH |
342 | |
343 | use charnames qw(cyrillic greek); | |
4a2d328f | 344 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
423cee85 | 345 | |
35c0985d MB |
346 | use charnames ":full", ":alias" => { |
347 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", | |
76ae0c45 | 348 | }; |
35c0985d MB |
349 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; |
350 | ||
76ae0c45 | 351 | use charnames (); |
a23c04e4 JH |
352 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
353 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" | |
b177ca84 | 354 | |
423cee85 JH |
355 | =head1 DESCRIPTION |
356 | ||
35c0985d MB |
357 | Pragma C<use charnames> supports arguments C<:full>, C<:short>, script |
358 | names and customized aliases. If C<:full> is present, for expansion of | |
76ae0c45 RGS |
359 | C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of |
360 | standard Unicode character names. If C<:short> is present, and | |
423cee85 JH |
361 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
362 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used | |
a191c821 | 363 | with script name arguments, then for C<\N{CHARNAME}> the name |
423cee85 | 364 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
35c0985d | 365 | specified order). Customized aliases are explained in L</CUSTOM ALIASES>. |
423cee85 JH |
366 | |
367 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> | |
d5448623 | 368 | this pragma looks for the names |
423cee85 JH |
369 | |
370 | SCRIPTNAME CAPITAL LETTER CHARNAME | |
371 | SCRIPTNAME SMALL LETTER CHARNAME | |
372 | SCRIPTNAME LETTER CHARNAME | |
373 | ||
374 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, | |
daf0d493 JH |
375 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
376 | is ignored. | |
377 | ||
378 | Note that C<\N{...}> is compile-time, it's a special form of string | |
379 | constant used inside double-quoted strings: in other words, you cannot | |
4e2cda5d | 380 | use variables inside the C<\N{...}>. If you want similar run-time |
daf0d493 | 381 | functionality, use charnames::vianame(). |
423cee85 | 382 | |
301a3cda | 383 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
dbc0d4f2 JH |
384 | as of Unicode 3.1, there are no official Unicode names but you can use |
385 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In | |
386 | Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429 | |
387 | has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081, | |
388 | U+0084, and U+0099 do not have names even in ISO 6429. | |
389 | ||
390 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" | |
391 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}". | |
301a3cda | 392 | |
423cee85 JH |
393 | =head1 CUSTOM TRANSLATORS |
394 | ||
d5448623 | 395 | The mechanism of translation of C<\N{...}> escapes is general and not |
423cee85 | 396 | hardwired into F<charnames.pm>. A module can install custom |
d5448623 | 397 | translations (inside the scope which C<use>s the module) with the |
423cee85 JH |
398 | following magic incantation: |
399 | ||
d5448623 GS |
400 | use charnames (); # for $charnames::hint_bits |
401 | sub import { | |
402 | shift; | |
403 | $^H |= $charnames::hint_bits; | |
404 | $^H{charnames} = \&translator; | |
405 | } | |
423cee85 JH |
406 | |
407 | Here translator() is a subroutine which takes C<CHARNAME> as an | |
408 | argument, and returns text to insert into the string instead of the | |
4a2d328f | 409 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
d5448623 GS |
410 | in C<bytes> mode and out of it, the function should check the current |
411 | state of C<bytes>-flag as in: | |
412 | ||
413 | use bytes (); # for $bytes::hint_bits | |
414 | sub translator { | |
415 | if ($^H & $bytes::hint_bits) { | |
416 | return bytes_translator(@_); | |
417 | } | |
418 | else { | |
419 | return utf8_translator(@_); | |
420 | } | |
423cee85 | 421 | } |
423cee85 | 422 | |
35c0985d MB |
423 | =head1 CUSTOM ALIASES |
424 | ||
425 | This version of charnames supports three mechanisms of adding local | |
426 | or customized aliases to standard Unicode naming conventions (:full) | |
427 | ||
428 | =head2 Anonymous hashes | |
429 | ||
430 | use charnames ":full", ":alias" => { | |
431 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", | |
432 | }; | |
433 | my $str = "\N{e_ACUTE}"; | |
434 | ||
435 | =head2 Alias file | |
436 | ||
437 | use charnames ":full", ":alias" => "pro"; | |
438 | ||
439 | will try to read "unicore/pro_alias.pl" from the @INC path. This | |
440 | file should return a list in plain perl: | |
441 | ||
442 | ( | |
443 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", | |
444 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", | |
445 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", | |
446 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", | |
447 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", | |
448 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", | |
449 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", | |
450 | ); | |
451 | ||
452 | =head2 Alias shortcut | |
453 | ||
454 | use charnames ":alias" => ":pro"; | |
455 | ||
456 | works exactly the same as the alias pairs, only this time, | |
457 | ":full" is inserted automatically as first argument (if no | |
458 | other argument is given). | |
459 | ||
b177ca84 JF |
460 | =head1 charnames::viacode(code) |
461 | ||
462 | Returns the full name of the character indicated by the numeric code. | |
463 | The example | |
464 | ||
465 | print charnames::viacode(0x2722); | |
466 | ||
467 | prints "FOUR TEARDROP-SPOKED ASTERISK". | |
468 | ||
daf0d493 JH |
469 | Returns undef if no name is known for the code. |
470 | ||
35c0985d | 471 | This works only for the standard names, and does not yet apply |
daf0d493 JH |
472 | to custom translators. |
473 | ||
274085e3 PN |
474 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK |
475 | SPACE", not "BYTE ORDER MARK". | |
476 | ||
eb6a2339 | 477 | =head1 charnames::vianame(name) |
daf0d493 JH |
478 | |
479 | Returns the code point indicated by the name. | |
480 | The example | |
481 | ||
482 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); | |
483 | ||
484 | prints "2722". | |
485 | ||
eb6a2339 | 486 | Returns undef if the name is unknown. |
b177ca84 | 487 | |
35c0985d | 488 | This works only for the standard names, and does not yet apply |
b177ca84 JF |
489 | to custom translators. |
490 | ||
52ea3e69 JH |
491 | =head1 ALIASES |
492 | ||
493 | A few aliases have been defined for convenience: instead of having | |
494 | to use the official names | |
495 | ||
496 | LINE FEED (LF) | |
497 | FORM FEED (FF) | |
498 | CARRIAGE RETURN (CR) | |
499 | NEXT LINE (NEL) | |
500 | ||
501 | (yes, with parentheses) one can use | |
502 | ||
503 | LINE FEED | |
504 | FORM FEED | |
505 | CARRIAGE RETURN | |
506 | NEXT LINE | |
507 | LF | |
508 | FF | |
509 | CR | |
510 | NEL | |
511 | ||
512 | One can also use | |
513 | ||
514 | BYTE ORDER MARK | |
515 | BOM | |
516 | ||
24b5d5cc JH |
517 | and |
518 | ||
519 | ZWNJ | |
520 | ZWJ | |
521 | ||
522 | for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. | |
52ea3e69 JH |
523 | |
524 | For backward compatibility one can use the old names for | |
525 | certain C0 and C1 controls | |
526 | ||
527 | old new | |
528 | ||
529 | HORIZONTAL TABULATION CHARACTER TABULATION | |
530 | VERTICAL TABULATION LINE TABULATION | |
531 | FILE SEPARATOR INFORMATION SEPARATOR FOUR | |
532 | GROUP SEPARATOR INFORMATION SEPARATOR THREE | |
533 | RECORD SEPARATOR INFORMATION SEPARATOR TWO | |
534 | UNIT SEPARATOR INFORMATION SEPARATOR ONE | |
535 | PARTIAL LINE DOWN PARTIAL LINE FORWARD | |
536 | PARTIAL LINE UP PARTIAL LINE BACKWARD | |
537 | ||
538 | but the old names in addition to giving the character | |
539 | will also give a warning about being deprecated. | |
540 | ||
f0175764 JH |
541 | =head1 ILLEGAL CHARACTERS |
542 | ||
00d835f2 JH |
543 | If you ask by name for a character that does not exist, a warning is |
544 | given and the Unicode I<replacement character> "\x{FFFD}" is returned. | |
545 | ||
546 | If you ask by code for a character that does not exist, no warning is | |
547 | given and C<undef> is returned. (Though if you ask for a code point | |
548 | past U+10FFFF you do get a warning.) | |
f0175764 | 549 | |
423cee85 JH |
550 | =head1 BUGS |
551 | ||
552 | Since evaluation of the translation function happens in a middle of | |
553 | compilation (of a string literal), the translation function should not | |
554 | do any C<eval>s or C<require>s. This restriction should be lifted in | |
555 | a future version of Perl. | |
556 | ||
557 | =cut |