This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to MakeMaker 6.17.
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
51cf30b6 5use File::Spec;
35c0985d 6our $VERSION = '1.02';
b75c8c73 7
d5448623 8use bytes (); # for $bytes::hint_bits
9cfe5470 9$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
423cee85 10
52ea3e69
JH
# Built-in aliases that are always available and never warn.
my %alias1 = (
    # Unicode 3.2 names that carry a parenthesized abbreviation,
    # reachable without typing the parentheses ...
    'LINE FEED'       => "LINE FEED (LF)",
    'FORM FEED'       => "FORM FEED (FF)",
    'CARRIAGE RETURN' => "CARRIAGE RETURN (CR)",
    'NEXT LINE'       => "NEXT LINE (NEL)",

    # ... or by the abbreviation alone.
    LF                => "LINE FEED (LF)",
    FF                => "FORM FEED (FF)",
    CR                => "CARRIAGE RETURN (CR)",
    NEL               => "NEXT LINE (NEL)",

    # More convenience.  For further convenience, it is suggested that
    # some way of using the NamesList aliases is implemented.
    ZWNJ              => "ZERO WIDTH NON-JOINER",
    ZWJ               => "ZERO WIDTH JOINER",
    BOM               => "BYTE ORDER MARK",
);
29
# Names from before Unicode 3.2, kept for compatibility (first 256
# characters only).  charnames() still accepts them but issues a
# 'deprecated' warning that points at the new name.
my %alias2 = (
    'HORIZONTAL TABULATION' => "CHARACTER TABULATION",
    'VERTICAL TABULATION'   => "LINE TABULATION",
    'FILE SEPARATOR'        => "INFORMATION SEPARATOR FOUR",
    'GROUP SEPARATOR'       => "INFORMATION SEPARATOR THREE",
    'RECORD SEPARATOR'      => "INFORMATION SEPARATOR TWO",
    'UNIT SEPARATOR'        => "INFORMATION SEPARATOR ONE",
    'PARTIAL LINE DOWN'     => "PARTIAL LINE FORWARD",
    'PARTIAL LINE UP'       => "PARTIAL LINE BACKWARD",
);
41
35c0985d
MB
# User-defined aliases, filled in by alias().  Even more convenient :)
my %alias3;
423cee85
JH
45my $txt;
46
35c0985d
MB
# Register user-defined aliases, or report the ones registered so far.
#
# Called without arguments it returns the contents of %alias3.
# Otherwise the arguments are either one hash reference or a flat list
# of (alias => name) pairs; every pair is stored in %alias3, replacing
# an existing entry for the same alias.
sub alias (@)
{
    return %alias3 unless @_;

    my $pairs = ref $_[0] ? $_[0] : { @_ };
    @alias3{keys %$pairs} = values %$pairs;
} # alias
53
# Read alias pairs from a Perl file and register them with alias().
#
# $arg is either the absolute path of an existing plain file, or a word
# made of \w characters only, which selects "unicore/<word>_alias.pl".
# Any other argument is fatal.  The file is evaluated with do() and has
# to return a flat list of (alias => name) pairs.
#
# Returns 1 after registering the pairs and 0 when the file returned an
# empty list.  Croaks when do() yields a single undefined value or a
# list with an odd number of elements.
sub alias_file ($)
{
    my ($arg) = @_;
    my $file;

    if (-f $arg and File::Spec->file_name_is_absolute($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak "Charnames alias files can only have identifier characters";
    }

    my @pairs = do $file;
    return 0 unless @pairs;

    croak "$file cannot be used as alias file for charnames"
        if @pairs == 1 and not defined $pairs[0];
    croak "$file did not return a (valid) list of alias pairs"
        if @pairs % 2;

    alias(@pairs);
    return 1;
} # alias_file
76
# Translate the NAME of a \N{NAME} escape into the string to insert;
# import() installs this sub as $^H{charnames}.
#
# The name is first mapped through the alias tables: %alias1, then
# %alias2 (which also triggers a 'deprecated' warning), then the
# user-defined %alias3.  "BYTE ORDER MARK" is special-cased to U+FEFF.
# Every other name is searched for in unicore/Name.pl: as an exact name
# when :full is in effect, as "script:name" when :short is in effect,
# and finally as a letter of each script in $^H{charnames_scripts}.
#
# Returns the character.  An unknown name is carped about and yields
# "\x{FFFD}".  While "use bytes" is in effect the result is a single
# byte, and a code point above 0xFF is fatal.
#
# This is not optimized in any way yet.
sub charnames
{
    my $name = shift;

    if (exists $alias1{$name}) {
        $name = $alias1{$name};
    }
    elsif (exists $alias2{$name}) {
        require warnings;
        warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
        $name = $alias2{$name};
    }
    elsif (exists $alias3{$name}) {
        $name = $alias3{$name};
    }

    my ($ord, $fname, @span);

    if ($name eq "BYTE ORDER MARK") {
        ($fname, $ord) = ($name, 0xFEFF);
    }
    else {
        ## The code/name list is one big string whose lines look like
        ##   "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt = do "unicore/Name.pl" unless $txt;

        ## @span receives the offsets in $txt at which the matched
        ## "\t\tNAME" part starts and ends.

        ## :full - the name has to match exactly.
        if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
            @span = ($-[0], $+[0]);
        }

        ## :short - try a name of the form "greek:Sigma", unless the
        ## exact match already succeeded.
        if (!@span and $^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
            my ($script, $cname) = ($1, $2);
            my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            @span = ($-[0], $+[0])
                if $txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m;
        }

        ## Still nothing: try the name as a letter of each loaded script
        ## and keep the first hit.
        if (!@span) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                next unless $txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m;
                @span = ($-[0], $+[0]);
                last;
            }
        }

        ## No luck at all: give up.
        if (!@span) {
            carp "Unknown charname '$name'";
            return "\x{FFFD}";
        }

        ## The hex code (4-6 characters) sits between the start of the
        ## line and $span[0].  rindex() finds the newline that ends the
        ## previous line; adding 1 skips it, and also turns the -1 of a
        ## failed rindex() (match on the very first line) into offset 0.
        ##
        ## This would be much easier if unicore/Name.pl were in name/code
        ## order instead of code/name order.
        my $line_start = rindex($txt, "\n", $span[0]) + 1;
        $ord = hex substr($txt, $line_start, $span[0] - $line_start);
    }

    if ($^H & $bytes::hint_bits) {   # "use bytes" in effect?
        use bytes;
        if ($ord <= 255) {
            return chr $ord;
        }
        my $hex = sprintf "%04x", $ord;
        ## The name was matched with its two leading TABs; strip them.
        $fname = substr $txt, $span[0] + 2, $span[1] - $span[0] - 2
            unless defined $fname;
        croak "Character 0x$hex with name '$fname' is above 0xFF";
    }

    no warnings 'utf8'; # allow even illegal characters
    return pack "U", $ord;
} # charnames
423cee85 178
b177ca84
JF
# Handle "use charnames LIST".
#
# Sets the $charnames::hint_bits bits in $^H and installs charnames() as
# $^H{charnames}, then interprets LIST:
#
#   :full, :short   stored as $^H{charnames_full} / $^H{charnames_short}
#   :alias => ARG   ARG is a hash reference (passed to alias()), a
#                   ":name" shortcut or a file/name (both passed to
#                   alias_file()); a successfully loaded ":name"
#                   shortcut implies :full when nothing else is asked for
#   other ":xxx"    warned about and ignored
#   anything else   a script name; the upper-cased names are stored in
#                   $^H{charnames_scripts} in the order given, which is
#                   the order charnames() tries them in
#
# A false argument ends the list.  An empty LIST is carped about.
# Croaks on a missing or unsupported :alias argument.
sub import
{
    shift; ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H |= $charnames::hint_bits;
    $^H{charnames} = \&charnames ;

    ##
    ## fill %h keys with our @_ args.
    ##
    my ($promote, %h, @args) = (0);
    while (@_) {
        # A lexical, not $_: the caller's $_ must survive "use charnames",
        # and $_ may be aliased to something read-only.
        my $arg = shift;
        last unless $arg;   # a false argument ends the list

        if ($arg eq ":alias") {
            @_ or
                croak ":alias needs an argument in charnames";
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak "Only HASH reference supported as argument to :alias";
                alias ($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                $1 eq "full" || $1 eq "short" and
                    croak ":alias cannot use existing pragma :$1 (reversed order?)";
                alias_file ($1) and $promote = 1;
                next;
            }
            alias_file ($alias);
            next;
        }
        if ($arg =~ m/^:/ and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }
    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    $^H{charnames_full} = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};

    # Walk @args rather than keys %h so that the scripts keep the order
    # they were given in; delete() is true only for the first occurrence
    # of a name, so repeated names (and :full/:short, deleted above) are
    # dropped.
    $^H{charnames_scripts} = [map { uc } grep { delete $h{$_} } @args];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
} # import
423cee85 240
f0175764
JH
241require Unicode::UCD; # for Unicode::UCD::_getcode()
242
4e2cda5d
JH
243my %viacode;
244
b177ca84
JF
# Return the standard Unicode name of a code point.
#
# The single argument is whatever Unicode::UCD::_getcode() can turn
# into a number; an argument it rejects is carped about.  A code point
# above U+10FFFF is carped about as well.  In both cases, and when
# unicore/Name.pl has no entry for the code point, nothing is returned
# (undef in scalar context).  Successful lookups are cached in %viacode.
sub viacode
{
    if (@_ != 1) {
        carp "charnames::viacode() expects one argument";
        return ()
    }

    my $arg = shift;
    my $code = Unicode::UCD::_getcode($arg);

    if (not defined $code) {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    if ($code > 0x10FFFF) {
        # Report the numeric code point; applying %X to an already
        # formatted hex string would print a different number.
        carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $code;
        return;
    }

    # Build the lookup key from the parsed code point, not from the raw
    # argument, which may carry a "U+"/"0x" prefix or be written in hex.
    my $hex = sprintf "%04X", $code;

    return $viacode{$hex} if exists $viacode{$hex};

    $txt = do "unicore/Name.pl" unless $txt;

    if ($txt =~ m/^$hex\t\t(.+)/m) {
        return $viacode{$hex} = $1;
    } else {
        return;
    }
} # viacode
daf0d493 279
4e2cda5d
JH
280my %vianame;
281
daf0d493
JH
# Return the code point of the character with the given standard name.
#
# A name of the form "U+HHHH" is special: it yields the character itself
# (chr), not its number.  Any other name is looked up in unicore/Name.pl
# and the code point is returned as a number, or nothing (undef in
# scalar context) when the name is unknown.  Successful lookups are
# cached in %vianame.
sub vianame
{
    if (@_ != 1) {
        carp "charnames::vianame() expects one name argument";
        return ()
    }

    my $arg = shift;

    return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;

    return $vianame{$arg} if exists $vianame{$arg};

    $txt = do "unicore/Name.pl" unless $txt;

    # index() returns -1 when the name is not in the table.  Compare with
    # that directly instead of going through the deprecated $[ variable.
    my $pos = index $txt, "\t\t$arg\n";
    return if $pos < 0;

    # If the name is on the first line, rindex() fails with -1 and
    # $posLF + 1 is 0, the beginning of $txt.  Otherwise $posLF is the
    # position of the preceding "\n" and $posLF + 1 is the beginning of
    # the line.  substr($txt, $posLF + 1, 6) may be "0000\t\t",
    # "00A1\t\t", "10300\t", "100000", etc., so removing the TABs leaves
    # the code.
    my $posLF = rindex $txt, "\n", $pos;
    (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
    return $vianame{$arg} = hex $code;
} # vianame
b177ca84 314
423cee85
JH
315
3161;
317__END__
318
319=head1 NAME
320
274085e3 321charnames - define character names for C<\N{named}> string literal escapes
423cee85
JH
322
323=head1 SYNOPSIS
324
325 use charnames ':full';
4a2d328f 326 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
327
328 use charnames ':short';
4a2d328f 329 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
330
331 use charnames qw(cyrillic greek);
4a2d328f 332 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 333
35c0985d
MB
334 use charnames ":full", ":alias" => {
335 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
76ae0c45 336 };
35c0985d
MB
337 print "\N{e_ACUTE} is a small letter e with an acute.\n";
338
76ae0c45 339 use charnames ();
a23c04e4
JH
340 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
341 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 342
423cee85
JH
343=head1 DESCRIPTION
344
35c0985d
MB
345Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
346names and customized aliases. If C<:full> is present, for expansion of
76ae0c45
RGS
347C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
348standard Unicode character names. If C<:short> is present, and
423cee85
JH
349C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
350as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 351with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 352C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d 353specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85
JH
354
355For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 356this pragma looks for the names
423cee85
JH
357
358 SCRIPTNAME CAPITAL LETTER CHARNAME
359 SCRIPTNAME SMALL LETTER CHARNAME
360 SCRIPTNAME LETTER CHARNAME
361
362in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
363then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
364is ignored.
365
366Note that C<\N{...}> is compile-time, it's a special form of string
367constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 368use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 369functionality, use charnames::vianame().
423cee85 370
301a3cda 371For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2
JH
372as of Unicode 3.1, there are no official Unicode names but you can use
373instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
Unicode 3.2 (as of Perl 5.8) some naming changes take place: ISO 6429
has been updated, see L</ALIASES>.  Also note that U+0080, U+0081,
U+0084, and U+0099 do not have names even in ISO 6429.
377
378Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
379is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 380
423cee85
JH
381=head1 CUSTOM TRANSLATORS
382
d5448623 383The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 384hardwired into F<charnames.pm>. A module can install custom
d5448623 385translations (inside the scope which C<use>s the module) with the
423cee85
JH
386following magic incantation:
387
d5448623
GS
388 use charnames (); # for $charnames::hint_bits
389 sub import {
390 shift;
391 $^H |= $charnames::hint_bits;
392 $^H{charnames} = \&translator;
393 }
423cee85
JH
394
395Here translator() is a subroutine which takes C<CHARNAME> as an
396argument, and returns text to insert into the string instead of the
4a2d328f 397C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
398in C<bytes> mode and out of it, the function should check the current
399state of C<bytes>-flag as in:
400
401 use bytes (); # for $bytes::hint_bits
402 sub translator {
403 if ($^H & $bytes::hint_bits) {
404 return bytes_translator(@_);
405 }
406 else {
407 return utf8_translator(@_);
408 }
423cee85 409 }
423cee85 410
35c0985d
MB
411=head1 CUSTOM ALIASES
412
413This version of charnames supports three mechanisms of adding local
414or customized aliases to standard Unicode naming conventions (:full)
415
416=head2 Anonymous hashes
417
418 use charnames ":full", ":alias" => {
419 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
420 };
421 my $str = "\N{e_ACUTE}";
422
423=head2 Alias file
424
425 use charnames ":full", ":alias" => "pro";
426
427 will try to read "unicore/pro_alias.pl" from the @INC path. This
428 file should return a list in plain perl:
429
430 (
431 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
432 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
433 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
434 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
435 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
436 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
437 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
438 );
439
440=head2 Alias shortcut
441
442 use charnames ":alias" => ":pro";
443
444 works exactly the same as the alias pairs, only this time,
445 ":full" is inserted automatically as first argument (if no
446 other argument is given).
447
b177ca84
JF
448=head1 charnames::viacode(code)
449
450Returns the full name of the character indicated by the numeric code.
451The example
452
453 print charnames::viacode(0x2722);
454
455prints "FOUR TEARDROP-SPOKED ASTERISK".
456
daf0d493
JH
457Returns undef if no name is known for the code.
458
35c0985d 459This works only for the standard names, and does not yet apply
daf0d493
JH
460to custom translators.
461
274085e3
PN
Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
SPACE", not "BYTE ORDER MARK".
464
eb6a2339 465=head1 charnames::vianame(name)
daf0d493
JH
466
467Returns the code point indicated by the name.
468The example
469
470 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
471
472prints "2722".
473
eb6a2339 474Returns undef if the name is unknown.
b177ca84 475
35c0985d 476This works only for the standard names, and does not yet apply
b177ca84
JF
477to custom translators.
478
52ea3e69
JH
479=head1 ALIASES
480
481A few aliases have been defined for convenience: instead of having
482to use the official names
483
484 LINE FEED (LF)
485 FORM FEED (FF)
486 CARRIAGE RETURN (CR)
487 NEXT LINE (NEL)
488
489(yes, with parentheses) one can use
490
491 LINE FEED
492 FORM FEED
493 CARRIAGE RETURN
494 NEXT LINE
495 LF
496 FF
497 CR
498 NEL
499
500One can also use
501
502 BYTE ORDER MARK
503 BOM
504
24b5d5cc
JH
505and
506
507 ZWNJ
508 ZWJ
509
510for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69
JH
511
512For backward compatibility one can use the old names for
513certain C0 and C1 controls
514
515 old new
516
517 HORIZONTAL TABULATION CHARACTER TABULATION
518 VERTICAL TABULATION LINE TABULATION
519 FILE SEPARATOR INFORMATION SEPARATOR FOUR
520 GROUP SEPARATOR INFORMATION SEPARATOR THREE
521 RECORD SEPARATOR INFORMATION SEPARATOR TWO
522 UNIT SEPARATOR INFORMATION SEPARATOR ONE
523 PARTIAL LINE DOWN PARTIAL LINE FORWARD
524 PARTIAL LINE UP PARTIAL LINE BACKWARD
525
526but the old names in addition to giving the character
527will also give a warning about being deprecated.
528
f0175764
JH
529=head1 ILLEGAL CHARACTERS
530
00d835f2
JH
531If you ask by name for a character that does not exist, a warning is
532given and the Unicode I<replacement character> "\x{FFFD}" is returned.
533
534If you ask by code for a character that does not exist, no warning is
535given and C<undef> is returned. (Though if you ask for a code point
536past U+10FFFF you do get a warning.)
f0175764 537
423cee85
JH
538=head1 BUGS
539
Since evaluation of the translation function happens in the middle of
541compilation (of a string literal), the translation function should not
542do any C<eval>s or C<require>s. This restriction should be lifted in
543a future version of Perl.
544
545=cut