This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Proposed patch + test case.
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
51cf30b6 5use File::Spec;
b8effcb5 6our $VERSION = '1.03';
b75c8c73 7
d5448623 8use bytes (); # for $bytes::hint_bits
9cfe5470 9$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
423cee85 10
52ea3e69
JH
11my %alias1 = (
12 # Icky 3.2 names with parentheses.
13 'LINE FEED' => 'LINE FEED (LF)',
14 'FORM FEED' => 'FORM FEED (FF)',
15 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
16 'NEXT LINE' => 'NEXT LINE (NEL)',
17 # Convenience.
18 'LF' => 'LINE FEED (LF)',
19 'FF' => 'FORM FEED (FF)',
eb380778 20 'CR' => 'CARRIAGE RETURN (CR)',
51e9e896 21 'NEL' => 'NEXT LINE (NEL)',
24b5d5cc
JH
22 # More convenience. For futher convencience,
23 # it is suggested some way using using the NamesList
24 # aliases is implemented.
25 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
26 'ZWJ' => 'ZERO WIDTH JOINER',
52ea3e69
JH
27 'BOM' => 'BYTE ORDER MARK',
28 );
29
30my %alias2 = (
31 # Pre-3.2 compatibility (only for the first 256 characters).
32 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
33 'VERTICAL TABULATION' => 'LINE TABULATION',
34 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
35 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
36 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
37 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
38 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
39 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
40 );
41
35c0985d
MB
42my %alias3 = (
43 # User defined aliasses. Even more convenient :)
44 );
423cee85
JH
45my $txt;
46
35c0985d
MB
47sub alias (@)
48{
49 @_ or return %alias3;
50 my $alias = ref $_[0] ? $_[0] : { @_ };
51 @alias3{keys %$alias} = values %$alias;
52} # alias
53
54sub alias_file ($)
55{
51cf30b6
MB
56 my ($arg, $file) = @_;
57 if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
58 $file = $arg;
59 }
60 elsif ($arg =~ m/^\w+$/) {
61 $file = "unicore/${arg}_alias.pl";
62 }
63 else {
64 croak "Charnames alias files can only have identifier characters";
65 }
35c0985d 66 if (my @alias = do $file) {
51cf30b6
MB
67 @alias == 1 && !defined $alias[0] and
68 croak "$file cannot be used as alias file for charnames";
69 @alias % 2 and
70 croak "$file did not return a (valid) list of alias pairs";
35c0985d
MB
71 alias (@alias);
72 return (1);
73 }
74 0;
75} # alias_file
76
423cee85 77# This is not optimized in any way yet
b177ca84
JF
78sub charnames
79{
80 my $name = shift;
81
52ea3e69 82 if (exists $alias1{$name}) {
35c0985d 83 $name = $alias1{$name};
52ea3e69 84 }
35c0985d
MB
85 elsif (exists $alias2{$name}) {
86 require warnings;
87 warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
88 $name = $alias2{$name};
89 }
90 elsif (exists $alias3{$name}) {
91 $name = $alias3{$name};
52ea3e69 92 }
b177ca84 93
52ea3e69 94 my $ord;
423cee85 95 my @off;
52ea3e69
JH
96 my $fname;
97
98 if ($name eq "BYTE ORDER MARK") {
35c0985d
MB
99 $fname = $name;
100 $ord = 0xFEFF;
52ea3e69 101 } else {
35c0985d
MB
102 ## Suck in the code/name list as a big string.
103 ## Lines look like:
104 ## "0052\t\tLATIN CAPITAL LETTER R\n"
105 $txt = do "unicore/Name.pl" unless $txt;
106
107 ## @off will hold the index into the code/name string of the start and
108 ## end of the name as we find it.
109
a6d05634 110 ## If :full, look for the name exactly
35c0985d
MB
111 if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
112 @off = ($-[0], $+[0]);
113 }
114
115 ## If we didn't get above, and :short allowed, look for the short name.
116 ## The short name is like "greek:Sigma"
117 unless (@off) {
118 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
119 my ($script, $cname) = ($1, $2);
120 my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
121 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
52ea3e69 122 @off = ($-[0], $+[0]);
35c0985d 123 }
423cee85 124 }
35c0985d 125 }
b177ca84 126
35c0985d
MB
127 ## If we still don't have it, check for the name among the loaded
128 ## scripts.
129 if (not @off) {
130 my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
131 for my $script (@{$^H{charnames_scripts}}) {
132 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
133 @off = ($-[0], $+[0]);
134 last;
135 }
52ea3e69 136 }
35c0985d
MB
137 }
138
139 ## If we don't have it by now, give up.
140 unless (@off) {
141 carp "Unknown charname '$name'";
142 return "\x{FFFD}";
143 }
144
145 ##
146 ## Now know where in the string the name starts.
147 ## The code, in hex, is before that.
148 ##
149 ## The code can be 4-6 characters long, so we've got to sort of
150 ## go look for it, just after the newline that comes before $off[0].
151 ##
152 ## This would be much easier if unicore/Name.pl had info in
153 ## a name/code order, instead of code/name order.
154 ##
155 ## The +1 after the rindex() is to skip past the newline we're finding,
156 ## or, if the rindex() fails, to put us to an offset of zero.
157 ##
158 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
159
160 ## we know where it starts, so turn into number -
161 ## the ordinal for the char.
162 $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
423cee85 163 }
b177ca84 164
d5448623 165 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 166 use bytes;
d41ff1b8 167 return chr $ord if $ord <= 255;
f0175764 168 my $hex = sprintf "%04x", $ord;
52ea3e69 169 if (not defined $fname) {
35c0985d 170 $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
52ea3e69 171 }
f0175764 172 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 173 }
f0175764 174
52ea3e69 175 no warnings 'utf8'; # allow even illegal characters
bfa383d6 176 return pack "U", $ord;
35c0985d 177} # charnames
423cee85 178
b177ca84
JF
179sub import
180{
181 shift; ## ignore class name
182
35c0985d
MB
183 if (not @_) {
184 carp("`use charnames' needs explicit imports list");
b177ca84 185 }
d5448623 186 $^H |= $charnames::hint_bits;
423cee85 187 $^H{charnames} = \&charnames ;
b177ca84
JF
188
189 ##
190 ## fill %h keys with our @_ args.
191 ##
35c0985d 192 my ($promote, %h, @args) = (0);
e5c3f898
MG
193 while (my $arg = shift) {
194 if ($arg eq ":alias") {
51cf30b6
MB
195 @_ or
196 croak ":alias needs an argument in charnames";
35c0985d
MB
197 my $alias = shift;
198 if (ref $alias) {
199 ref $alias eq "HASH" or
51cf30b6 200 croak "Only HASH reference supported as argument to :alias";
35c0985d
MB
201 alias ($alias);
202 next;
203 }
51cf30b6
MB
204 if ($alias =~ m{:(\w+)$}) {
205 $1 eq "full" || $1 eq "short" and
206 croak ":alias cannot use existing pragma :$1 (reversed order?)";
207 alias_file ($1) and $promote = 1;
208 next;
35c0985d 209 }
51cf30b6
MB
210 alias_file ($alias);
211 next;
212 }
e5c3f898
MG
213 if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
214 warn "unsupported special '$arg' in charnames";
51cf30b6 215 next;
35c0985d 216 }
e5c3f898 217 push @args, $arg;
35c0985d
MB
218 }
219 @args == 0 && $promote and @args = (":full");
220 @h{@args} = (1) x @args;
b177ca84 221
423cee85
JH
222 $^H{charnames_full} = delete $h{':full'};
223 $^H{charnames_short} = delete $h{':short'};
224 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84
JF
225
226 ##
227 ## If utf8? warnings are enabled, and some scripts were given,
228 ## see if at least we can find one letter of each script.
229 ##
35c0985d
MB
230 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
231 $txt = do "unicore/Name.pl" unless $txt;
232
233 for my $script (@{$^H{charnames_scripts}}) {
234 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
235 warnings::warn('utf8', "No such script: '$script'");
b177ca84 236 }
35c0985d 237 }
bd62941a 238 }
35c0985d 239} # import
423cee85 240
b8effcb5
YST
241# this comes actually from Unicode::UCD, but it avoids the
242# overhead of loading it
243sub _getcode {
244 my $arg = shift;
245
246 if ($arg =~ /^[1-9]\d*$/) {
247 return $arg;
248 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
249 return hex($1);
250 }
251
252 return;
253}
f0175764 254
4e2cda5d
JH
255my %viacode;
256
b177ca84
JF
257sub viacode
258{
35c0985d
MB
259 if (@_ != 1) {
260 carp "charnames::viacode() expects one argument";
261 return ()
262 }
f0175764 263
35c0985d 264 my $arg = shift;
b8effcb5 265 my $code = _getcode($arg);
b177ca84 266
35c0985d 267 my $hex;
f0175764 268
35c0985d
MB
269 if (defined $code) {
270 $hex = sprintf "%04X", $arg;
271 } else {
272 carp("unexpected arg \"$arg\" to charnames::viacode()");
273 return;
274 }
b177ca84 275
35c0985d
MB
276 if ($code > 0x10FFFF) {
277 carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
278 return;
279 }
f0175764 280
35c0985d 281 return $viacode{$hex} if exists $viacode{$hex};
4e2cda5d 282
35c0985d 283 $txt = do "unicore/Name.pl" unless $txt;
b177ca84 284
35c0985d
MB
285 if ($txt =~ m/^$hex\t\t(.+)/m) {
286 return $viacode{$hex} = $1;
287 } else {
288 return;
289 }
290} # viacode
daf0d493 291
4e2cda5d
JH
292my %vianame;
293
daf0d493
JH
294sub vianame
295{
35c0985d
MB
296 if (@_ != 1) {
297 carp "charnames::vianame() expects one name argument";
298 return ()
299 }
daf0d493 300
35c0985d 301 my $arg = shift;
daf0d493 302
35c0985d 303 return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
dbc0d4f2 304
35c0985d 305 return $vianame{$arg} if exists $vianame{$arg};
4e2cda5d 306
35c0985d 307 $txt = do "unicore/Name.pl" unless $txt;
daf0d493 308
35c0985d
MB
309 my $pos = index $txt, "\t\t$arg\n";
310 if ($[ <= $pos) {
311 my $posLF = rindex $txt, "\n", $pos;
312 (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
313 return $vianame{$arg} = hex $code;
314
315 # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
316 # then $posLF + 1 equals to $[ (at the beginning of $txt).
317 # Otherwise $posLF is the position of "\n";
318 # then $posLF + 1 must be the position of the next to "\n"
319 # (the beginning of the line).
320 # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
321 # "10300\t", "100000", etc. So we can get the code via removing TAB.
322 } else {
323 return;
324 }
325} # vianame
b177ca84 326
423cee85
JH
327
3281;
329__END__
330
331=head1 NAME
332
274085e3 333charnames - define character names for C<\N{named}> string literal escapes
423cee85
JH
334
335=head1 SYNOPSIS
336
337 use charnames ':full';
4a2d328f 338 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
339
340 use charnames ':short';
4a2d328f 341 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
342
343 use charnames qw(cyrillic greek);
4a2d328f 344 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 345
35c0985d
MB
346 use charnames ":full", ":alias" => {
347 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
76ae0c45 348 };
35c0985d
MB
349 print "\N{e_ACUTE} is a small letter e with an acute.\n";
350
76ae0c45 351 use charnames ();
a23c04e4
JH
352 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
353 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 354
423cee85
JH
355=head1 DESCRIPTION
356
35c0985d
MB
357Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
358names and customized aliases. If C<:full> is present, for expansion of
76ae0c45
RGS
359C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
360standard Unicode character names. If C<:short> is present, and
423cee85
JH
361C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
362as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
a191c821 363with script name arguments, then for C<\N{CHARNAME}> the name
423cee85 364C<CHARNAME> is looked up as a letter in the given scripts (in the
35c0985d 365specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
423cee85
JH
366
367For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 368this pragma looks for the names
423cee85
JH
369
370 SCRIPTNAME CAPITAL LETTER CHARNAME
371 SCRIPTNAME SMALL LETTER CHARNAME
372 SCRIPTNAME LETTER CHARNAME
373
374in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
375then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
376is ignored.
377
378Note that C<\N{...}> is compile-time, it's a special form of string
379constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 380use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 381functionality, use charnames::vianame().
423cee85 382
301a3cda 383For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
dbc0d4f2
JH
384as of Unicode 3.1, there are no official Unicode names but you can use
385instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
386Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
387has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
388U+0084, and U+0099 do not have names even in ISO 6429.
389
390Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
391is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
301a3cda 392
423cee85
JH
393=head1 CUSTOM TRANSLATORS
394
d5448623 395The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 396hardwired into F<charnames.pm>. A module can install custom
d5448623 397translations (inside the scope which C<use>s the module) with the
423cee85
JH
398following magic incantation:
399
d5448623
GS
400 use charnames (); # for $charnames::hint_bits
401 sub import {
402 shift;
403 $^H |= $charnames::hint_bits;
404 $^H{charnames} = \&translator;
405 }
423cee85
JH
406
407Here translator() is a subroutine which takes C<CHARNAME> as an
408argument, and returns text to insert into the string instead of the
4a2d328f 409C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
410in C<bytes> mode and out of it, the function should check the current
411state of C<bytes>-flag as in:
412
413 use bytes (); # for $bytes::hint_bits
414 sub translator {
415 if ($^H & $bytes::hint_bits) {
416 return bytes_translator(@_);
417 }
418 else {
419 return utf8_translator(@_);
420 }
423cee85 421 }
423cee85 422
35c0985d
MB
423=head1 CUSTOM ALIASES
424
425This version of charnames supports three mechanisms of adding local
426or customized aliases to standard Unicode naming conventions (:full)
427
428=head2 Anonymous hashes
429
430 use charnames ":full", ":alias" => {
431 e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
432 };
433 my $str = "\N{e_ACUTE}";
434
435=head2 Alias file
436
437 use charnames ":full", ":alias" => "pro";
438
439 will try to read "unicore/pro_alias.pl" from the @INC path. This
440 file should return a list in plain perl:
441
442 (
443 A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
444 A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
445 A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
446 A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
447 A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
448 A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
449 A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
450 );
451
452=head2 Alias shortcut
453
454 use charnames ":alias" => ":pro";
455
456 works exactly the same as the alias pairs, only this time,
457 ":full" is inserted automatically as first argument (if no
458 other argument is given).
459
b177ca84
JF
460=head1 charnames::viacode(code)
461
462Returns the full name of the character indicated by the numeric code.
463The example
464
465 print charnames::viacode(0x2722);
466
467prints "FOUR TEARDROP-SPOKED ASTERISK".
468
daf0d493
JH
469Returns undef if no name is known for the code.
470
35c0985d 471This works only for the standard names, and does not yet apply
daf0d493
JH
472to custom translators.
473
274085e3
PN
474Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
475SPACE", not "BYTE ORDER MARK".
476
eb6a2339 477=head1 charnames::vianame(name)
daf0d493
JH
478
479Returns the code point indicated by the name.
480The example
481
482 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
483
484prints "2722".
485
eb6a2339 486Returns undef if the name is unknown.
b177ca84 487
35c0985d 488This works only for the standard names, and does not yet apply
b177ca84
JF
489to custom translators.
490
52ea3e69
JH
491=head1 ALIASES
492
493A few aliases have been defined for convenience: instead of having
494to use the official names
495
496 LINE FEED (LF)
497 FORM FEED (FF)
498 CARRIAGE RETURN (CR)
499 NEXT LINE (NEL)
500
501(yes, with parentheses) one can use
502
503 LINE FEED
504 FORM FEED
505 CARRIAGE RETURN
506 NEXT LINE
507 LF
508 FF
509 CR
510 NEL
511
512One can also use
513
514 BYTE ORDER MARK
515 BOM
516
24b5d5cc
JH
517and
518
519 ZWNJ
520 ZWJ
521
522for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
52ea3e69
JH
523
524For backward compatibility one can use the old names for
525certain C0 and C1 controls
526
527 old new
528
529 HORIZONTAL TABULATION CHARACTER TABULATION
530 VERTICAL TABULATION LINE TABULATION
531 FILE SEPARATOR INFORMATION SEPARATOR FOUR
532 GROUP SEPARATOR INFORMATION SEPARATOR THREE
533 RECORD SEPARATOR INFORMATION SEPARATOR TWO
534 UNIT SEPARATOR INFORMATION SEPARATOR ONE
535 PARTIAL LINE DOWN PARTIAL LINE FORWARD
536 PARTIAL LINE UP PARTIAL LINE BACKWARD
537
538but the old names in addition to giving the character
539will also give a warning about being deprecated.
540
f0175764
JH
541=head1 ILLEGAL CHARACTERS
542
00d835f2
JH
543If you ask by name for a character that does not exist, a warning is
544given and the Unicode I<replacement character> "\x{FFFD}" is returned.
545
546If you ask by code for a character that does not exist, no warning is
547given and C<undef> is returned. (Though if you ask for a code point
548past U+10FFFF you do get a warning.)
f0175764 549
423cee85
JH
550=head1 BUGS
551
552Since evaluation of the translation function happens in a middle of
553compilation (of a string literal), the translation function should not
554do any C<eval>s or C<require>s. This restriction should be lifted in
555a future version of Perl.
556
557=cut