This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Réf. : Re: PATCH proposal for ext/Safe/safe2.t
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623
GS
7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85
JH
10my $txt;
11
12# This is not optimized in any way yet
b177ca84
JF
13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84
JF
21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84
JF
25
26 ## If :full, look for the the name exactly
423cee85
JH
27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84
JF
30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85
JH
35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84
JF
42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84
JF
56
57 ## If we don't have it by now, give up.
f0175764
JH
58 unless (@off) {
59 carp "Unknown charname '$name'";
60 return "\x{FFFD}";
61 }
b896c7a5 62
b177ca84
JF
63 ##
64 ## Now know where in the string the name starts.
65 ## The code, in hex, is befor that.
66 ##
67 ## The code can be 4-6 characters long, so we've got to sort of
68 ## go look for it, just after the newline that comes before $off[0].
69 ##
70 ## This would be much easier if unicore/Name.pl had info in
71 ## a name/code order, instead of code/name order.
72 ##
73 ## The +1 after the rindex() is to skip past the newline we're finding,
74 ## or, if the rindex() fails, to put us to an offset of zero.
75 ##
76 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
77
78 ## we know where it starts, so turn into number - the ordinal for the char.
79 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
80
d5448623 81 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 82 use bytes;
d41ff1b8 83 return chr $ord if $ord <= 255;
f0175764 84 my $hex = sprintf "%04x", $ord;
d41ff1b8 85 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
f0175764 86 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 87 }
f0175764 88
bfa383d6 89 return pack "U", $ord;
423cee85
JH
90}
91
b177ca84
JF
92sub import
93{
94 shift; ## ignore class name
95
96 if (not @_)
97 {
98 carp("`use charnames' needs explicit imports list");
99 }
d5448623 100 $^H |= $charnames::hint_bits;
423cee85 101 $^H{charnames} = \&charnames ;
b177ca84
JF
102
103 ##
104 ## fill %h keys with our @_ args.
105 ##
423cee85
JH
106 my %h;
107 @h{@_} = (1) x @_;
b177ca84 108
423cee85
JH
109 $^H{charnames_full} = delete $h{':full'};
110 $^H{charnames_short} = delete $h{':short'};
111 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84
JF
112
113 ##
114 ## If utf8? warnings are enabled, and some scripts were given,
115 ## see if at least we can find one letter of each script.
116 ##
117 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
118 {
119 $txt = do "unicore/Name.pl" unless $txt;
120
121 for my $script (@{$^H{charnames_scripts}})
122 {
123 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
124 warnings::warn('utf8', "No such script: '$script'");
125 }
126 }
bd62941a 127 }
423cee85
JH
128}
129
f0175764
JH
130require Unicode::UCD; # for Unicode::UCD::_getcode()
131
4e2cda5d
JH
132my %viacode;
133
b177ca84
JF
134sub viacode
135{
136 if (@_ != 1) {
daf0d493 137 carp "charnames::viacode() expects one numeric argument";
b177ca84
JF
138 return ()
139 }
f0175764 140
b177ca84 141 my $arg = shift;
f0175764 142 my $code = Unicode::UCD::_getcode($arg);
b177ca84
JF
143
144 my $hex;
f0175764
JH
145
146 if (defined $code) {
b177ca84
JF
147 $hex = sprintf "%04X", $arg;
148 } else {
149 carp("unexpected arg \"$arg\" to charnames::viacode()");
daf0d493 150 return;
b177ca84
JF
151 }
152
f0175764
JH
153 if ($code > 0x10FFFF) {
154 carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
155 return "\x{FFFD}";
156 }
157
4e2cda5d
JH
158 return $viacode{$hex} if exists $viacode{$hex};
159
b177ca84
JF
160 $txt = do "unicore/Name.pl" unless $txt;
161
162 if ($txt =~ m/^$hex\t\t(.+)/m) {
4e2cda5d 163 return $viacode{$hex} = $1;
b177ca84 164 } else {
daf0d493
JH
165 return;
166 }
167}
168
4e2cda5d
JH
169my %vianame;
170
daf0d493
JH
171sub vianame
172{
173 if (@_ != 1) {
174 carp "charnames::vianame() expects one name argument";
175 return ()
176 }
177
178 my $arg = shift;
179
4e2cda5d
JH
180 return $vianame{$arg} if exists $vianame{$arg};
181
daf0d493
JH
182 $txt = do "unicore/Name.pl" unless $txt;
183
184 if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
4e2cda5d 185 return $vianame{$arg} = hex $1;
daf0d493
JH
186 } else {
187 return;
b177ca84
JF
188 }
189}
190
423cee85
JH
191
1921;
193__END__
194
195=head1 NAME
196
b177ca84 197charnames - define character names for C<\N{named}> string literal escapes.
423cee85
JH
198
199=head1 SYNOPSIS
200
201 use charnames ':full';
4a2d328f 202 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
203
204 use charnames ':short';
4a2d328f 205 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
206
207 use charnames qw(cyrillic greek);
4a2d328f 208 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 209
b177ca84 210 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
daf0d493 211 printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 212
423cee85
JH
213=head1 DESCRIPTION
214
215Pragma C<use charnames> supports arguments C<:full>, C<:short> and
216script names. If C<:full> is present, for expansion of
4a2d328f 217C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85
JH
218standard Unicode names of chars. If C<:short> is present, and
219C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
220as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 221with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85
JH
222C<CHARNAME> is looked up as a letter in the given scripts (in the
223specified order).
224
225For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 226this pragma looks for the names
423cee85
JH
227
228 SCRIPTNAME CAPITAL LETTER CHARNAME
229 SCRIPTNAME SMALL LETTER CHARNAME
230 SCRIPTNAME LETTER CHARNAME
231
232in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
233then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
234is ignored.
235
236Note that C<\N{...}> is compile-time, it's a special form of string
237constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 238use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 239functionality, use charnames::vianame().
423cee85 240
301a3cda
JH
241For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
242as of Unicode 3.1, there are no official Unicode names but you can
243use instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).
244In Unicode 3.2 some naming changes will happen since ISO 6429 has been
245updated. Also note that the U+UU80, U+0081, U+0084, and U+0099
246do not have names even in ISO 6429.
247
423cee85
JH
248=head1 CUSTOM TRANSLATORS
249
d5448623 250The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 251hardwired into F<charnames.pm>. A module can install custom
d5448623 252translations (inside the scope which C<use>s the module) with the
423cee85
JH
253following magic incantation:
254
d5448623
GS
255 use charnames (); # for $charnames::hint_bits
256 sub import {
257 shift;
258 $^H |= $charnames::hint_bits;
259 $^H{charnames} = \&translator;
260 }
423cee85
JH
261
262Here translator() is a subroutine which takes C<CHARNAME> as an
263argument, and returns text to insert into the string instead of the
4a2d328f 264C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
265in C<bytes> mode and out of it, the function should check the current
266state of C<bytes>-flag as in:
267
268 use bytes (); # for $bytes::hint_bits
269 sub translator {
270 if ($^H & $bytes::hint_bits) {
271 return bytes_translator(@_);
272 }
273 else {
274 return utf8_translator(@_);
275 }
423cee85 276 }
423cee85 277
b177ca84
JF
278=head1 charnames::viacode(code)
279
280Returns the full name of the character indicated by the numeric code.
281The example
282
283 print charnames::viacode(0x2722);
284
285prints "FOUR TEARDROP-SPOKED ASTERISK".
286
daf0d493
JH
287Returns undef if no name is known for the code.
288
289This works only for the standard names, and does not yet aply
290to custom translators.
291
292=head1 charnames::vianame(code)
293
294Returns the code point indicated by the name.
295The example
296
297 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
298
299prints "2722".
300
301Returns undef if no name is known for the name.
b177ca84
JF
302
303This works only for the standard names, and does not yet aply
304to custom translators.
305
f0175764
JH
306=head1 ILLEGAL CHARACTERS
307
308If you ask for a character that does not exist, a warning is given
309and the special Unicode I<replacement character> "\x{FFFD}" is returned.
310
423cee85
JH
311=head1 BUGS
312
313Since evaluation of the translation function happens in a middle of
314compilation (of a string literal), the translation function should not
315do any C<eval>s or C<require>s. This restriction should be lifted in
316a future version of Perl.
317
318=cut