This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Win32 snag - File::Find File::Spec and Config.pm
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623
GS
7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85
JH
10my $txt;
11
12# This is not optimized in any way yet
b177ca84
JF
13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84
JF
21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84
JF
25
26 ## If :full, look for the the name exactly
423cee85
JH
27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84
JF
30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85
JH
35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84
JF
42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84
JF
56
57 ## If we don't have it by now, give up.
f0175764
JH
58 unless (@off) {
59 carp "Unknown charname '$name'";
60 return "\x{FFFD}";
61 }
b896c7a5 62
b177ca84
JF
63 ##
64 ## Now know where in the string the name starts.
65 ## The code, in hex, is befor that.
66 ##
67 ## The code can be 4-6 characters long, so we've got to sort of
68 ## go look for it, just after the newline that comes before $off[0].
69 ##
70 ## This would be much easier if unicore/Name.pl had info in
71 ## a name/code order, instead of code/name order.
72 ##
73 ## The +1 after the rindex() is to skip past the newline we're finding,
74 ## or, if the rindex() fails, to put us to an offset of zero.
75 ##
76 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
77
78 ## we know where it starts, so turn into number - the ordinal for the char.
79 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
80
d5448623 81 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 82 use bytes;
d41ff1b8 83 return chr $ord if $ord <= 255;
f0175764 84 my $hex = sprintf "%04x", $ord;
d41ff1b8 85 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
f0175764 86 croak "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 87 }
f0175764 88
bfa383d6 89 return pack "U", $ord;
423cee85
JH
90}
91
b177ca84
JF
92sub import
93{
94 shift; ## ignore class name
95
96 if (not @_)
97 {
98 carp("`use charnames' needs explicit imports list");
99 }
d5448623 100 $^H |= $charnames::hint_bits;
423cee85 101 $^H{charnames} = \&charnames ;
b177ca84
JF
102
103 ##
104 ## fill %h keys with our @_ args.
105 ##
423cee85
JH
106 my %h;
107 @h{@_} = (1) x @_;
b177ca84 108
423cee85
JH
109 $^H{charnames_full} = delete $h{':full'};
110 $^H{charnames_short} = delete $h{':short'};
111 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84
JF
112
113 ##
114 ## If utf8? warnings are enabled, and some scripts were given,
115 ## see if at least we can find one letter of each script.
116 ##
117 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
118 {
119 $txt = do "unicore/Name.pl" unless $txt;
120
121 for my $script (@{$^H{charnames_scripts}})
122 {
123 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
124 warnings::warn('utf8', "No such script: '$script'");
125 }
126 }
bd62941a 127 }
423cee85
JH
128}
129
f0175764
JH
130require Unicode::UCD; # for Unicode::UCD::_getcode()
131
4e2cda5d
JH
132my %viacode;
133
b177ca84
JF
134sub viacode
135{
136 if (@_ != 1) {
daf0d493 137 carp "charnames::viacode() expects one numeric argument";
b177ca84
JF
138 return ()
139 }
f0175764 140
b177ca84 141 my $arg = shift;
f0175764 142 my $code = Unicode::UCD::_getcode($arg);
b177ca84
JF
143
144 my $hex;
f0175764
JH
145
146 if (defined $code) {
b177ca84
JF
147 $hex = sprintf "%04X", $arg;
148 } else {
149 carp("unexpected arg \"$arg\" to charnames::viacode()");
daf0d493 150 return;
b177ca84
JF
151 }
152
f0175764
JH
153 if ($code > 0x10FFFF) {
154 carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
155 return "\x{FFFD}";
156 }
157
4e2cda5d
JH
158 return $viacode{$hex} if exists $viacode{$hex};
159
b177ca84
JF
160 $txt = do "unicore/Name.pl" unless $txt;
161
162 if ($txt =~ m/^$hex\t\t(.+)/m) {
4e2cda5d 163 return $viacode{$hex} = $1;
b177ca84 164 } else {
daf0d493
JH
165 return;
166 }
167}
168
4e2cda5d
JH
169my %vianame;
170
daf0d493
JH
171sub vianame
172{
173 if (@_ != 1) {
174 carp "charnames::vianame() expects one name argument";
175 return ()
176 }
177
178 my $arg = shift;
179
4e2cda5d
JH
180 return $vianame{$arg} if exists $vianame{$arg};
181
daf0d493
JH
182 $txt = do "unicore/Name.pl" unless $txt;
183
184 if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
4e2cda5d 185 return $vianame{$arg} = hex $1;
daf0d493
JH
186 } else {
187 return;
b177ca84
JF
188 }
189}
190
423cee85
JH
191
1921;
193__END__
194
195=head1 NAME
196
b177ca84 197charnames - define character names for C<\N{named}> string literal escapes.
423cee85
JH
198
199=head1 SYNOPSIS
200
201 use charnames ':full';
4a2d328f 202 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
203
204 use charnames ':short';
4a2d328f 205 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
206
207 use charnames qw(cyrillic greek);
4a2d328f 208 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 209
b177ca84 210 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
daf0d493 211 printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 212
423cee85
JH
213=head1 DESCRIPTION
214
215Pragma C<use charnames> supports arguments C<:full>, C<:short> and
216script names. If C<:full> is present, for expansion of
4a2d328f 217C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85
JH
218standard Unicode names of chars. If C<:short> is present, and
219C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
220as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 221with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85
JH
222C<CHARNAME> is looked up as a letter in the given scripts (in the
223specified order).
224
225For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 226this pragma looks for the names
423cee85
JH
227
228 SCRIPTNAME CAPITAL LETTER CHARNAME
229 SCRIPTNAME SMALL LETTER CHARNAME
230 SCRIPTNAME LETTER CHARNAME
231
232in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
233then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
234is ignored.
235
236Note that C<\N{...}> is compile-time, it's a special form of string
237constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 238use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 239functionality, use charnames::vianame().
423cee85
JH
240
241=head1 CUSTOM TRANSLATORS
242
d5448623 243The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 244hardwired into F<charnames.pm>. A module can install custom
d5448623 245translations (inside the scope which C<use>s the module) with the
423cee85
JH
246following magic incantation:
247
d5448623
GS
248 use charnames (); # for $charnames::hint_bits
249 sub import {
250 shift;
251 $^H |= $charnames::hint_bits;
252 $^H{charnames} = \&translator;
253 }
423cee85
JH
254
255Here translator() is a subroutine which takes C<CHARNAME> as an
256argument, and returns text to insert into the string instead of the
4a2d328f 257C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
258in C<bytes> mode and out of it, the function should check the current
259state of C<bytes>-flag as in:
260
261 use bytes (); # for $bytes::hint_bits
262 sub translator {
263 if ($^H & $bytes::hint_bits) {
264 return bytes_translator(@_);
265 }
266 else {
267 return utf8_translator(@_);
268 }
423cee85 269 }
423cee85 270
b177ca84
JF
271=head1 charnames::viacode(code)
272
273Returns the full name of the character indicated by the numeric code.
274The example
275
276 print charnames::viacode(0x2722);
277
278prints "FOUR TEARDROP-SPOKED ASTERISK".
279
daf0d493
JH
280Returns undef if no name is known for the code.
281
282This works only for the standard names, and does not yet aply
283to custom translators.
284
285=head1 charnames::vianame(code)
286
287Returns the code point indicated by the name.
288The example
289
290 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
291
292prints "2722".
293
294Returns undef if no name is known for the name.
b177ca84
JF
295
296This works only for the standard names, and does not yet aply
297to custom translators.
298
f0175764
JH
299=head1 ILLEGAL CHARACTERS
300
301If you ask for a character that does not exist, a warning is given
302and the special Unicode I<replacement character> "\x{FFFD}" is returned.
303
423cee85
JH
304=head1 BUGS
305
306Since evaluation of the translation function happens in a middle of
307compilation (of a string literal), the translation function should not
308do any C<eval>s or C<require>s. This restriction should be lifted in
309a future version of Perl.
310
311=cut