This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The hoops one has to jump through.
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623
GS
7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85
JH
10my $txt;
11
12# This is not optimized in any way yet
b177ca84
JF
13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84
JF
21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84
JF
25
26 ## If :full, look for the the name exactly
423cee85
JH
27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84
JF
30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85
JH
35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84
JF
42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84
JF
56
57 ## If we don't have it by now, give up.
423cee85 58 die "Unknown charname '$name'" unless @off;
b896c7a5 59
b177ca84
JF
60 ##
61 ## Now know where in the string the name starts.
62 ## The code, in hex, is befor that.
63 ##
64 ## The code can be 4-6 characters long, so we've got to sort of
65 ## go look for it, just after the newline that comes before $off[0].
66 ##
67 ## This would be much easier if unicore/Name.pl had info in
68 ## a name/code order, instead of code/name order.
69 ##
70 ## The +1 after the rindex() is to skip past the newline we're finding,
71 ## or, if the rindex() fails, to put us to an offset of zero.
72 ##
73 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
74
75 ## we know where it starts, so turn into number - the ordinal for the char.
76 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
77
d5448623 78 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 79 use bytes;
d41ff1b8
GS
80 return chr $ord if $ord <= 255;
81 my $hex = sprintf '%X=0%o', $ord, $ord;
82 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
83 die "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 84 }
bfa383d6 85 return pack "U", $ord;
423cee85
JH
86}
87
b177ca84
JF
88sub import
89{
90 shift; ## ignore class name
91
92 if (not @_)
93 {
94 carp("`use charnames' needs explicit imports list");
95 }
d5448623 96 $^H |= $charnames::hint_bits;
423cee85 97 $^H{charnames} = \&charnames ;
b177ca84
JF
98
99 ##
100 ## fill %h keys with our @_ args.
101 ##
423cee85
JH
102 my %h;
103 @h{@_} = (1) x @_;
b177ca84 104
423cee85
JH
105 $^H{charnames_full} = delete $h{':full'};
106 $^H{charnames_short} = delete $h{':short'};
107 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84
JF
108
109 ##
110 ## If utf8? warnings are enabled, and some scripts were given,
111 ## see if at least we can find one letter of each script.
112 ##
113 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
114 {
115 $txt = do "unicore/Name.pl" unless $txt;
116
117 for my $script (@{$^H{charnames_scripts}})
118 {
119 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
120 warnings::warn('utf8', "No such script: '$script'");
121 }
122 }
bd62941a 123 }
423cee85
JH
124}
125
4e2cda5d
JH
126my %viacode;
127
b177ca84
JF
128sub viacode
129{
130 if (@_ != 1) {
daf0d493 131 carp "charnames::viacode() expects one numeric argument";
b177ca84
JF
132 return ()
133 }
134 my $arg = shift;
135
136 my $hex;
137 if ($arg =~ m/^[0-9]+$/) {
138 $hex = sprintf "%04X", $arg;
139 } else {
140 carp("unexpected arg \"$arg\" to charnames::viacode()");
daf0d493 141 return;
b177ca84
JF
142 }
143
4e2cda5d
JH
144 return $viacode{$hex} if exists $viacode{$hex};
145
b177ca84
JF
146 $txt = do "unicore/Name.pl" unless $txt;
147
148 if ($txt =~ m/^$hex\t\t(.+)/m) {
4e2cda5d 149 return $viacode{$hex} = $1;
b177ca84 150 } else {
daf0d493
JH
151 return;
152 }
153}
154
4e2cda5d
JH
155my %vianame;
156
daf0d493
JH
157sub vianame
158{
159 if (@_ != 1) {
160 carp "charnames::vianame() expects one name argument";
161 return ()
162 }
163
164 my $arg = shift;
165
4e2cda5d
JH
166 return $vianame{$arg} if exists $vianame{$arg};
167
daf0d493
JH
168 $txt = do "unicore/Name.pl" unless $txt;
169
170 if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
4e2cda5d 171 return $vianame{$arg} = hex $1;
daf0d493
JH
172 } else {
173 return;
b177ca84
JF
174 }
175}
176
423cee85
JH
177
1781;
179__END__
180
181=head1 NAME
182
b177ca84 183charnames - define character names for C<\N{named}> string literal escapes.
423cee85
JH
184
185=head1 SYNOPSIS
186
187 use charnames ':full';
4a2d328f 188 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
189
190 use charnames ':short';
4a2d328f 191 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
192
193 use charnames qw(cyrillic greek);
4a2d328f 194 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 195
b177ca84 196 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
daf0d493 197 printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
b177ca84 198
423cee85
JH
199=head1 DESCRIPTION
200
201Pragma C<use charnames> supports arguments C<:full>, C<:short> and
202script names. If C<:full> is present, for expansion of
4a2d328f 203C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85
JH
204standard Unicode names of chars. If C<:short> is present, and
205C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
206as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 207with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85
JH
208C<CHARNAME> is looked up as a letter in the given scripts (in the
209specified order).
210
211For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 212this pragma looks for the names
423cee85
JH
213
214 SCRIPTNAME CAPITAL LETTER CHARNAME
215 SCRIPTNAME SMALL LETTER CHARNAME
216 SCRIPTNAME LETTER CHARNAME
217
218in the table of standard Unicode names. If C<CHARNAME> is lowercase,
daf0d493
JH
219then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
220is ignored.
221
222Note that C<\N{...}> is compile-time, it's a special form of string
223constant used inside double-quoted strings: in other words, you cannot
4e2cda5d 224use variables inside the C<\N{...}>. If you want similar run-time
daf0d493 225functionality, use charnames::vianame().
423cee85
JH
226
227=head1 CUSTOM TRANSLATORS
228
d5448623 229The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 230hardwired into F<charnames.pm>. A module can install custom
d5448623 231translations (inside the scope which C<use>s the module) with the
423cee85
JH
232following magic incantation:
233
d5448623
GS
234 use charnames (); # for $charnames::hint_bits
235 sub import {
236 shift;
237 $^H |= $charnames::hint_bits;
238 $^H{charnames} = \&translator;
239 }
423cee85
JH
240
241Here translator() is a subroutine which takes C<CHARNAME> as an
242argument, and returns text to insert into the string instead of the
4a2d328f 243C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
244in C<bytes> mode and out of it, the function should check the current
245state of C<bytes>-flag as in:
246
247 use bytes (); # for $bytes::hint_bits
248 sub translator {
249 if ($^H & $bytes::hint_bits) {
250 return bytes_translator(@_);
251 }
252 else {
253 return utf8_translator(@_);
254 }
423cee85 255 }
423cee85 256
b177ca84
JF
257=head1 charnames::viacode(code)
258
259Returns the full name of the character indicated by the numeric code.
260The example
261
262 print charnames::viacode(0x2722);
263
264prints "FOUR TEARDROP-SPOKED ASTERISK".
265
daf0d493
JH
266Returns undef if no name is known for the code.
267
268This works only for the standard names, and does not yet aply
269to custom translators.
270
271=head1 charnames::vianame(code)
272
273Returns the code point indicated by the name.
274The example
275
276 printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
277
278prints "2722".
279
280Returns undef if no name is known for the name.
b177ca84
JF
281
282This works only for the standard names, and does not yet aply
283to custom translators.
284
423cee85
JH
285=head1 BUGS
286
287Since evaluation of the translation function happens in a middle of
288compilation (of a string literal), the translation function should not
289do any C<eval>s or C<require>s. This restriction should be lifted in
290a future version of Perl.
291
292=cut