This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
memset() is cheaper than a loop of 256 bit-a-a-times
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b177ca84
JF
2use strict;
3use warnings;
4use Carp;
5our $VERSION = '1.01';
b75c8c73 6
d5448623
GS
7use bytes (); # for $bytes::hint_bits
8$charnames::hint_bits = 0x20000;
423cee85 9
423cee85
JH
10my $txt;
11
12# This is not optimized in any way yet
b177ca84
JF
13sub charnames
14{
15 my $name = shift;
16
17 ## Suck in the code/name list as a big string.
18 ## Lines look like:
19 ## "0052\t\tLATIN CAPITAL LETTER R\n"
55d7b906 20 $txt = do "unicore/Name.pl" unless $txt;
b177ca84
JF
21
22 ## @off will hold the index into the code/name string of the start and
23 ## end of the name as we find it.
423cee85 24 my @off;
b177ca84
JF
25
26 ## If :full, look for the the name exactly
423cee85
JH
27 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
28 @off = ($-[0], $+[0]);
29 }
b177ca84
JF
30
31 ## If we didn't get above, and :short allowed, look for the short name.
32 ## The short name is like "greek:Sigma"
423cee85 33 unless (@off) {
b177ca84 34 if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
423cee85
JH
35 my ($script, $cname) = ($1,$2);
36 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
37 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
38 @off = ($-[0], $+[0]);
39 }
40 }
41 }
b177ca84
JF
42
43 ## If we still don't have it, check for the name among the loaded
44 ## scripts.
45 if (not @off)
46 {
47 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
48 for my $script ( @{$^H{charnames_scripts}} )
49 {
50 if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
51 @off = ($-[0], $+[0]);
52 last;
53 }
54 }
423cee85 55 }
b177ca84
JF
56
57 ## If we don't have it by now, give up.
423cee85 58 die "Unknown charname '$name'" unless @off;
b896c7a5 59
b177ca84
JF
60 ##
61 ## Now know where in the string the name starts.
62 ## The code, in hex, is befor that.
63 ##
64 ## The code can be 4-6 characters long, so we've got to sort of
65 ## go look for it, just after the newline that comes before $off[0].
66 ##
67 ## This would be much easier if unicore/Name.pl had info in
68 ## a name/code order, instead of code/name order.
69 ##
70 ## The +1 after the rindex() is to skip past the newline we're finding,
71 ## or, if the rindex() fails, to put us to an offset of zero.
72 ##
73 my $hexstart = rindex($txt, "\n", $off[0]) + 1;
74
75 ## we know where it starts, so turn into number - the ordinal for the char.
76 my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
77
d5448623 78 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 79 use bytes;
d41ff1b8
GS
80 return chr $ord if $ord <= 255;
81 my $hex = sprintf '%X=0%o', $ord, $ord;
82 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
83 die "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 84 }
bfa383d6 85 return pack "U", $ord;
423cee85
JH
86}
87
b177ca84
JF
88sub import
89{
90 shift; ## ignore class name
91
92 if (not @_)
93 {
94 carp("`use charnames' needs explicit imports list");
95 }
d5448623 96 $^H |= $charnames::hint_bits;
423cee85 97 $^H{charnames} = \&charnames ;
b177ca84
JF
98
99 ##
100 ## fill %h keys with our @_ args.
101 ##
423cee85
JH
102 my %h;
103 @h{@_} = (1) x @_;
b177ca84 104
423cee85
JH
105 $^H{charnames_full} = delete $h{':full'};
106 $^H{charnames_short} = delete $h{':short'};
107 $^H{charnames_scripts} = [map uc, keys %h];
b177ca84
JF
108
109 ##
110 ## If utf8? warnings are enabled, and some scripts were given,
111 ## see if at least we can find one letter of each script.
112 ##
113 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
114 {
115 $txt = do "unicore/Name.pl" unless $txt;
116
117 for my $script (@{$^H{charnames_scripts}})
118 {
119 if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
120 warnings::warn('utf8', "No such script: '$script'");
121 }
122 }
bd62941a 123 }
423cee85
JH
124}
125
b177ca84
JF
126sub viacode
127{
128 if (@_ != 1) {
129 carp "charnames::viacode() expects one numeric value";
130 return ()
131 }
132 my $arg = shift;
133
134 my $hex;
135 if ($arg =~ m/^[0-9]+$/) {
136 $hex = sprintf "%04X", $arg;
137 } else {
138 carp("unexpected arg \"$arg\" to charnames::viacode()");
139 return ();
140 }
141
142 $txt = do "unicore/Name.pl" unless $txt;
143
144 if ($txt =~ m/^$hex\t\t(.+)/m) {
145 return $1;
146 } else {
147 return ();
148 }
149}
150
423cee85
JH
151
1521;
153__END__
154
155=head1 NAME
156
b177ca84 157charnames - define character names for C<\N{named}> string literal escapes.
423cee85
JH
158
159=head1 SYNOPSIS
160
161 use charnames ':full';
4a2d328f 162 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
163
164 use charnames ':short';
4a2d328f 165 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
166
167 use charnames qw(cyrillic greek);
4a2d328f 168 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 169
b177ca84
JF
170 print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
171
423cee85
JH
172=head1 DESCRIPTION
173
174Pragma C<use charnames> supports arguments C<:full>, C<:short> and
175script names. If C<:full> is present, for expansion of
4a2d328f 176C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85
JH
177standard Unicode names of chars. If C<:short> is present, and
178C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
179as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 180with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85
JH
181C<CHARNAME> is looked up as a letter in the given scripts (in the
182specified order).
183
184For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 185this pragma looks for the names
423cee85
JH
186
187 SCRIPTNAME CAPITAL LETTER CHARNAME
188 SCRIPTNAME SMALL LETTER CHARNAME
189 SCRIPTNAME LETTER CHARNAME
190
191in the table of standard Unicode names. If C<CHARNAME> is lowercase,
d5448623 192then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
423cee85
JH
193ignored.
194
195=head1 CUSTOM TRANSLATORS
196
d5448623 197The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 198hardwired into F<charnames.pm>. A module can install custom
d5448623 199translations (inside the scope which C<use>s the module) with the
423cee85
JH
200following magic incantation:
201
d5448623
GS
202 use charnames (); # for $charnames::hint_bits
203 sub import {
204 shift;
205 $^H |= $charnames::hint_bits;
206 $^H{charnames} = \&translator;
207 }
423cee85
JH
208
209Here translator() is a subroutine which takes C<CHARNAME> as an
210argument, and returns text to insert into the string instead of the
4a2d328f 211C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
212in C<bytes> mode and out of it, the function should check the current
213state of C<bytes>-flag as in:
214
215 use bytes (); # for $bytes::hint_bits
216 sub translator {
217 if ($^H & $bytes::hint_bits) {
218 return bytes_translator(@_);
219 }
220 else {
221 return utf8_translator(@_);
222 }
423cee85 223 }
423cee85 224
b177ca84
JF
225=head1 charnames::viacode(code)
226
227Returns the full name of the character indicated by the numeric code.
228The example
229
230 print charnames::viacode(0x2722);
231
232prints "FOUR TEARDROP-SPOKED ASTERISK".
233
234Returns nothing if no name is known for the code.
235
236This works only for the standard names, and does not yet aply
237to custom translators.
238
423cee85
JH
239=head1 BUGS
240
241Since evaluation of the translation function happens in a middle of
242compilation (of a string literal), the translation function should not
243do any C<eval>s or C<require>s. This restriction should be lifted in
244a future version of Perl.
245
246=cut