Commit | Line | Data |
---|---|---|
423cee85 | 1 | package charnames; |
b177ca84 JF |
2 | use strict; |
3 | use warnings; | |
4 | use Carp; | |
5 | our $VERSION = '1.01'; | |
b75c8c73 | 6 | |
d5448623 GS |
7 | use bytes (); # for $bytes::hint_bits |
8 | $charnames::hint_bits = 0x20000; | |
423cee85 | 9 | |
423cee85 JH |
10 | my $txt; |
11 | ||
12 | # This is not optimized in any way yet | |
b177ca84 JF |
13 | sub charnames |
14 | { | |
15 | my $name = shift; | |
16 | ||
17 | ## Suck in the code/name list as a big string. | |
18 | ## Lines look like: | |
19 | ## "0052\t\tLATIN CAPITAL LETTER R\n" | |
55d7b906 | 20 | $txt = do "unicore/Name.pl" unless $txt; |
b177ca84 JF |
21 | |
22 | ## @off will hold the index into the code/name string of the start and | |
23 | ## end of the name as we find it. | |
423cee85 | 24 | my @off; |
b177ca84 JF |
25 | |
26 | ## If :full, look for the the name exactly | |
423cee85 JH |
27 | if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) { |
28 | @off = ($-[0], $+[0]); | |
29 | } | |
b177ca84 JF |
30 | |
31 | ## If we didn't get above, and :short allowed, look for the short name. | |
32 | ## The short name is like "greek:Sigma" | |
423cee85 | 33 | unless (@off) { |
b177ca84 | 34 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { |
423cee85 JH |
35 | my ($script, $cname) = ($1,$2); |
36 | my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); | |
37 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) { | |
38 | @off = ($-[0], $+[0]); | |
39 | } | |
40 | } | |
41 | } | |
b177ca84 JF |
42 | |
43 | ## If we still don't have it, check for the name among the loaded | |
44 | ## scripts. | |
45 | if (not @off) | |
46 | { | |
47 | my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); | |
48 | for my $script ( @{$^H{charnames_scripts}} ) | |
49 | { | |
50 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) { | |
51 | @off = ($-[0], $+[0]); | |
52 | last; | |
53 | } | |
54 | } | |
423cee85 | 55 | } |
b177ca84 JF |
56 | |
57 | ## If we don't have it by now, give up. | |
423cee85 | 58 | die "Unknown charname '$name'" unless @off; |
b896c7a5 | 59 | |
b177ca84 JF |
60 | ## |
61 | ## Now know where in the string the name starts. | |
62 | ## The code, in hex, is befor that. | |
63 | ## | |
64 | ## The code can be 4-6 characters long, so we've got to sort of | |
65 | ## go look for it, just after the newline that comes before $off[0]. | |
66 | ## | |
67 | ## This would be much easier if unicore/Name.pl had info in | |
68 | ## a name/code order, instead of code/name order. | |
69 | ## | |
70 | ## The +1 after the rindex() is to skip past the newline we're finding, | |
71 | ## or, if the rindex() fails, to put us to an offset of zero. | |
72 | ## | |
73 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; | |
74 | ||
75 | ## we know where it starts, so turn into number - the ordinal for the char. | |
76 | my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); | |
77 | ||
d5448623 | 78 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
8058d7ab | 79 | use bytes; |
d41ff1b8 GS |
80 | return chr $ord if $ord <= 255; |
81 | my $hex = sprintf '%X=0%o', $ord, $ord; | |
82 | my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; | |
83 | die "Character 0x$hex with name '$fname' is above 0xFF"; | |
423cee85 | 84 | } |
bfa383d6 | 85 | return pack "U", $ord; |
423cee85 JH |
86 | } |
87 | ||
b177ca84 JF |
88 | sub import |
89 | { | |
90 | shift; ## ignore class name | |
91 | ||
92 | if (not @_) | |
93 | { | |
94 | carp("`use charnames' needs explicit imports list"); | |
95 | } | |
d5448623 | 96 | $^H |= $charnames::hint_bits; |
423cee85 | 97 | $^H{charnames} = \&charnames ; |
b177ca84 JF |
98 | |
99 | ## | |
100 | ## fill %h keys with our @_ args. | |
101 | ## | |
423cee85 JH |
102 | my %h; |
103 | @h{@_} = (1) x @_; | |
b177ca84 | 104 | |
423cee85 JH |
105 | $^H{charnames_full} = delete $h{':full'}; |
106 | $^H{charnames_short} = delete $h{':short'}; | |
107 | $^H{charnames_scripts} = [map uc, keys %h]; | |
b177ca84 JF |
108 | |
109 | ## | |
110 | ## If utf8? warnings are enabled, and some scripts were given, | |
111 | ## see if at least we can find one letter of each script. | |
112 | ## | |
113 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) | |
114 | { | |
115 | $txt = do "unicore/Name.pl" unless $txt; | |
116 | ||
117 | for my $script (@{$^H{charnames_scripts}}) | |
118 | { | |
119 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { | |
120 | warnings::warn('utf8', "No such script: '$script'"); | |
121 | } | |
122 | } | |
bd62941a | 123 | } |
423cee85 JH |
124 | } |
125 | ||
4e2cda5d JH |
126 | my %viacode; |
127 | ||
b177ca84 JF |
128 | sub viacode |
129 | { | |
130 | if (@_ != 1) { | |
daf0d493 | 131 | carp "charnames::viacode() expects one numeric argument"; |
b177ca84 JF |
132 | return () |
133 | } | |
134 | my $arg = shift; | |
135 | ||
136 | my $hex; | |
137 | if ($arg =~ m/^[0-9]+$/) { | |
138 | $hex = sprintf "%04X", $arg; | |
139 | } else { | |
140 | carp("unexpected arg \"$arg\" to charnames::viacode()"); | |
daf0d493 | 141 | return; |
b177ca84 JF |
142 | } |
143 | ||
4e2cda5d JH |
144 | return $viacode{$hex} if exists $viacode{$hex}; |
145 | ||
b177ca84 JF |
146 | $txt = do "unicore/Name.pl" unless $txt; |
147 | ||
148 | if ($txt =~ m/^$hex\t\t(.+)/m) { | |
4e2cda5d | 149 | return $viacode{$hex} = $1; |
b177ca84 | 150 | } else { |
daf0d493 JH |
151 | return; |
152 | } | |
153 | } | |
154 | ||
4e2cda5d JH |
155 | my %vianame; |
156 | ||
daf0d493 JH |
157 | sub vianame |
158 | { | |
159 | if (@_ != 1) { | |
160 | carp "charnames::vianame() expects one name argument"; | |
161 | return () | |
162 | } | |
163 | ||
164 | my $arg = shift; | |
165 | ||
4e2cda5d JH |
166 | return $vianame{$arg} if exists $vianame{$arg}; |
167 | ||
daf0d493 JH |
168 | $txt = do "unicore/Name.pl" unless $txt; |
169 | ||
170 | if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) { | |
4e2cda5d | 171 | return $vianame{$arg} = hex $1; |
daf0d493 JH |
172 | } else { |
173 | return; | |
b177ca84 JF |
174 | } |
175 | } | |
176 | ||
423cee85 JH |
177 | |
178 | 1; | |
179 | __END__ | |
180 | ||
181 | =head1 NAME | |
182 | ||
b177ca84 | 183 | charnames - define character names for C<\N{named}> string literal escapes. |
423cee85 JH |
184 | |
185 | =head1 SYNOPSIS | |
186 | ||
187 | use charnames ':full'; | |
4a2d328f | 188 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
423cee85 JH |
189 | |
190 | use charnames ':short'; | |
4a2d328f | 191 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
423cee85 JH |
192 | |
193 | use charnames qw(cyrillic greek); | |
4a2d328f | 194 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
423cee85 | 195 | |
b177ca84 | 196 | print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
daf0d493 | 197 | printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330" |
b177ca84 | 198 | |
423cee85 JH |
199 | =head1 DESCRIPTION |
200 | ||
201 | Pragma C<use charnames> supports arguments C<:full>, C<:short> and | |
202 | script names. If C<:full> is present, for expansion of | |
4a2d328f | 203 | C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of |
423cee85 JH |
204 | standard Unicode names of chars. If C<:short> is present, and |
205 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up | |
206 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used | |
4a2d328f | 207 | with script name arguments, then for C<\N{CHARNAME}}> the name |
423cee85 JH |
208 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
209 | specified order). | |
210 | ||
211 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> | |
d5448623 | 212 | this pragma looks for the names |
423cee85 JH |
213 | |
214 | SCRIPTNAME CAPITAL LETTER CHARNAME | |
215 | SCRIPTNAME SMALL LETTER CHARNAME | |
216 | SCRIPTNAME LETTER CHARNAME | |
217 | ||
218 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, | |
daf0d493 JH |
219 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
220 | is ignored. | |
221 | ||
222 | Note that C<\N{...}> is compile-time, it's a special form of string | |
223 | constant used inside double-quoted strings: in other words, you cannot | |
4e2cda5d | 224 | use variables inside the C<\N{...}>. If you want similar run-time |
daf0d493 | 225 | functionality, use charnames::vianame(). |
423cee85 JH |
226 | |
227 | =head1 CUSTOM TRANSLATORS | |
228 | ||
d5448623 | 229 | The mechanism of translation of C<\N{...}> escapes is general and not |
423cee85 | 230 | hardwired into F<charnames.pm>. A module can install custom |
d5448623 | 231 | translations (inside the scope which C<use>s the module) with the |
423cee85 JH |
232 | following magic incantation: |
233 | ||
d5448623 GS |
234 | use charnames (); # for $charnames::hint_bits |
235 | sub import { | |
236 | shift; | |
237 | $^H |= $charnames::hint_bits; | |
238 | $^H{charnames} = \&translator; | |
239 | } | |
423cee85 JH |
240 | |
241 | Here translator() is a subroutine which takes C<CHARNAME> as an | |
242 | argument, and returns text to insert into the string instead of the | |
4a2d328f | 243 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
d5448623 GS |
244 | in C<bytes> mode and out of it, the function should check the current |
245 | state of C<bytes>-flag as in: | |
246 | ||
247 | use bytes (); # for $bytes::hint_bits | |
248 | sub translator { | |
249 | if ($^H & $bytes::hint_bits) { | |
250 | return bytes_translator(@_); | |
251 | } | |
252 | else { | |
253 | return utf8_translator(@_); | |
254 | } | |
423cee85 | 255 | } |
423cee85 | 256 | |
b177ca84 JF |
257 | =head1 charnames::viacode(code) |
258 | ||
259 | Returns the full name of the character indicated by the numeric code. | |
260 | The example | |
261 | ||
262 | print charnames::viacode(0x2722); | |
263 | ||
264 | prints "FOUR TEARDROP-SPOKED ASTERISK". | |
265 | ||
daf0d493 JH |
266 | Returns undef if no name is known for the code. |
267 | ||
268 | This works only for the standard names, and does not yet aply | |
269 | to custom translators. | |
270 | ||
271 | =head1 charnames::vianame(code) | |
272 | ||
273 | Returns the code point indicated by the name. | |
274 | The example | |
275 | ||
276 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); | |
277 | ||
278 | prints "2722". | |
279 | ||
280 | Returns undef if no name is known for the name. | |
b177ca84 JF |
281 | |
282 | This works only for the standard names, and does not yet aply | |
283 | to custom translators. | |
284 | ||
423cee85 JH |
285 | =head1 BUGS |
286 | ||
287 | Since evaluation of the translation function happens in a middle of | |
288 | compilation (of a string literal), the translation function should not | |
289 | do any C<eval>s or C<require>s. This restriction should be lifted in | |
290 | a future version of Perl. | |
291 | ||
292 | =cut |