Commit | Line | Data |
---|---|---|
423cee85 | 1 | package charnames; |
b177ca84 JF |
2 | use strict; |
3 | use warnings; | |
280e9a5c | 4 | our $VERSION = '1.50'; |
a03f0b9f | 5 | use unicore::Name; # mktables-generated algorithmically-defined names |
e7a078a0 | 6 | use _charnames (); # The submodule for this where most of the work gets done |
b75c8c73 | 7 | |
52fb7278 | 8 | use bytes (); # for $bytes::hint_bits |
123148a1 | 9 | use re "/aa"; # Everything in here should be ASCII |
423cee85 | 10 | |
38f4139d | 11 | # Translate between Unicode character names and their code points. |
e7a078a0 KW |
12 | # This is a wrapper around the submodule C<_charnames>. This design allows |
13 | # C<_charnames> to be autoloaded to enable use of \N{...}, but requires this | |
14 | # module to be explicitly requested for the functions API. | |
b177ca84 | 15 | |
889a6fe0 | 16 | $Carp::Internal{ (__PACKAGE__) } = 1; |
63098191 | 17 | |
b177ca84 JF |
sub import
{
  # Called for 'use charnames LIST'.  The invocant (this package's name) is
  # of no interest; every remaining argument is handed, unchanged, to the
  # _charnames submodule, which does the actual work.
  shift @_;
  return _charnames->import(@_);
}
423cee85 | 23 | |
84374e30 KW |
# Returns the name of the code point given by the single argument (see the
# pod for charnames::viacode for the accepted input forms and for which name
# is chosen when there are several).  This is a plain pass-through: the
# arguments and the calling context are handed unchanged to the
# implementation in the _charnames submodule, and its result is returned
# as-is.  No cache is kept in this file.
sub viacode {
  return _charnames::viacode(@_);
}
daf0d493 JH |
32 | |
sub vianame
{
  # Run-time lookup of a character name.  For an ordinary name, returns its
  # ordinal if found, undef otherwise.  For the legacy "U+hex" form it
  # returns the character itself rather than an ordinal.

  unless (@_ == 1) {
    _charnames::carp("charnames::vianame() expects one name argument");
    return ();
  }

  my ($name) = @_;
  return () unless length $name;

  if ($name =~ /^U\+([0-9a-fA-F]+)$/) {

    # khw claims that this is poor interface design.  The function should
    # return either an ord or a chr for all inputs; not be bipolar.  But it
    # can't be changed because of backward compatibility.  New code can use
    # string_vianame() instead.
    my $code_point = CORE::hex $1;

    # A character above 255 can't be returned when the caller is under
    # 'use bytes'; element [8] of caller() is the caller's compile-time hints.
    if ($code_point > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
      _charnames::carp(
                _charnames::not_legal_use_bytes_msg($name, chr($code_point)));
      return;
    }

    return chr utf8::unicode_to_native($code_point);
  }

  # The first 1 arg means an ord is wanted; the second that we are in
  # runtime, and this is the first level routine called from the user
  return _charnames::lookup_name($name, 1, 1);
} # vianame
b177ca84 | 63 | |
fb121860 KW |
sub string_vianame {

  # Run-time lookup of a character name (or named sequence).  Returns its
  # string representation if found, undef otherwise.

  unless (@_ == 1) {
    _charnames::carp(
                "charnames::string_vianame() expects one name argument");
    return;
  }

  my ($name) = @_;
  return () unless length $name;

  if ($name =~ /^U\+([0-9a-fA-F]+)$/) {

    my $code_point = CORE::hex $1;

    # A character above 255 can't be returned when the caller is under
    # 'use bytes'; element [8] of caller() is the caller's compile-time hints.
    if ($code_point > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
      _charnames::carp(
                _charnames::not_legal_use_bytes_msg($name, chr($code_point)));
      return;
    }

    return chr utf8::unicode_to_native($code_point);
  }

  # The 0 arg means a string is wanted; the 1 arg means that we are in
  # runtime, and this is the first level routine called from the user
  return _charnames::lookup_name($name, 0, 1);
} # string_vianame
91 | ||
423cee85 JH |
92 | 1; |
93 | __END__ | |
94 | ||
bde9e88d KW |
95 | =encoding utf8 |
96 | ||
423cee85 JH |
97 | =head1 NAME |
98 | ||
fb121860 | 99 | charnames - access to Unicode character names and named character sequences; also define character names |
423cee85 JH |
100 | |
101 | =head1 SYNOPSIS | |
102 | ||
bcc08981 KW |
103 | use charnames ':full'; |
104 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; | |
105 | print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}", | |
106 | " is an officially named sequence of two Unicode characters\n"; | |
107 | ||
38f4139d KW |
108 | use charnames ':loose'; |
109 | print "\N{Greek small-letter sigma}", | |
110 | " can be used to ignore case, underscores, most blanks,", | |
111 | " and when you aren't sure if the official name has hyphens\n"; | |
112 | ||
bcc08981 KW |
113 | use charnames ':short'; |
114 | print "\N{greek:Sigma} is an upper-case sigma.\n"; | |
115 | ||
116 | use charnames qw(cyrillic greek); | |
117 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; | |
118 | ||
bde9e88d | 119 | use utf8; |
bcc08981 KW |
120 | use charnames ":full", ":alias" => { |
121 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", | |
122 | mychar => 0xE800, # Private use area | |
bde9e88d | 123 | "自転車に乗る人" => "BICYCLIST" |
bcc08981 KW |
124 | }; |
125 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; | |
14aeae98 | 126 | print "\N{mychar} allows me to name private use characters.\n"; |
bde9e88d KW |
127 | print "And I can create synonyms in other languages,", | |
128 | " such as \N{自転車に乗る人} for BICYCLIST (U+1F6B4)\n"; | |
bcc08981 KW |
129 | |
130 | use charnames (); | |
131 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" | |
132 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints | |
133 | # "10330" | |
134 | print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on | |
135 | # ASCII platforms; | |
136 | # 193 on EBCDIC | |
137 | print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A" | |
b177ca84 | 138 | |
423cee85 JH |
139 | =head1 DESCRIPTION |
140 | ||
da9dec57 | 141 | Pragma C<use charnames> is used to gain access to the names of the |
fb121860 KW |
142 | Unicode characters and named character sequences, and to allow you to define |
143 | your own character and character sequence names. | |
144 | ||
145 | All forms of the pragma enable use of the following 3 functions: | |
146 | ||
147 | =over | |
148 | ||
149 | =item * | |
150 | ||
151 | L</charnames::string_vianame(I<name>)> for run-time lookup of | |
152 | either a character name or a named character sequence, returning its string | |
153 | representation | |
154 | ||
155 | =item * | |
156 | ||
157 | L</charnames::vianame(I<name>)> for run-time lookup of a | |
158 | character name (but not a named character sequence) to get its ordinal value | |
159 | (code point) | |
da9dec57 | 160 | |
fb121860 | 161 | =item * |
da9dec57 | 162 | |
fb121860 KW |
163 | L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its |
164 | Unicode name. | |
165 | ||
166 | =back | |
167 | ||
1f3b4888 | 168 | Starting in Perl v5.16, any occurrence of C<\N{I<CHARNAME>}> sequences |
fbb93542 KW |
169 | in a double-quotish string automatically loads this module with arguments |
170 | C<:full> and C<:short> (described below) if it hasn't already been loaded with | |
171 | different arguments, in order to compile the named Unicode character into | |
1f3b4888 KW |
172 | position in the string. Prior to v5.16, an explicit S<C<use charnames>> was |
173 | required to enable this usage. (However, prior to v5.16, the form C<S<"use | |
fbb93542 | 174 | charnames ();">> did not enable C<\N{I<CHARNAME>}>.) |
da9dec57 KW |
175 | |
176 | Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number, | |
fbb93542 | 177 | also inserts a character into a string. |
22bd7dd2 | 178 | The character it inserts is the one whose Unicode code point |
da9dec57 | 179 | (ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is |
fbb93542 KW |
180 | the Unicode (white background, black foreground) smiley face |
181 | equivalent to C<"\N{WHITE SMILING FACE}">. | |
d9f23c72 | 182 | Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character |
8ebef31d KW |
183 | name, when the I<...> is a number or a comma-separated pair of numbers | |
184 | (see L<perlreref/QUANTIFIERS>); that usage is not related to this pragma. | |
da9dec57 | 185 | |
38f4139d KW |
186 | The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>, |
187 | script names and L<customized aliases|/CUSTOM ALIASES>. | |
188 | ||
189 | If C<:full> is present, for expansion of | |
da9dec57 | 190 | C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of |
38f4139d KW |
191 | standard Unicode character names. |
192 | ||
193 | C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less | |
194 | precisely specified. Details are in L</LOOSE MATCHES>. | |
195 | ||
196 | If C<:short> is present, and | |
da9dec57 | 197 | I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up |
14aeae98 KW |
198 | as a letter in script I<SCRIPT>, as described in the next paragraph. |
199 | Or, if C<use charnames> is used | |
da9dec57 KW |
200 | with script name arguments, then for C<\N{I<CHARNAME>}> the name |
201 | I<CHARNAME> is looked up as a letter in the given scripts (in the | |
16036bcd KW |
202 | specified order). Customized aliases can override these, and are explained in |
203 | L</CUSTOM ALIASES>. | |
423cee85 | 204 | |
1f3b4888 | 205 | For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>, |
14aeae98 | 206 | this pragma looks in the table of standard Unicode names for the names |
423cee85 JH |
207 | |
208 | SCRIPTNAME CAPITAL LETTER CHARNAME | |
209 | SCRIPTNAME SMALL LETTER CHARNAME | |
210 | SCRIPTNAME LETTER CHARNAME | |
211 | ||
14aeae98 | 212 | If I<CHARNAME> is all lowercase, |
daf0d493 | 213 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
14aeae98 | 214 | is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all |
38f4139d KW |
215 | uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE |
216 | MATCHES> rules if C<:loose> is also specified; strict otherwise. | |
daf0d493 | 217 | |
da9dec57 KW |
218 | Note that C<\N{...}> is compile-time; it's a special form of string |
219 | constant used inside double-quotish strings; this means that you cannot | |
4e2cda5d | 220 | use variables inside the C<\N{...}>. If you want similar run-time |
fb121860 KW |
221 | functionality, use |
222 | L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>. | |
423cee85 | 223 | |
67db75e3 KW |
224 | Note, starting in Perl 5.18, the name C<BELL> refers to the Unicode character |
225 | U+1F514, instead of the traditional U+0007. For the latter, use C<ALERT> | |
226 | or C<BEL>. | |
301a3cda | 227 | |
90249f0a | 228 | It is a syntax error to use C<\N{NAME}> where C<NAME> is unknown. |
e5432b89 | 229 | |
8ebef31d KW |
230 | For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the |
231 | input name is that of a character that won't fit into a byte (i.e., whose | |
232 | ordinal is above 255). | |
e5432b89 | 233 | |
da9dec57 | 234 | Otherwise, any string that includes a C<\N{I<charname>}> or |
850b7ec9 | 235 | C<S<\N{U+I<code point>}>> will automatically have Unicode rules (see |
da9dec57 KW |
236 | L<perlunicode/Byte and Character Semantics>). |
237 | ||
38f4139d KW |
238 | =head1 LOOSE MATCHES |
239 | ||
240 | By specifying C<:loose>, Unicode's L<loose character name | |
5ef88e32 | 241 | matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are |
38f4139d KW |
242 | selected instead of the strict exact match used otherwise. |
243 | That means that I<CHARNAME> doesn't have to be so precisely specified. | |
244 | Upper/lower case doesn't matter (except with scripts as mentioned above), nor | |
245 | do any underscores, and the only hyphens that matter are those at the | |
246 | beginning or end of a word in the name (with one exception: the hyphen in | |
247 | U+1180 C<HANGUL JUNGSEONG O-E> does matter). | |
248 | Also, blanks not adjacent to hyphens don't matter. | |
249 | The official Unicode names are quite variable as to where they use hyphens | |
250 | versus spaces to separate word-like units, and this option allows you to not | |
251 | have to care as much. | |
252 | The reason non-medial hyphens matter is because of cases like | |
253 | U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>. | |
254 | The hyphen here is significant, as is the space before it, and so both must be | |
255 | included. | |
256 | ||
257 | C<:loose> slows down look-ups by a factor of 2 to 3 versus | |
258 | C<:full>, but the trade-off may be worth it to you. Each individual look-up | |
259 | takes very little time, and the results are cached, so the speed difference | |
260 | would become a factor only in programs that do look-ups of many different | |
67db75e3 KW |
261 | spellings, and probably only when those look-ups are through C<vianame()> and |
262 | C<string_vianame()>, since C<\N{...}> look-ups are done at compile time. | |
38f4139d | 263 | |
5ffe0e96 | 264 | =head1 ALIASES |
423cee85 | 265 | |
7620cb10 KW |
266 | Starting in Unicode 6.1 and Perl v5.16, Unicode defines many abbreviations and |
267 | names that were formerly Perl extensions, and some additional ones that Perl | |
268 | did not previously accept. The list is getting too long to reproduce here, | |
269 | but you can get the complete list from the Unicode web site: | |
270 | L<http://www.unicode.org/Public/UNIDATA/NameAliases.txt>. | |
271 | ||
272 | Earlier versions of Perl accepted almost all the 6.1 names. These were most | |
273 | extensively documented in the v5.14 version of this pod: | |
274 | L<http://perldoc.perl.org/5.14.0/charnames.html#ALIASES>. | |
16036bcd | 275 | |
35c0985d MB |
276 | =head1 CUSTOM ALIASES |
277 | ||
1f31fcd4 KW |
278 | You can add customized aliases to standard (C<:full>) Unicode naming |
279 | conventions. The aliases override any standard definitions, so, if | |
da9dec57 KW |
280 | you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to |
281 | mean C<"B">, etc. | |
55bc7d3c | 282 | |
bde9e88d | 283 | Aliases must begin with a character that is alphabetic. After that, each may |
558de9fa | 284 | contain any combination of word (C<\w>) characters, SPACE (U+0020), |
754e15cf KW |
285 | HYPHEN-MINUS (U+002D), LEFT PARENTHESIS (U+0028), and RIGHT PARENTHESIS |
286 | (U+0029). These last two should never have been allowed | |
287 | in names, and are retained for backwards compatibility only, and may be | |
bde9e88d KW |
288 | deprecated and removed in future releases of Perl, so don't use them for new |
289 | names. (More precisely, the first character of a name you specify must be | |
290 | something that matches all of C<\p{ID_Start}>, C<\p{Alphabetic}>, and | |
291 | C<\p{Gc=Letter}>. This makes sure it is what any reasonable person would view | |
558de9fa KW |
292 | as an alphabetic character. And, the continuation characters that match C<\w> |
293 | must also match C<\p{ID_Continue}>.) Starting with Perl v5.18, any Unicode | |
bde9e88d KW |
294 | characters meeting the above criteria may be used; prior to that only |
295 | Latin1-range characters were acceptable. | |
e5432b89 | 296 | |
38f4139d KW |
297 | An alias can map to either an official Unicode character name (not a loose |
298 | matched name) or to a | |
e5432b89 KW |
299 | numeric code point (ordinal). The latter is useful for assigning names |
300 | to code points in Unicode private use areas such as U+E000 through | |
f12d74c0 | 301 | U+F8FF. |
055bf491 | 302 | A numeric code point must be a non-negative integer, or a string beginning |
f12d74c0 KW |
303 | with C<"U+"> or C<"0x"> with the remainder considered to be a |
304 | hexadecimal integer. A literal numeric constant must be unsigned; it | |
305 | will be interpreted as hex if it has a leading zero or contains | |
306 | non-decimal hex digits; otherwise it will be interpreted as decimal. | |
22bd7dd2 KW |
307 | If it begins with C<"U+">, it is interpreted as the Unicode code point; |
308 | otherwise it is interpreted as native. (Only code points below 256 can | |
309 | differ between Unicode and native.) Thus C<U+41> is always the Latin letter | |
310 | "A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms. | |
232cbbee | 311 | |
da9dec57 | 312 | Aliases are added either by the use of anonymous hashes: |
35c0985d | 313 | |
da9dec57 | 314 | use charnames ":alias" => { |
35c0985d | 315 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
232cbbee | 316 | mychar1 => 0xE8000, |
35c0985d MB |
317 | }; |
318 | my $str = "\N{e_ACUTE}"; | |
319 | ||
da9dec57 | 320 | or by using a file containing aliases: |
35c0985d | 321 | |
da9dec57 | 322 | use charnames ":alias" => "pro"; |
35c0985d | 323 | |
8ebef31d | 324 | This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This |
da9dec57 | 325 | file should return a list in plain perl: |
35c0985d MB |
326 | |
327 | ( | |
328 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", | |
329 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", | |
330 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", | |
331 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", | |
332 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", | |
333 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", | |
334 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", | |
f12d74c0 | 335 | mychar2 => "U+E8001", |
35c0985d MB |
336 | ); |
337 | ||
da9dec57 KW |
338 | Both these methods insert C<":full"> automatically as the first argument (if no |
339 | other argument is given), and you can give the C<":full"> explicitly as | |
340 | well, like | |
35c0985d | 341 | |
da9dec57 | 342 | use charnames ":full", ":alias" => "pro"; |
35c0985d | 343 | |
38f4139d KW |
344 | C<":loose"> has no effect with these. Input names must match exactly, using |
345 | C<":full"> rules. | |
346 | ||
14aeae98 | 347 | Also, both these methods currently allow only single characters to be named. |
8ebef31d KW |
348 | To name a sequence of characters, use a |
349 | L<custom translator|/CUSTOM TRANSLATORS> (described below). | |
350 | ||
228e8c7b KW |
351 | =head1 charnames::string_vianame(I<name>) |
352 | ||
353 | This is a runtime equivalent to C<\N{...}>. I<name> can be any expression | |
354 | that evaluates to a name accepted by C<\N{...}> under the L<C<:full> | |
355 | option|/DESCRIPTION> to C<charnames>. In addition, any other options for the | |
356 | controlling C<"use charnames"> in the same scope apply, like C<:loose> or any | |
357 | L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM | |
358 | ALIASES> you may have defined. | |
359 | ||
0fe83d7d KW |
360 | The only differences are due to the fact that C<string_vianame> is run-time |
361 | and C<\N{}> is compile time. You can't interpolate inside a C<\N{}>, (so | |
362 | C<\N{$variable}> doesn't work); and if the input name is unknown, | |
363 | C<string_vianame> returns C<undef> instead of it being a syntax error. | |
228e8c7b KW |
364 | |
365 | =head1 charnames::vianame(I<name>) | |
366 | ||
367 | This is similar to C<string_vianame>. The main difference is that under most | |
2f8114fb | 368 | circumstances, C<vianame> returns an ordinal code |
228e8c7b KW |
369 | point, whereas C<string_vianame> returns a string. For example, |
370 | ||
371 | printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); | |
372 | ||
373 | prints "U+2722". | |
374 | ||
375 | This leads to the other two differences. Since a single code point is | |
376 | returned, the function can't handle named character sequences, as these are | |
377 | composed of multiple characters (it returns C<undef> for these). And, the code | |
378 | point can be that of any | |
379 | character, even ones that aren't legal under the C<S<use bytes>> pragma. | |
380 | ||
381 | See L</BUGS> for the circumstances in which the behavior differs | |
382 | from that described above. | |
383 | ||
da9dec57 | 384 | =head1 charnames::viacode(I<code>) |
b177ca84 JF |
385 | |
386 | Returns the full name of the character indicated by the numeric code. | |
da9dec57 | 387 | For example, |
b177ca84 JF |
388 | |
389 | print charnames::viacode(0x2722); | |
390 | ||
391 | prints "FOUR TEARDROP-SPOKED ASTERISK". | |
392 | ||
f6067adc KW |
393 | The name returned is the "best" (defined below) official name or alias |
394 | for the code point, if | |
ffec6758 KW |
395 | available; otherwise your custom alias for it, if defined; otherwise C<undef>. |
396 | This means that your alias will only be returned for code points that don't | |
397 | have an official Unicode name (nor alias) such as private use code points. | |
7620cb10 | 398 | |
da9dec57 KW |
399 | If you define more than one name for the code point, it is indeterminate |
400 | which one will be returned. | |
401 | ||
ffec6758 | 402 | As mentioned, the function returns C<undef> if no name is known for the code |
67db75e3 | 403 | point. In Unicode the proper name for these is the empty string, which |
da9dec57 KW |
404 | C<undef> stringifies to. (If you ask for a code point past the legal |
405 | Unicode maximum of U+10FFFF that you haven't assigned an alias to, you | |
f12d74c0 KW |
406 | get C<undef> plus a warning.) |
407 | ||
1f3b4888 | 408 | The input number must be a non-negative integer, or a string beginning |
f12d74c0 KW |
409 | with C<"U+"> or C<"0x"> with the remainder considered to be a |
410 | hexadecimal integer. A literal numeric constant must be unsigned; it | |
411 | will be interpreted as hex if it has a leading zero or contains | |
412 | non-decimal hex digits; otherwise it will be interpreted as decimal. | |
22bd7dd2 KW |
413 | If it begins with C<"U+">, it is interpreted as the Unicode code point; |
414 | otherwise it is interpreted as native. (Only code points below 256 can | |
415 | differ between Unicode and native.) Thus C<U+41> is always the Latin letter | |
416 | "A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms. | |
daf0d493 | 417 | |
f6067adc KW |
418 | As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names |
419 | (synonyms or aliases) for some code points, most of which were already | |
420 | available as Perl extensions. All these are accepted by C<\N{...}> and the | |
421 | other functions in this module, but C<viacode> has to choose which one | |
422 | name to return for a given input code point, so it returns the "best" name. | |
423 | To understand how this works, it is helpful to know more about the Unicode | |
424 | name properties. All code points actually have only a single name, which | |
425 | (starting in Unicode 2.0) can never change once a character has been assigned | |
426 | to the code point. But mistakes have been made in assigning names, for | |
427 | example sometimes a clerical error was made during the publishing of the | |
428 | Standard which caused words to be misspelled, and there was no way to correct | |
429 | those. The Name_Alias property was eventually created to handle these | |
430 | situations. If a name was wrong, a corrected synonym would be published for | |
431 | it, using Name_Alias. C<viacode> will return that corrected synonym as the | |
432 | "best" name for a code point. (It is even possible, though it hasn't happened | |
433 | yet, that the correction itself will need to be corrected, and so another | |
434 | Name_Alias can be created for that code point; C<viacode> will return the | |
435 | most recent correction.) | |
436 | ||
437 | The Unicode name for each of the control characters (such as LINE FEED) is the | |
438 | empty string. However almost all had names assigned by other standards, such | |
439 | as the ASCII Standard, or were in common use. C<viacode> returns these names | |
440 | as the "best" ones available. Unicode 6.1 has created Name_Aliases for each | |
441 | of them, including alternate names, like NEW LINE. C<viacode> uses the | |
442 | original name, "LINE FEED" in preference to the alternate. Similarly the | |
443 | name returned for U+FEFF is "ZERO WIDTH NO-BREAK SPACE", not "BYTE ORDER | |
444 | MARK". | |
445 | ||
446 | Until Unicode 6.1, the 4 control characters U+0080, U+0081, U+0084, and U+0099 | |
447 | had neither names nor aliases. | |
448 | To preserve backwards compatibility, any alias you define for these code | |
449 | points will be returned by this function, in preference to the official name. | |
450 | ||
451 | Some code points also have abbreviated names, such as "LF" or "NL". | |
452 | C<viacode> never returns these. | |
453 | ||
454 | Because a name correction may be added in future Unicode releases, the name | |
455 | that C<viacode> returns may change as a result. This is a rare event, but it | |
456 | does happen. | |
274085e3 | 457 | |
5ffe0e96 | 458 | =head1 CUSTOM TRANSLATORS |
52ea3e69 | 459 | |
5ffe0e96 | 460 | The mechanism of translation of C<\N{...}> escapes is general and not |
5ef88e32 | 461 | hardwired into F<charnames.pm>. A module can install custom |
5ffe0e96 MB |
462 | translations (inside the scope which C<use>s the module) with the |
463 | following magic incantation: | |
52ea3e69 | 464 | |
5ffe0e96 | 465 | sub import { |
52fb7278 KW |
466 | shift; |
467 | $^H{charnames} = \&translator; | |
5ffe0e96 | 468 | } |
52ea3e69 | 469 | |
da9dec57 | 470 | Here translator() is a subroutine which takes I<CHARNAME> as an |
5ffe0e96 | 471 | argument, and returns text to insert into the string instead of the |
5ef88e32 KW |
472 | C<\N{I<CHARNAME>}> escape. |
473 | ||
474 | This is the only way you can create a custom named sequence of code points. | |
475 | ||
476 | Since the text to insert should be different | |
5ffe0e96 MB |
477 | in C<bytes> mode and out of it, the function should check the current |
478 | state of C<bytes>-flag as in: | |
52ea3e69 | 479 | |
52fb7278 | 480 | use bytes (); # for $bytes::hint_bits |
5ffe0e96 | 481 | sub translator { |
52fb7278 KW |
482 | if ($^H & $bytes::hint_bits) { |
483 | return bytes_translator(@_); | |
484 | } | |
485 | else { | |
486 | return utf8_translator(@_); | |
487 | } | |
5ffe0e96 | 488 | } |
52ea3e69 | 489 | |
da9dec57 | 490 | See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>. |
f0175764 | 491 | |
9e808deb KW |
492 | Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be |
493 | overridden as well. | |
1f31fcd4 | 494 | |
423cee85 JH |
495 | =head1 BUGS |
496 | ||
14aeae98 | 497 | vianame() normally returns an ordinal code point, but when the input name is of |
8ebef31d KW |
498 | the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is |
499 | in effect and the character won't fit into a byte, it returns C<undef> and | |
500 | raises a warning. | |
55bc7d3c | 501 | |
f12d74c0 KW |
502 | Since evaluation of the translation function (see L</CUSTOM |
503 | TRANSLATORS>) happens in the middle of compilation (of a string | |
504 | literal), the translation function should not do any C<eval>s or | |
505 | C<require>s. This restriction should be lifted (but is low priority) in | |
506 | a future version of Perl. | |
423cee85 JH |
507 | |
508 | =cut | |
0eacc33e | 509 | |
52fb7278 | 510 | # ex: set ts=8 sts=2 sw=2 et: |