Commit | Line | Data |
---|---|---|
423cee85 | 1 | package charnames; |
b177ca84 JF |
2 | use strict; |
3 | use warnings; | |
e7a078a0 | 4 | our $VERSION = '1.28'; |
a03f0b9f | 5 | use unicore::Name; # mktables-generated algorithmically-defined names |
e7a078a0 | 6 | use _charnames (); # The submodule for this where most of the work gets done |
b75c8c73 | 7 | |
52fb7278 | 8 | use bytes (); # for $bytes::hint_bits |
123148a1 | 9 | use re "/aa"; # Everything in here should be ASCII |
423cee85 | 10 | |
38f4139d | 11 | # Translate between Unicode character names and their code points. |
e7a078a0 KW |
12 | # This is a wrapper around the submodule C<_charnames>. This design allows |
13 | # C<_charnames> to be autoloaded to enable use of \N{...}, but requires this | |
14 | # module to be explicitly requested for the functions API. | |
b177ca84 | 15 | |
889a6fe0 | 16 | $Carp::Internal{ (__PACKAGE__) } = 1; |
63098191 | 17 | |
b177ca84 JF |
18 | sub import
19 | { | |
20 | shift; ## discard the class name; the remaining arguments are forwarded to _charnames->import | |
e7a078a0 KW |
21 | _charnames->import(@_); |
22 | } | |
423cee85 | 23 | |
84374e30 KW |
24 | # Cache of already looked-up values. This is set to only contain
25 | # official values, and user aliases can't override them, so scoping is | |
26 | # not an issue. NOTE(review): nothing visible in this file reads or writes %viacode; viacode() below simply delegates to _charnames::viacode() -- confirm whether this declaration is still needed. | |
27 | my %viacode; | |
63098191 KW |
28 |
29 | sub viacode { | |
e7a078a0 KW |
30 | return _charnames::viacode(@_);
31 | } | |
daf0d493 JH |
32 | |
33 | sub vianame |
34 | { | |
35c0985d | 35 | if (@_ != 1) { |
e7a078a0 | 36 | _charnames::carp "charnames::vianame() expects one name argument"; |
35c0985d MB |
37 | return ()
38 | } | |
daf0d493 | 39 | |
63098191 KW |
40 | # Looks up the character name and returns its ordinal (code point) if
41 | # found, undef otherwise. Exception: a name of the form U+XXXX (hex digits) yields the character itself (chr) rather than the ordinal; see below. | |
daf0d493 | 42 | |
63098191 | 43 | my $arg = shift; |
dbc0d4f2 | 44 | |
63098191 | 45 | if ($arg =~ /^U\+([0-9a-fA-F]+)$/) { |
4e2cda5d | 46 | |
fb121860 KW |
47 | # khw claims that this is poor interface design. The function should
48 | # return either an ord or a chr for all inputs; not be bipolar. But | |
49 | # can't change it because of backward compatibility. New code can use | |
50 | # string_vianame() instead. | |
5a7fb30a KW |
51 | my $ord = CORE::hex $1;
52 | return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits); | |
e7a078a0 | 53 | _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord); |
5a7fb30a | 54 | return; |
63098191 | 55 | } |
daf0d493 | 56 | |
fb121860 KW |
57 | # The first 1 arg means wants an ord returned; the second that we are in
58 | # runtime, and this is the first level routine called from the user | |
e7a078a0 | 59 | return _charnames::lookup_name($arg, 1, 1); |
35c0985d | 60 | } # vianame |
b177ca84 | 61 | |
fb121860 KW |
62 | sub string_vianame {
63 | ||
64 | # Looks up the character name and returns its string representation if | |
65 | # found, undef otherwise. A name of the form U+XXXX (hex digits) is converted directly with chr instead of being looked up. | |
66 | ||
67 | if (@_ != 1) { | |
e7a078a0 | 68 | _charnames::carp "charnames::string_vianame() expects one name argument"; |
fb121860 KW |
69 | return;
70 | } | |
71 | ||
72 | my $arg = shift; | |
73 | ||
74 | if ($arg =~ /^U\+([0-9a-fA-F]+)$/) { | |
75 | ||
76 | my $ord = CORE::hex $1; | |
77 | return chr $ord if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits); | |
78 | ||
e7a078a0 | 79 | _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord); |
fb121860 KW |
80 | return;
81 | } | |
82 | ||
83 | # The 0 arg means wants a string returned; the 1 arg means that we are in | |
84 | # runtime, and this is the first level routine called from the user | |
e7a078a0 | 85 | return _charnames::lookup_name($arg, 0, 1); |
fb121860 KW |
86 | } # string_vianame
87 | ||
423cee85 JH |
88 | 1; |
89 | __END__ | |
90 | ||
91 | =head1 NAME | |
92 | ||
fb121860 | 93 | charnames - access to Unicode character names and named character sequences; also define character names |
423cee85 JH |
94 | |
95 | =head1 SYNOPSIS | |
96 | ||
bcc08981 KW |
97 | use charnames ':full'; |
98 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; | |
99 | print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}", | |
100 | " is an officially named sequence of two Unicode characters\n"; | |
101 | ||
38f4139d KW |
102 | use charnames ':loose'; |
103 | print "\N{Greek small-letter sigma}", | |
104 | "can be used to ignore case, underscores, most blanks, ", | |
105 | "and when you aren't sure if the official name has hyphens\n"; | |
106 | ||
bcc08981 KW |
107 | use charnames ':short'; |
108 | print "\N{greek:Sigma} is an upper-case sigma.\n"; | |
109 | ||
110 | use charnames qw(cyrillic greek); | |
111 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; | |
112 | ||
113 | use charnames ":full", ":alias" => { | |
114 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", | |
115 | mychar => 0xE800, # Private use area | |
116 | }; | |
117 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; | |
14aeae98 | 118 | print "\N{mychar} allows me to name private use characters.\n"; |
bcc08981 KW |
119 | |
120 | use charnames (); | |
121 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" | |
122 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints | |
123 | # "10330" | |
124 | print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on | |
125 | # ASCII platforms; | |
126 | # 193 on EBCDIC | |
127 | print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A" | |
b177ca84 | 128 | |
423cee85 JH |
129 | =head1 DESCRIPTION |
130 | ||
da9dec57 | 131 | Pragma C<use charnames> is used to gain access to the names of the |
fb121860 KW |
132 | Unicode characters and named character sequences, and to allow you to define |
133 | your own character and character sequence names. | |
134 | ||
135 | All forms of the pragma enable use of the following 3 functions: | |
136 | ||
137 | =over | |
138 | ||
139 | =item * | |
140 | ||
141 | L</charnames::string_vianame(I<name>)> for run-time lookup of | |
142 | either a character name or a named character sequence, returning its string | |
143 | representation | |
144 | ||
145 | =item * | |
146 | ||
147 | L</charnames::vianame(I<name>)> for run-time lookup of a | |
148 | character name (but not a named character sequence) to get its ordinal value | |
149 | (code point) | |
da9dec57 | 150 | |
fb121860 | 151 | =item * |
da9dec57 | 152 | |
fb121860 KW |
153 | L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its |
154 | Unicode name. | |
155 | ||
156 | =back | |
157 | ||
fbb93542 KW |
158 | Starting in Perl 5.16, any occurrence of C<\N{I<CHARNAME>}> sequences |
159 | in a double-quotish string automatically loads this module with arguments | |
160 | C<:full> and C<:short> (described below) if it hasn't already been loaded with | |
161 | different arguments, in order to compile the named Unicode character into | |
162 | position in the string. Prior to 5.16, an explicit S<C<use charnames>> was | |
163 | required to enable this usage. (However, prior to 5.16, the form C<S<"use | |
164 | charnames ();">> did not enable C<\N{I<CHARNAME>}>.) | |
da9dec57 KW |
165 | |
166 | Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number, | |
fbb93542 KW |
167 | also inserts a character into a string. |
168 | The character it inserts is the one whose code point | |
da9dec57 | 169 | (ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is |
fbb93542 KW |
170 | the Unicode (white background, black foreground) smiley face |
171 | equivalent to C<"\N{WHITE SMILING FACE}">. | |
d9f23c72 | 172 | Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character |
8ebef31d KW |
173 | name, when the I<...> is a number (or comma separated pair of numbers)
174 | (see L<perlreref/QUANTIFIERS>), and is not related to this pragma. | |
da9dec57 | 175 | |
38f4139d KW |
176 | The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>, |
177 | script names and L<customized aliases|/CUSTOM ALIASES>. | |
178 | ||
179 | If C<:full> is present, for expansion of | |
da9dec57 | 180 | C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of |
38f4139d KW |
181 | standard Unicode character names. |
182 | ||
183 | C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less | |
184 | precisely specified. Details are in L</LOOSE MATCHES>. | |
185 | ||
186 | If C<:short> is present, and | |
da9dec57 | 187 | I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up |
14aeae98 KW |
188 | as a letter in script I<SCRIPT>, as described in the next paragraph. |
189 | Or, if C<use charnames> is used | |
da9dec57 KW |
190 | with script name arguments, then for C<\N{I<CHARNAME>}> the name |
191 | I<CHARNAME> is looked up as a letter in the given scripts (in the | |
16036bcd KW |
192 | specified order). Customized aliases can override these, and are explained in |
193 | L</CUSTOM ALIASES>. | |
423cee85 | 194 | |
da9dec57 | 195 | For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME> |
14aeae98 | 196 | this pragma looks in the table of standard Unicode names for the names |
423cee85 JH |
197 | |
198 | SCRIPTNAME CAPITAL LETTER CHARNAME | |
199 | SCRIPTNAME SMALL LETTER CHARNAME | |
200 | SCRIPTNAME LETTER CHARNAME | |
201 | ||
14aeae98 | 202 | If I<CHARNAME> is all lowercase, |
daf0d493 | 203 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
14aeae98 | 204 | is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all |
38f4139d KW |
205 | uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE |
206 | MATCHES> rules if C<:loose> is also specified; strict otherwise. | |
daf0d493 | 207 | |
da9dec57 KW |
208 | Note that C<\N{...}> is compile-time; it's a special form of string |
209 | constant used inside double-quotish strings; this means that you cannot | |
4e2cda5d | 210 | use variables inside the C<\N{...}>. If you want similar run-time |
fb121860 KW |
211 | functionality, use |
212 | L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>. | |
423cee85 | 213 | |
301a3cda | 214 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
da9dec57 KW |
215 | there are no official Unicode names but you can use instead the ISO 6429 |
216 | names (LINE FEED, ESCAPE, and so forth, and their abbreviations, LF, | |
1f31fcd4 | 217 | ESC, ...). In Unicode 3.2 (as of Perl 5.8) some naming changes took |
b59ae8bb | 218 | place, and ISO 6429 was updated, see L</ALIASES>. Since Unicode 6.0, it |
d9f23c72 KW |
219 | is deprecated to use C<BELL>. Instead use C<ALERT> (but C<BEL> will continue |
220 | to work). | |
301a3cda | 221 | |
e5432b89 KW |
222 | If the input name is unknown, C<\N{NAME}> raises a warning and |
223 | substitutes the Unicode REPLACEMENT CHARACTER (U+FFFD). | |
224 | ||
8ebef31d KW |
225 | For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the |
226 | input name is that of a character that won't fit into a byte (i.e., whose | |
227 | ordinal is above 255). | |
e5432b89 | 228 | |
da9dec57 KW |
229 | Otherwise, any string that includes a C<\N{I<charname>}> or |
230 | C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see | |
231 | L<perlunicode/Byte and Character Semantics>). | |
232 | ||
38f4139d KW |
233 | =head1 LOOSE MATCHES |
234 | ||
235 | By specifying C<:loose>, Unicode's L<loose character name | |
5ef88e32 | 236 | matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are |
38f4139d KW |
237 | selected instead of the strict exact match used otherwise. |
238 | That means that I<CHARNAME> doesn't have to be so precisely specified. | |
239 | Upper/lower case doesn't matter (except with scripts as mentioned above), nor | |
240 | do any underscores, and the only hyphens that matter are those at the | |
241 | beginning or end of a word in the name (with one exception: the hyphen in | |
242 | U+1180 C<HANGUL JUNGSEONG O-E> does matter). | |
243 | Also, blanks not adjacent to hyphens don't matter. | |
244 | The official Unicode names are quite variable as to where they use hyphens | |
245 | versus spaces to separate word-like units, and this option allows you to not | |
246 | have to care as much. | |
247 | The reason non-medial hyphens matter is because of cases like | |
248 | U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>. | |
249 | The hyphen here is significant, as is the space before it, and so both must be | |
250 | included. | |
251 | ||
252 | C<:loose> slows down look-ups by a factor of 2 to 3 versus | |
253 | C<:full>, but the trade-off may be worth it to you. Each individual look-up | |
254 | takes very little time, and the results are cached, so the speed difference | |
255 | would become a factor only in programs that do look-ups of many different | |
256 | spellings, and probably only when those look-ups are through vianame() and | |
257 | string_vianame(), since C<\N{...}> look-ups are done at compile time. | |
258 | ||
5ffe0e96 | 259 | =head1 ALIASES |
423cee85 | 260 | |
14aeae98 KW |
261 | A few aliases have been defined for convenience; instead of having |
262 | to use the official names, | |
423cee85 | 263 | |
5ffe0e96 MB |
264 | LINE FEED (LF) |
265 | FORM FEED (FF) | |
266 | CARRIAGE RETURN (CR) | |
267 | NEXT LINE (NEL) | |
423cee85 | 268 | |
e5432b89 | 269 | (yes, with parentheses), one can use |
d5448623 | 270 | |
5ffe0e96 MB |
271 | LINE FEED |
272 | FORM FEED | |
273 | CARRIAGE RETURN | |
274 | NEXT LINE | |
275 | LF | |
276 | FF | |
277 | CR | |
278 | NEL | |
279 | ||
16036bcd KW |
280 | All the other standard abbreviations for the controls, such as C<ACK> for |
281 | C<ACKNOWLEDGE> also can be used. | |
282 | ||
5ffe0e96 MB |
283 | One can also use |
284 | ||
285 | BYTE ORDER MARK | |
286 | BOM | |
287 | ||
16036bcd KW |
288 | and these abbreviations |
289 | ||
290 | Abbreviation Full Name | |
291 | ||
292 | CGJ COMBINING GRAPHEME JOINER | |
293 | FVS1 MONGOLIAN FREE VARIATION SELECTOR ONE | |
294 | FVS2 MONGOLIAN FREE VARIATION SELECTOR TWO | |
295 | FVS3 MONGOLIAN FREE VARIATION SELECTOR THREE | |
296 | LRE LEFT-TO-RIGHT EMBEDDING | |
297 | LRM LEFT-TO-RIGHT MARK | |
298 | LRO LEFT-TO-RIGHT OVERRIDE | |
299 | MMSP MEDIUM MATHEMATICAL SPACE | |
300 | MVS MONGOLIAN VOWEL SEPARATOR | |
301 | NBSP NO-BREAK SPACE | |
302 | NNBSP NARROW NO-BREAK SPACE | |
303 | PDF POP DIRECTIONAL FORMATTING | |
304 | RLE RIGHT-TO-LEFT EMBEDDING | |
305 | RLM RIGHT-TO-LEFT MARK | |
306 | RLO RIGHT-TO-LEFT OVERRIDE | |
307 | SHY SOFT HYPHEN | |
308 | VS1 VARIATION SELECTOR-1 | |
309 | . | |
310 | . | |
311 | . | |
312 | VS256 VARIATION SELECTOR-256 | |
313 | WJ WORD JOINER | |
314 | ZWJ ZERO WIDTH JOINER | |
315 | ZWNJ ZERO WIDTH NON-JOINER | |
316 | ZWSP ZERO WIDTH SPACE | |
5ffe0e96 MB |
317 | |
318 | For backward compatibility one can use the old names for | |
319 | certain C0 and C1 controls | |
320 | ||
321 | old new | |
322 | ||
5ffe0e96 MB |
323 | FILE SEPARATOR INFORMATION SEPARATOR FOUR |
324 | GROUP SEPARATOR INFORMATION SEPARATOR THREE | |
16036bcd KW |
325 | HORIZONTAL TABULATION CHARACTER TABULATION |
326 | HORIZONTAL TABULATION SET CHARACTER TABULATION SET | |
327 | HORIZONTAL TABULATION WITH JUSTIFICATION CHARACTER TABULATION | |
328 | WITH JUSTIFICATION | |
5ffe0e96 MB |
329 | PARTIAL LINE DOWN PARTIAL LINE FORWARD |
330 | PARTIAL LINE UP PARTIAL LINE BACKWARD | |
16036bcd KW |
331 | RECORD SEPARATOR INFORMATION SEPARATOR TWO |
332 | REVERSE INDEX REVERSE LINE FEED | |
333 | UNIT SEPARATOR INFORMATION SEPARATOR ONE | |
334 | VERTICAL TABULATION LINE TABULATION | |
335 | VERTICAL TABULATION SET LINE TABULATION SET | |
5ffe0e96 MB |
336 | |
337 | but the old names in addition to giving the character | |
338 | will also give a warning about being deprecated. | |
423cee85 | 339 | |
16036bcd KW |
340 | And finally, certain published variants are usable, including some for |
341 | controls that have no Unicode names: | |
342 | ||
1f31fcd4 KW |
343 | name character |
344 | ||
52fb7278 | 345 | END OF PROTECTED AREA END OF GUARDED AREA, U+0097 |
1f31fcd4 KW |
346 | HIGH OCTET PRESET U+0081 |
347 | HOP U+0081 | |
348 | IND U+0084 | |
349 | INDEX U+0084 | |
350 | PAD U+0080 | |
351 | PADDING CHARACTER U+0080 | |
352 | PRIVATE USE 1 PRIVATE USE ONE, U+0091 | |
353 | PRIVATE USE 2 PRIVATE USE TWO, U+0092 | |
354 | SGC U+0099 | |
355 | SINGLE GRAPHIC CHARACTER INTRODUCER U+0099 | |
356 | SINGLE-SHIFT 2 SINGLE SHIFT TWO, U+008E | |
357 | SINGLE-SHIFT 3 SINGLE SHIFT THREE, U+008F | |
358 | START OF PROTECTED AREA START OF GUARDED AREA, U+0096 | |
16036bcd | 359 | |
35c0985d MB |
360 | =head1 CUSTOM ALIASES |
361 | ||
1f31fcd4 KW |
362 | You can add customized aliases to standard (C<:full>) Unicode naming |
363 | conventions. The aliases override any standard definitions, so, if | |
da9dec57 KW |
364 | you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to |
365 | mean C<"B">, etc. | |
55bc7d3c KW |
366 | |
367 | Note that an alias should not be something that is a legal curly | |
368 | brace-enclosed quantifier (see L<perlreref/QUANTIFIERS>). For example | |
e5432b89 KW |
369 | C<\N{123}> means to match 123 non-newline characters, and is not treated as a |
370 | charnames alias. Aliases are discouraged from beginning with anything | |
371 | other than an alphabetic character and from containing anything other | |
372 | than alphanumerics, spaces, dashes, parentheses, and underscores. | |
373 | Currently they must be ASCII. | |
374 | ||
38f4139d KW |
375 | An alias can map to either an official Unicode character name (not a loose |
376 | matched name) or to a | |
e5432b89 KW |
377 | numeric code point (ordinal). The latter is useful for assigning names |
378 | to code points in Unicode private use areas such as U+E000 through | |
f12d74c0 KW |
379 | U+F8FF. |
380 | A numeric code point must be a non-negative integer or a string beginning | |
381 | with C<"U+"> or C<"0x"> with the remainder considered to be a | |
382 | hexadecimal integer. A literal numeric constant must be unsigned; it | |
383 | will be interpreted as hex if it has a leading zero or contains | |
384 | non-decimal hex digits; otherwise it will be interpreted as decimal. | |
232cbbee | 385 | |
da9dec57 | 386 | Aliases are added either by the use of anonymous hashes: |
35c0985d | 387 | |
da9dec57 | 388 | use charnames ":alias" => { |
35c0985d | 389 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
232cbbee | 390 | mychar1 => 0xE8000, |
35c0985d MB |
391 | }; |
392 | my $str = "\N{e_ACUTE}"; | |
393 | ||
da9dec57 | 394 | or by using a file containing aliases: |
35c0985d | 395 | |
da9dec57 | 396 | use charnames ":alias" => "pro"; |
35c0985d | 397 | |
8ebef31d | 398 | This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This |
da9dec57 | 399 | file should return a list in plain perl: |
35c0985d MB |
400 | |
401 | ( | |
402 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", | |
403 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", | |
404 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", | |
405 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", | |
406 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", | |
407 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", | |
408 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", | |
f12d74c0 | 409 | mychar2 => "U+E8001", |
35c0985d MB |
410 | ); |
411 | ||
da9dec57 KW |
412 | Both these methods insert C<":full"> automatically as the first argument (if no |
413 | other argument is given), and you can give the C<":full"> explicitly as | |
414 | well, like | |
35c0985d | 415 | |
da9dec57 | 416 | use charnames ":full", ":alias" => "pro"; |
35c0985d | 417 | |
38f4139d KW |
418 | C<":loose"> has no effect with these. Input names must match exactly, using |
419 | C<":full"> rules. | |
420 | ||
14aeae98 | 421 | Also, both these methods currently allow only single characters to be named. |
8ebef31d KW |
422 | To name a sequence of characters, use a |
423 | L<custom translator|/CUSTOM TRANSLATORS> (described below). | |
424 | ||
da9dec57 | 425 | =head1 charnames::viacode(I<code>) |
b177ca84 JF |
426 | |
427 | Returns the full name of the character indicated by the numeric code. | |
da9dec57 | 428 | For example, |
b177ca84 JF |
429 | |
430 | print charnames::viacode(0x2722); | |
431 | ||
432 | prints "FOUR TEARDROP-SPOKED ASTERISK". | |
433 | ||
232cbbee | 434 | The name returned is the official name for the code point, if |
8ebef31d | 435 | available; otherwise your custom alias for it. This means that your |
232cbbee | 436 | alias will only be returned for code points that don't have an official |
14aeae98 | 437 | Unicode name (nor a Unicode version 1 name), such as private use code |
232cbbee | 438 | points, and the 4 control characters U+0080, U+0081, U+0084, and U+0099. |
da9dec57 KW |
439 | If you define more than one name for the code point, it is indeterminate |
440 | which one will be returned. | |
441 | ||
442 | The function returns C<undef> if no name is known for the code point. | |
443 | In Unicode the proper name of these is the empty string, which | |
444 | C<undef> stringifies to. (If you ask for a code point past the legal | |
445 | Unicode maximum of U+10FFFF that you haven't assigned an alias to, you | |
f12d74c0 KW |
446 | get C<undef> plus a warning.) |
447 | ||
448 | The input number must be a non-negative integer or a string beginning | |
449 | with C<"U+"> or C<"0x"> with the remainder considered to be a | |
450 | hexadecimal integer. A literal numeric constant must be unsigned; it | |
451 | will be interpreted as hex if it has a leading zero or contains | |
452 | non-decimal hex digits; otherwise it will be interpreted as decimal. | |
daf0d493 | 453 | |
d9f23c72 | 454 | Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK |
274085e3 PN |
455 | SPACE", not "BYTE ORDER MARK". |
456 | ||
fb121860 | 457 | =head1 charnames::string_vianame(I<name>) |
daf0d493 | 458 | |
fb121860 KW |
459 | This is a runtime equivalent to C<\N{...}>. I<name> can be any expression |
460 | that evaluates to a name accepted by C<\N{...}> under the L<C<:full> | |
461 | option|/DESCRIPTION> to C<charnames>. In addition, any other options for the | |
38f4139d KW |
462 | controlling C<"use charnames"> in the same scope apply, like C<:loose> or any |
463 | L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM | |
464 | ALIASES> you may have defined. | |
daf0d493 | 465 | |
fb121860 KW |
466 | The only difference is that if the input name is unknown, C<string_vianame> |
467 | returns C<undef> instead of the REPLACEMENT CHARACTER and does not raise a | |
468 | warning message. | |
daf0d493 | 469 | |
fb121860 KW |
470 | =head1 charnames::vianame(I<name>) |
471 | ||
472 | This is similar to C<string_vianame>. The main difference is that under most | |
5ef88e32 | 473 | circumstances, vianame returns an ordinal code |
fb121860 | 474 | point, whereas C<string_vianame> returns a string. For example, |
daf0d493 | 475 | |
fb121860 | 476 | printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
b177ca84 | 477 | |
fb121860 | 478 | prints "U+2722". |
1f31fcd4 | 479 | |
fb121860 KW |
480 | This leads to the other two differences. Since a single code point is |
481 | returned, the function can't handle named character sequences, as these are | |
14aeae98 KW |
482 | composed of multiple characters (it returns C<undef> for these). And, the code
483 | point can be that of any | |
fb121860 | 484 | character, even ones that aren't legal under the C<S<use bytes>> pragma. |
b177ca84 | 485 | |
5ef88e32 KW |
486 | See L</BUGS> for the circumstances in which the behavior differs |
487 | from that described above. | |
488 | ||
5ffe0e96 | 489 | =head1 CUSTOM TRANSLATORS |
52ea3e69 | 490 | |
5ffe0e96 | 491 | The mechanism of translation of C<\N{...}> escapes is general and not |
5ef88e32 | 492 | hardwired into F<charnames.pm>. A module can install custom |
5ffe0e96 MB |
493 | translations (inside the scope which C<use>s the module) with the |
494 | following magic incantation: | |
52ea3e69 | 495 | |
5ffe0e96 | 496 | sub import { |
52fb7278 KW |
497 | shift; |
498 | $^H{charnames} = \&translator; | |
5ffe0e96 | 499 | } |
52ea3e69 | 500 | |
da9dec57 | 501 | Here translator() is a subroutine which takes I<CHARNAME> as an |
5ffe0e96 | 502 | argument, and returns text to insert into the string instead of the |
5ef88e32 KW |
503 | C<\N{I<CHARNAME>}> escape. |
504 | ||
505 | This is the only way you can create a custom named sequence of code points. | |
506 | ||
507 | Since the text to insert should be different | |
5ffe0e96 MB |
508 | in C<bytes> mode and out of it, the function should check the current |
509 | state of C<bytes>-flag as in: | |
52ea3e69 | 510 | |
52fb7278 | 511 | use bytes (); # for $bytes::hint_bits |
5ffe0e96 | 512 | sub translator { |
52fb7278 KW |
513 | if ($^H & $bytes::hint_bits) { |
514 | return bytes_translator(@_); | |
515 | } | |
516 | else { | |
517 | return utf8_translator(@_); | |
518 | } | |
5ffe0e96 | 519 | } |
52ea3e69 | 520 | |
da9dec57 | 521 | See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>. |
f0175764 | 522 | |
9e808deb KW |
523 | Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be |
524 | overridden as well. | |
1f31fcd4 | 525 | |
423cee85 JH |
526 | =head1 BUGS |
527 | ||
14aeae98 | 528 | vianame() normally returns an ordinal code point, but when the input name is of |
8ebef31d KW |
529 | the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is |
530 | in effect and the character won't fit into a byte, it returns C<undef> and | |
531 | raises a warning. | |
55bc7d3c | 532 | |
16036bcd KW |
533 | Names must be ASCII characters only, which means that you are out of luck if |
534 | you want to create aliases in a language where some or all the characters of | |
535 | the desired aliases are non-ASCII. | |
bee80e93 | 536 | |
f12d74c0 KW |
537 | Since evaluation of the translation function (see L</CUSTOM |
538 | TRANSLATORS>) happens in the middle of compilation (of a string | |
539 | literal), the translation function should not do any C<eval>s or | |
540 | C<require>s. This restriction should be lifted (but is low priority) in | |
541 | a future version of Perl. | |
423cee85 JH |
542 | |
543 | =cut | |
0eacc33e | 544 | |
52fb7278 | 545 | # ex: set ts=8 sts=2 sw=2 et: |