| 1 | package charnames; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | our $VERSION = '1.36'; |
| 5 | use unicore::Name; # mktables-generated algorithmically-defined names |
| 6 | use _charnames (); # The submodule for this where most of the work gets done |
| 7 | |
| 8 | use bytes (); # for $bytes::hint_bits |
| 9 | use re "/aa"; # Everything in here should be ASCII |
| 10 | |
| 11 | # Translate between Unicode character names and their code points. |
| 12 | # This is a wrapper around the submodule C<_charnames>. This design allows |
| 13 | # C<_charnames> to be autoloaded to enable use of \N{...}, but requires this |
| 14 | # module to be explicitly requested for the functions API. |
| 15 | |
# Register this package with Carp as "internal": Carp then skips frames that
# belong to it, so warnings raised from here are reported against the code
# that called into charnames rather than against this wrapper.
$Carp::Internal{ +__PACKAGE__ } = 1;
| 17 | |
# Pragma entry point, run for C<use charnames LIST>.
#
# The invocant (this class name) is discarded and the remaining arguments
# are forwarded, unchanged, to the import method of the _charnames
# submodule, whose result is returned.
sub import {
  shift @_;    # drop the class name; _charnames is always the receiver

  return _charnames->import(@_);
}
| 23 | |
# Returns the name of the code point given as the single argument.
#
# This is a thin wrapper: the arguments are handed unchanged to
# _charnames::viacode() and whatever that returns is passed back in the
# caller's context.  No cache is kept in this file.
#
# NOTE(review): keep this an ordinary nested call (not C<goto &sub>);
# _charnames::viacode presumably inspects its callers' frames to find
# lexically scoped aliases, so the call depth may matter -- confirm against
# _charnames.pm before restructuring.
sub viacode {
  return _charnames::viacode(@_);
}
| 32 | |
# Looks up a character name and returns its ordinal (code point) if found,
# undef otherwise.  Exactly one argument, the name, is expected; any other
# argument count produces a warning and an empty return.
sub vianame
{
  unless (@_ == 1) {
    _charnames::carp("charnames::vianame() expects one name argument");
    return ();
  }

  my ($name) = @_;

  if (my ($hex_digits) = $name =~ /^U\+([0-9a-fA-F]+)$/) {

    # khw claims that this is poor interface design.  The function should
    # return either an ord or a chr for all inputs, not switch between the
    # two.  But it can't be changed because of backward compatibility.  New
    # code can use string_vianame() instead.
    my $code_point = CORE::hex $hex_digits;

    # Only a character that won't fit in a byte, requested from a scope where
    # 'use bytes' is in effect (bit set in the caller's compile-time hints),
    # is refused.
    if ($code_point > 255 && ((caller 0)[8] & $bytes::hint_bits)) {
      _charnames::carp(
        _charnames::not_legal_use_bytes_msg($name, chr $code_point));
      return;
    }

    return chr $code_point;
  }

  # The first 1 means an ord is wanted; the second that we are at runtime and
  # that this is the first-level routine called by the user.
  return _charnames::lookup_name($name, 1, 1);
} # vianame
| 61 | |
# Looks up a character name and returns its string representation if found,
# undef otherwise.  Exactly one argument, the name, is expected; any other
# argument count produces a warning and an empty return.
sub string_vianame {

  unless (@_ == 1) {
    _charnames::carp("charnames::string_vianame() expects one name argument");
    return;
  }

  my ($name) = @_;

  # Anything not of the form U+hex is handed to the general name lookup.  The
  # 0 means a string is wanted; the 1 means that we are at runtime and that
  # this is the first-level routine called by the user.
  return _charnames::lookup_name($name, 0, 1)
    unless $name =~ /^U\+([0-9a-fA-F]+)$/;

  my $code_point = CORE::hex $1;

  # A character that won't fit in a byte is refused when the caller compiled
  # under 'use bytes' (bit set in the caller's compile-time hints).
  my $refused = $code_point > 255
             && ((caller 0)[8] & $bytes::hint_bits);
  return chr $code_point unless $refused;

  _charnames::carp(
    _charnames::not_legal_use_bytes_msg($name, chr $code_point));
  return;
} # string_vianame
| 87 | |
| 88 | 1; |
| 89 | __END__ |
| 90 | |
| 91 | =encoding utf8 |
| 92 | |
| 93 | =head1 NAME |
| 94 | |
| 95 | charnames - access to Unicode character names and named character sequences; also define character names |
| 96 | |
| 97 | =head1 SYNOPSIS |
| 98 | |
| 99 | use charnames ':full'; |
| 100 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
| 101 | print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}", |
| 102 | " is an officially named sequence of two Unicode characters\n"; |
| 103 | |
| 104 | use charnames ':loose'; |
| 105 | print "\N{Greek small-letter sigma}", |
| 106 | " can be used to ignore case, underscores, most blanks,", |
| 107 | " and when you aren't sure if the official name has hyphens\n"; |
| 108 | |
| 109 | use charnames ':short'; |
| 110 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
| 111 | |
| 112 | use charnames qw(cyrillic greek); |
| 113 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
| 114 | |
| 115 | use utf8; |
| 116 | use charnames ":full", ":alias" => { |
| 117 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
| 118 | mychar => 0xE8000, # Private use area |
| 119 | "自転車に乗る人" => "BICYCLIST" |
| 120 | }; |
| 121 | print "\N{e_ACUTE} is a small letter e with an acute.\n"; |
| 122 | print "\N{mychar} allows me to name private use characters.\n"; |
| 123 | print "And I can create synonyms in other languages,", |
| 124 | " such as \N{自転車に乗る人} for BICYCLIST (U+1F6B4)\n"; |
| 125 | |
| 126 | use charnames (); |
| 127 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
| 128 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints |
| 129 | # "10330" |
| 130 | print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on |
| 131 | # ASCII platforms; |
| 132 | # 193 on EBCDIC |
| 133 | print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A" |
| 134 | |
| 135 | =head1 DESCRIPTION |
| 136 | |
| 137 | Pragma C<use charnames> is used to gain access to the names of the |
| 138 | Unicode characters and named character sequences, and to allow you to define |
| 139 | your own character and character sequence names. |
| 140 | |
| 141 | All forms of the pragma enable use of the following 3 functions: |
| 142 | |
| 143 | =over |
| 144 | |
| 145 | =item * |
| 146 | |
| 147 | L</charnames::string_vianame(I<name>)> for run-time lookup of |
| 148 | either a character name or a named character sequence, returning its string |
| 149 | representation |
| 150 | |
| 151 | =item * |
| 152 | |
| 153 | L</charnames::vianame(I<name>)> for run-time lookup of a |
| 154 | character name (but not a named character sequence) to get its ordinal value |
| 155 | (code point) |
| 156 | |
| 157 | =item * |
| 158 | |
| 159 | L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its |
| 160 | Unicode name. |
| 161 | |
| 162 | =back |
| 163 | |
| 164 | Starting in Perl v5.16, any occurrence of C<\N{I<CHARNAME>}> sequences |
| 165 | in a double-quotish string automatically loads this module with arguments |
| 166 | C<:full> and C<:short> (described below) if it hasn't already been loaded with |
| 167 | different arguments, in order to compile the named Unicode character into |
| 168 | position in the string. Prior to v5.16, an explicit S<C<use charnames>> was |
| 169 | required to enable this usage. (However, prior to v5.16, the form C<S<"use |
| 170 | charnames ();">> did not enable C<\N{I<CHARNAME>}>.) |
| 171 | |
| 172 | Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number, |
| 173 | also inserts a character into a string. |
| 174 | The character it inserts is the one whose code point |
| 175 | (ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is |
| 176 | the Unicode (white background, black foreground) smiley face |
| 177 | equivalent to C<"\N{WHITE SMILING FACE}">. |
| 178 | Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character |
| 179 | name, when the I<...> is a number (or comma separated pair of numbers; |
| 180 | see L<perlreref/QUANTIFIERS>), and is not related to this pragma. |
| 181 | |
| 182 | The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>, |
| 183 | script names and L<customized aliases|/CUSTOM ALIASES>. |
| 184 | |
| 185 | If C<:full> is present, for expansion of |
| 186 | C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of |
| 187 | standard Unicode character names. |
| 188 | |
| 189 | C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less |
| 190 | precisely specified. Details are in L</LOOSE MATCHES>. |
| 191 | |
| 192 | If C<:short> is present, and |
| 193 | I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up |
| 194 | as a letter in script I<SCRIPT>, as described in the next paragraph. |
| 195 | Or, if C<use charnames> is used |
| 196 | with script name arguments, then for C<\N{I<CHARNAME>}> the name |
| 197 | I<CHARNAME> is looked up as a letter in the given scripts (in the |
| 198 | specified order). Customized aliases can override these, and are explained in |
| 199 | L</CUSTOM ALIASES>. |
| 200 | |
| 201 | For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>, |
| 202 | this pragma looks in the table of standard Unicode names for the names |
| 203 | |
| 204 | SCRIPTNAME CAPITAL LETTER CHARNAME |
| 205 | SCRIPTNAME SMALL LETTER CHARNAME |
| 206 | SCRIPTNAME LETTER CHARNAME |
| 207 | |
| 208 | If I<CHARNAME> is all lowercase, |
| 209 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
| 210 | is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all |
| 211 | uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE |
| 212 | MATCHES> rules if C<:loose> is also specified; strict otherwise. |
| 213 | |
| 214 | Note that C<\N{...}> is compile-time; it's a special form of string |
| 215 | constant used inside double-quotish strings; this means that you cannot |
| 216 | use variables inside the C<\N{...}>. If you want similar run-time |
| 217 | functionality, use |
| 218 | L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>. |
| 219 | |
| 220 | Note, starting in Perl 5.18, the name C<BELL> refers to the Unicode character |
| 221 | U+1F514, instead of the traditional U+0007. For the latter, use C<ALERT> |
| 222 | or C<BEL>. |
| 223 | |
| 224 | It is a syntax error to use C<\N{NAME}> where C<NAME> is unknown. |
| 225 | |
| 226 | For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the |
| 227 | input name is that of a character that won't fit into a byte (i.e., whose |
| 228 | ordinal is above 255). |
| 229 | |
| 230 | Otherwise, any string that includes a C<\N{I<charname>}> or |
| 231 | C<S<\N{U+I<code point>}>> will automatically have Unicode semantics (see |
| 232 | L<perlunicode/Byte and Character Semantics>). |
| 233 | |
| 234 | =head1 LOOSE MATCHES |
| 235 | |
| 236 | By specifying C<:loose>, Unicode's L<loose character name |
| 237 | matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are |
| 238 | selected instead of the strict exact match used otherwise. |
| 239 | That means that I<CHARNAME> doesn't have to be so precisely specified. |
| 240 | Upper/lower case doesn't matter (except with scripts as mentioned above), nor |
| 241 | do any underscores, and the only hyphens that matter are those at the |
| 242 | beginning or end of a word in the name (with one exception: the hyphen in |
| 243 | U+1180 C<HANGUL JUNGSEONG O-E> does matter). |
| 244 | Also, blanks not adjacent to hyphens don't matter. |
| 245 | The official Unicode names are quite variable as to where they use hyphens |
| 246 | versus spaces to separate word-like units, and this option allows you to not |
| 247 | have to care as much. |
| 248 | The reason non-medial hyphens matter is because of cases like |
| 249 | U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>. |
| 250 | The hyphen here is significant, as is the space before it, and so both must be |
| 251 | included. |
| 252 | |
| 253 | C<:loose> slows down look-ups by a factor of 2 to 3 versus |
| 254 | C<:full>, but the trade-off may be worth it to you. Each individual look-up |
| 255 | takes very little time, and the results are cached, so the speed difference |
| 256 | would become a factor only in programs that do look-ups of many different |
| 257 | spellings, and probably only when those look-ups are through C<vianame()> and |
| 258 | C<string_vianame()>, since C<\N{...}> look-ups are done at compile time. |
| 259 | |
| 260 | =head1 ALIASES |
| 261 | |
| 262 | Starting in Unicode 6.1 and Perl v5.16, Unicode defines many abbreviations and |
| 263 | names that were formerly Perl extensions, and some additional ones that Perl |
| 264 | did not previously accept. The list is getting too long to reproduce here, |
| 265 | but you can get the complete list from the Unicode web site: |
| 266 | L<http://www.unicode.org/Public/UNIDATA/NameAliases.txt>. |
| 267 | |
| 268 | Earlier versions of Perl accepted almost all the 6.1 names. These were most |
| 269 | extensively documented in the v5.14 version of this pod: |
| 270 | L<http://perldoc.perl.org/5.14.0/charnames.html#ALIASES>. |
| 271 | |
| 272 | =head1 CUSTOM ALIASES |
| 273 | |
| 274 | You can add customized aliases to standard (C<:full>) Unicode naming |
| 275 | conventions. The aliases override any standard definitions, so, if |
| 276 | you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to |
| 277 | mean C<"B">, etc. |
| 278 | |
| 279 | Aliases must begin with a character that is alphabetic. After that, each may |
| 280 | contain any combination of word (C<\w>) characters, SPACE (U+0020), |
| 281 | HYPHEN-MINUS (U+002D), LEFT PARENTHESIS (U+0028), RIGHT PARENTHESIS (U+0029), |
| 282 | and NO-BREAK SPACE (U+00A0). These last three should never have been allowed |
| 283 | in names, and are retained for backwards compatibility only; they may be |
| 284 | deprecated and removed in future releases of Perl, so don't use them for new |
| 285 | names. (More precisely, the first character of a name you specify must be |
| 286 | something that matches all of C<\p{ID_Start}>, C<\p{Alphabetic}>, and |
| 287 | C<\p{Gc=Letter}>. This makes sure it is what any reasonable person would view |
| 288 | as an alphabetic character. And, the continuation characters that match C<\w> |
| 289 | must also match C<\p{ID_Continue}>.) Starting with Perl v5.18, any Unicode |
| 290 | characters meeting the above criteria may be used; prior to that only |
| 291 | Latin1-range characters were acceptable. |
| 292 | |
| 293 | An alias can map to either an official Unicode character name (not a loose |
| 294 | matched name) or to a |
| 295 | numeric code point (ordinal). The latter is useful for assigning names |
| 296 | to code points in Unicode private use areas such as U+E800 through |
| 297 | U+F8FF. |
| 298 | A numeric code point must be a non-negative integer or a string beginning |
| 299 | with C<"U+"> or C<"0x"> with the remainder considered to be a |
| 300 | hexadecimal integer. A literal numeric constant must be unsigned; it |
| 301 | will be interpreted as hex if it has a leading zero or contains |
| 302 | non-decimal hex digits; otherwise it will be interpreted as decimal. |
| 303 | |
| 304 | Aliases are added either by the use of anonymous hashes: |
| 305 | |
| 306 | use charnames ":alias" => { |
| 307 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE", |
| 308 | mychar1 => 0xE8000, |
| 309 | }; |
| 310 | my $str = "\N{e_ACUTE}"; |
| 311 | |
| 312 | or by using a file containing aliases: |
| 313 | |
| 314 | use charnames ":alias" => "pro"; |
| 315 | |
| 316 | This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This |
| 317 | file should return a list in plain perl: |
| 318 | |
| 319 | ( |
| 320 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE", |
| 321 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", |
| 322 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS", |
| 323 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE", |
| 324 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE", |
| 325 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE", |
| 326 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON", |
| 327 | mychar2 => "U+E8001", |
| 328 | ); |
| 329 | |
| 330 | Both these methods insert C<":full"> automatically as the first argument (if no |
| 331 | other argument is given), and you can give the C<":full"> explicitly as |
| 332 | well, like |
| 333 | |
| 334 | use charnames ":full", ":alias" => "pro"; |
| 335 | |
| 336 | C<":loose"> has no effect with these. Input names must match exactly, using |
| 337 | C<":full"> rules. |
| 338 | |
| 339 | Also, both these methods currently allow only single characters to be named. |
| 340 | To name a sequence of characters, use a |
| 341 | L<custom translator|/CUSTOM TRANSLATORS> (described below). |
| 342 | |
| 343 | =head1 charnames::string_vianame(I<name>) |
| 344 | |
| 345 | This is a runtime equivalent to C<\N{...}>. I<name> can be any expression |
| 346 | that evaluates to a name accepted by C<\N{...}> under the L<C<:full> |
| 347 | option|/DESCRIPTION> to C<charnames>. In addition, any other options for the |
| 348 | controlling C<"use charnames"> in the same scope apply, like C<:loose> or any |
| 349 | L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM |
| 350 | ALIASES> you may have defined. |
| 351 | |
| 352 | The only differences are due to the fact that C<string_vianame> is run-time |
| 353 | and C<\N{}> is compile time. You can't interpolate inside a C<\N{}>, (so |
| 354 | C<\N{$variable}> doesn't work); and if the input name is unknown, |
| 355 | C<string_vianame> returns C<undef> instead of it being a syntax error. |
| 356 | |
| 357 | =head1 charnames::vianame(I<name>) |
| 358 | |
| 359 | This is similar to C<string_vianame>. The main difference is that under most |
| 360 | circumstances, C<vianame> returns an ordinal code |
| 361 | point, whereas C<string_vianame> returns a string. For example, |
| 362 | |
| 363 | printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
| 364 | |
| 365 | prints "U+2722". |
| 366 | |
| 367 | This leads to the other two differences. Since a single code point is |
| 368 | returned, the function can't handle named character sequences, as these are |
| 369 | composed of multiple characters (it returns C<undef> for these). And, the code |
| 370 | point can be that of any |
| 371 | character, even ones that aren't legal under the C<S<use bytes>> pragma. |
| 372 | |
| 373 | See L</BUGS> for the circumstances in which the behavior differs |
| 374 | from that described above. |
| 375 | |
| 376 | =head1 charnames::viacode(I<code>) |
| 377 | |
| 378 | Returns the full name of the character indicated by the numeric code. |
| 379 | For example, |
| 380 | |
| 381 | print charnames::viacode(0x2722); |
| 382 | |
| 383 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
| 384 | |
| 385 | The name returned is the "best" (defined below) official name or alias |
| 386 | for the code point, if |
| 387 | available; otherwise your custom alias for it, if defined; otherwise C<undef>. |
| 388 | This means that your alias will only be returned for code points that don't |
| 389 | have an official Unicode name (nor alias) such as private use code points. |
| 390 | |
| 391 | If you define more than one name for the code point, it is indeterminate |
| 392 | which one will be returned. |
| 393 | |
| 394 | As mentioned, the function returns C<undef> if no name is known for the code |
| 395 | point. In Unicode the proper name for these is the empty string, which |
| 396 | C<undef> stringifies to. (If you ask for a code point past the legal |
| 397 | Unicode maximum of U+10FFFF that you haven't assigned an alias to, you |
| 398 | get C<undef> plus a warning.) |
| 399 | |
| 400 | The input number must be a non-negative integer, or a string beginning |
| 401 | with C<"U+"> or C<"0x"> with the remainder considered to be a |
| 402 | hexadecimal integer. A literal numeric constant must be unsigned; it |
| 403 | will be interpreted as hex if it has a leading zero or contains |
| 404 | non-decimal hex digits; otherwise it will be interpreted as decimal. |
| 405 | |
| 406 | As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names |
| 407 | (synonyms or aliases) for some code points, most of which were already |
| 408 | available as Perl extensions. All these are accepted by C<\N{...}> and the |
| 409 | other functions in this module, but C<viacode> has to choose which one |
| 410 | name to return for a given input code point, so it returns the "best" name. |
| 411 | To understand how this works, it is helpful to know more about the Unicode |
| 412 | name properties. All code points actually have only a single name, which |
| 413 | (starting in Unicode 2.0) can never change once a character has been assigned |
| 414 | to the code point. But mistakes have been made in assigning names, for |
| 415 | example sometimes a clerical error was made during the publishing of the |
| 416 | Standard which caused words to be misspelled, and there was no way to correct |
| 417 | those. The Name_Alias property was eventually created to handle these |
| 418 | situations. If a name was wrong, a corrected synonym would be published for |
| 419 | it, using Name_Alias. C<viacode> will return that corrected synonym as the |
| 420 | "best" name for a code point. (It is even possible, though it hasn't happened |
| 421 | yet, that the correction itself will need to be corrected, and so another |
| 422 | Name_Alias can be created for that code point; C<viacode> will return the |
| 423 | most recent correction.) |
| 424 | |
| 425 | The Unicode name for each of the control characters (such as LINE FEED) is the |
| 426 | empty string. However almost all had names assigned by other standards, such |
| 427 | as the ASCII Standard, or were in common use. C<viacode> returns these names |
| 428 | as the "best" ones available. Unicode 6.1 has created Name_Aliases for each |
| 429 | of them, including alternate names, like NEW LINE. C<viacode> uses the |
| 430 | original name, "LINE FEED" in preference to the alternate. Similarly the |
| 431 | name returned for U+FEFF is "ZERO WIDTH NO-BREAK SPACE", not "BYTE ORDER |
| 432 | MARK". |
| 433 | |
| 434 | Until Unicode 6.1, the 4 control characters U+0080, U+0081, U+0084, and U+0099 |
| 435 | did not have names nor aliases. |
| 436 | To preserve backwards compatibility, any alias you define for these code |
| 437 | points will be returned by this function, in preference to the official name. |
| 438 | |
| 439 | Some code points also have abbreviated names, such as "LF" or "NL". |
| 440 | C<viacode> never returns these. |
| 441 | |
| 442 | Because a name correction may be added in future Unicode releases, the name |
| 443 | that C<viacode> returns may change as a result. This is a rare event, but it |
| 444 | does happen. |
| 445 | |
| 446 | =head1 CUSTOM TRANSLATORS |
| 447 | |
| 448 | The mechanism of translation of C<\N{...}> escapes is general and not |
| 449 | hardwired into F<charnames.pm>. A module can install custom |
| 450 | translations (inside the scope which C<use>s the module) with the |
| 451 | following magic incantation: |
| 452 | |
| 453 | sub import { |
| 454 | shift; |
| 455 | $^H{charnames} = \&translator; |
| 456 | } |
| 457 | |
| 458 | Here translator() is a subroutine which takes I<CHARNAME> as an |
| 459 | argument, and returns text to insert into the string instead of the |
| 460 | C<\N{I<CHARNAME>}> escape. |
| 461 | |
| 462 | This is the only way you can create a custom named sequence of code points. |
| 463 | |
| 464 | Since the text to insert should be different |
| 465 | in C<bytes> mode and out of it, the function should check the current |
| 466 | state of C<bytes>-flag as in: |
| 467 | |
| 468 | use bytes (); # for $bytes::hint_bits |
| 469 | sub translator { |
| 470 | if ($^H & $bytes::hint_bits) { |
| 471 | return bytes_translator(@_); |
| 472 | } |
| 473 | else { |
| 474 | return utf8_translator(@_); |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>. |
| 479 | |
| 480 | Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be |
| 481 | overridden as well. |
| 482 | |
| 483 | =head1 BUGS |
| 484 | |
| 485 | vianame() normally returns an ordinal code point, but when the input name is of |
| 486 | the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is |
| 487 | in effect and the character won't fit into a byte, it returns C<undef> and |
| 488 | raises a warning. |
| 489 | |
| 490 | Since evaluation of the translation function (see L</CUSTOM |
| 491 | TRANSLATORS>) happens in the middle of compilation (of a string |
| 492 | literal), the translation function should not do any C<eval>s or |
| 493 | C<require>s. This restriction should be lifted (but is low priority) in |
| 494 | a future version of Perl. |
| 495 | |
| 496 | =cut |
| 497 | |
| 498 | # ex: set ts=8 sts=2 sw=2 et: |