| 1 | package charnames; |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use Carp; |
| 5 | our $VERSION = '1.01'; |
| 6 | |
| 7 | use bytes (); # for $bytes::hint_bits |
| 8 | $charnames::hint_bits = 0x20000; |
| 9 | |
| 10 | my %alias1 = ( |
| 11 | # Icky 3.2 names with parentheses. |
| 12 | 'LINE FEED' => 'LINE FEED (LF)', |
| 13 | 'FORM FEED' => 'FORM FEED (FF)', |
| 14 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', |
| 15 | 'NEXT LINE' => 'NEXT LINE (NEL)', |
| 16 | # Convenience. |
| 17 | 'LF' => 'LINE FEED (LF)', |
| 18 | 'FF' => 'FORM FEED (FF)', |
| 19 | 'CR' => 'CARRIAGE RETURN (LF)', |
| 20 | 'NEL' => 'NEXT LINE (NEL)', |
| 21 | # More convenience. For futher convencience, |
| 22 | # it is suggested some way using using the NamesList |
| 23 | # aliases is implemented. |
| 24 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER', |
| 25 | 'ZWJ' => 'ZERO WIDTH JOINER', |
| 26 | 'BOM' => 'BYTE ORDER MARK', |
| 27 | ); |
| 28 | |
| 29 | my %alias2 = ( |
| 30 | # Pre-3.2 compatibility (only for the first 256 characters). |
| 31 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', |
| 32 | 'VERTICAL TABULATION' => 'LINE TABULATION', |
| 33 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', |
| 34 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', |
| 35 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', |
| 36 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', |
| 37 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', |
| 38 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', |
| 39 | ); |
| 40 | |
| 41 | my $txt; |
| 42 | |
| 43 | # This is not optimized in any way yet |
| 44 | sub charnames |
| 45 | { |
| 46 | my $name = shift; |
| 47 | |
| 48 | if (exists $alias1{$name}) { |
| 49 | $name = $alias1{$name}; |
| 50 | } |
| 51 | if (exists $alias2{$name}) { |
| 52 | require warnings; |
| 53 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); |
| 54 | $name = $alias2{$name}; |
| 55 | } |
| 56 | |
| 57 | my $ord; |
| 58 | my @off; |
| 59 | my $fname; |
| 60 | |
| 61 | if ($name eq "BYTE ORDER MARK") { |
| 62 | $fname = $name; |
| 63 | $ord = 0xFEFF; |
| 64 | } else { |
| 65 | ## Suck in the code/name list as a big string. |
| 66 | ## Lines look like: |
| 67 | ## "0052\t\tLATIN CAPITAL LETTER R\n" |
| 68 | $txt = do "unicore/Name.pl" unless $txt; |
| 69 | |
| 70 | ## @off will hold the index into the code/name string of the start and |
| 71 | ## end of the name as we find it. |
| 72 | |
| 73 | ## If :full, look for the the name exactly |
| 74 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { |
| 75 | @off = ($-[0], $+[0]); |
| 76 | } |
| 77 | |
| 78 | ## If we didn't get above, and :short allowed, look for the short name. |
| 79 | ## The short name is like "greek:Sigma" |
| 80 | unless (@off) { |
| 81 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { |
| 82 | my ($script, $cname) = ($1,$2); |
| 83 | my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
| 84 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { |
| 85 | @off = ($-[0], $+[0]); |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | ## If we still don't have it, check for the name among the loaded |
| 91 | ## scripts. |
| 92 | if (not @off) |
| 93 | { |
| 94 | my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); |
| 95 | for my $script ( @{$^H{charnames_scripts}} ) |
| 96 | { |
| 97 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { |
| 98 | @off = ($-[0], $+[0]); |
| 99 | last; |
| 100 | } |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | ## If we don't have it by now, give up. |
| 105 | unless (@off) { |
| 106 | carp "Unknown charname '$name'"; |
| 107 | return "\x{FFFD}"; |
| 108 | } |
| 109 | |
| 110 | ## |
| 111 | ## Now know where in the string the name starts. |
| 112 | ## The code, in hex, is before that. |
| 113 | ## |
| 114 | ## The code can be 4-6 characters long, so we've got to sort of |
| 115 | ## go look for it, just after the newline that comes before $off[0]. |
| 116 | ## |
| 117 | ## This would be much easier if unicore/Name.pl had info in |
| 118 | ## a name/code order, instead of code/name order. |
| 119 | ## |
| 120 | ## The +1 after the rindex() is to skip past the newline we're finding, |
| 121 | ## or, if the rindex() fails, to put us to an offset of zero. |
| 122 | ## |
| 123 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; |
| 124 | |
| 125 | ## we know where it starts, so turn into number - |
| 126 | ## the ordinal for the char. |
| 127 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); |
| 128 | } |
| 129 | |
| 130 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? |
| 131 | use bytes; |
| 132 | return chr $ord if $ord <= 255; |
| 133 | my $hex = sprintf "%04x", $ord; |
| 134 | if (not defined $fname) { |
| 135 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; |
| 136 | } |
| 137 | croak "Character 0x$hex with name '$fname' is above 0xFF"; |
| 138 | } |
| 139 | |
| 140 | no warnings 'utf8'; # allow even illegal characters |
| 141 | return pack "U", $ord; |
| 142 | } |
| 143 | |
| 144 | sub import |
| 145 | { |
| 146 | shift; ## ignore class name |
| 147 | |
| 148 | if (not @_) |
| 149 | { |
| 150 | carp("`use charnames' needs explicit imports list"); |
| 151 | } |
| 152 | $^H |= $charnames::hint_bits; |
| 153 | $^H{charnames} = \&charnames ; |
| 154 | |
| 155 | ## |
| 156 | ## fill %h keys with our @_ args. |
| 157 | ## |
| 158 | my %h; |
| 159 | @h{@_} = (1) x @_; |
| 160 | |
| 161 | $^H{charnames_full} = delete $h{':full'}; |
| 162 | $^H{charnames_short} = delete $h{':short'}; |
| 163 | $^H{charnames_scripts} = [map uc, keys %h]; |
| 164 | |
| 165 | ## |
| 166 | ## If utf8? warnings are enabled, and some scripts were given, |
| 167 | ## see if at least we can find one letter of each script. |
| 168 | ## |
| 169 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) |
| 170 | { |
| 171 | $txt = do "unicore/Name.pl" unless $txt; |
| 172 | |
| 173 | for my $script (@{$^H{charnames_scripts}}) |
| 174 | { |
| 175 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { |
| 176 | warnings::warn('utf8', "No such script: '$script'"); |
| 177 | } |
| 178 | } |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | require Unicode::UCD; # for Unicode::UCD::_getcode() |
| 183 | |
| 184 | my %viacode; |
| 185 | |
| 186 | sub viacode |
| 187 | { |
| 188 | if (@_ != 1) { |
| 189 | carp "charnames::viacode() expects one argument"; |
| 190 | return () |
| 191 | } |
| 192 | |
| 193 | my $arg = shift; |
| 194 | my $code = Unicode::UCD::_getcode($arg); |
| 195 | |
| 196 | my $hex; |
| 197 | |
| 198 | if (defined $code) { |
| 199 | $hex = sprintf "%04X", $arg; |
| 200 | } else { |
| 201 | carp("unexpected arg \"$arg\" to charnames::viacode()"); |
| 202 | return; |
| 203 | } |
| 204 | |
| 205 | if ($code > 0x10FFFF) { |
| 206 | carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)"; |
| 207 | return; |
| 208 | } |
| 209 | |
| 210 | return $viacode{$hex} if exists $viacode{$hex}; |
| 211 | |
| 212 | $txt = do "unicore/Name.pl" unless $txt; |
| 213 | |
| 214 | if ($txt =~ m/^$hex\t\t(.+)/m) { |
| 215 | return $viacode{$hex} = $1; |
| 216 | } else { |
| 217 | return; |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | my %vianame; |
| 222 | |
| 223 | sub vianame |
| 224 | { |
| 225 | if (@_ != 1) { |
| 226 | carp "charnames::vianame() expects one name argument"; |
| 227 | return () |
| 228 | } |
| 229 | |
| 230 | my $arg = shift; |
| 231 | |
| 232 | return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; |
| 233 | |
| 234 | return $vianame{$arg} if exists $vianame{$arg}; |
| 235 | |
| 236 | $txt = do "unicore/Name.pl" unless $txt; |
| 237 | |
| 238 | if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) { |
| 239 | return $vianame{$arg} = hex $1; |
| 240 | } else { |
| 241 | return; |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | |
| 246 | 1; |
| 247 | __END__ |
| 248 | |
| 249 | =head1 NAME |
| 250 | |
| 251 | charnames - define character names for C<\N{named}> string literal escapes |
| 252 | |
| 253 | =head1 SYNOPSIS |
| 254 | |
| 255 | use charnames ':full'; |
| 256 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; |
| 257 | |
| 258 | use charnames ':short'; |
| 259 | print "\N{greek:Sigma} is an upper-case sigma.\n"; |
| 260 | |
| 261 | use charnames qw(cyrillic greek); |
| 262 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; |
| 263 | |
| 264 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" |
| 265 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" |
| 266 | |
| 267 | =head1 DESCRIPTION |
| 268 | |
| 269 | Pragma C<use charnames> supports arguments C<:full>, C<:short> and |
| 270 | script names. If C<:full> is present, for expansion of |
| 271 | C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of |
| 272 | standard Unicode names of chars. If C<:short> is present, and |
| 273 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up |
| 274 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used |
| 275 | with script name arguments, then for C<\N{CHARNAME}> the name |
| 276 | C<CHARNAME> is looked up as a letter in the given scripts (in the |
| 277 | specified order). |
| 278 | |
| 279 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> |
| 280 | this pragma looks for the names |
| 281 | |
| 282 | SCRIPTNAME CAPITAL LETTER CHARNAME |
| 283 | SCRIPTNAME SMALL LETTER CHARNAME |
| 284 | SCRIPTNAME LETTER CHARNAME |
| 285 | |
| 286 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, |
| 287 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant |
| 288 | is ignored. |
| 289 | |
| 290 | Note that C<\N{...}> is compile-time, it's a special form of string |
| 291 | constant used inside double-quoted strings: in other words, you cannot |
| 292 | use variables inside the C<\N{...}>. If you want similar run-time |
| 293 | functionality, use charnames::vianame(). |
| 294 | |
| 295 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) |
| 296 | as of Unicode 3.1, there are no official Unicode names but you can use |
| 297 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In |
| 298 | Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429 |
| 299 | has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081, |
| 300 | U+0084, and U+0099 do not have names even in ISO 6429. |
| 301 | |
| 302 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" |
| 303 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}". |
| 304 | |
| 305 | =head1 CUSTOM TRANSLATORS |
| 306 | |
| 307 | The mechanism of translation of C<\N{...}> escapes is general and not |
| 308 | hardwired into F<charnames.pm>. A module can install custom |
| 309 | translations (inside the scope which C<use>s the module) with the |
| 310 | following magic incantation: |
| 311 | |
| 312 | use charnames (); # for $charnames::hint_bits |
| 313 | sub import { |
| 314 | shift; |
| 315 | $^H |= $charnames::hint_bits; |
| 316 | $^H{charnames} = \&translator; |
| 317 | } |
| 318 | |
| 319 | Here translator() is a subroutine which takes C<CHARNAME> as an |
| 320 | argument, and returns text to insert into the string instead of the |
| 321 | C<\N{CHARNAME}> escape. Since the text to insert should be different |
| 322 | in C<bytes> mode and out of it, the function should check the current |
| 323 | state of C<bytes>-flag as in: |
| 324 | |
| 325 | use bytes (); # for $bytes::hint_bits |
| 326 | sub translator { |
| 327 | if ($^H & $bytes::hint_bits) { |
| 328 | return bytes_translator(@_); |
| 329 | } |
| 330 | else { |
| 331 | return utf8_translator(@_); |
| 332 | } |
| 333 | } |
| 334 | |
| 335 | =head1 charnames::viacode(code) |
| 336 | |
| 337 | Returns the full name of the character indicated by the numeric code. |
| 338 | The example |
| 339 | |
| 340 | print charnames::viacode(0x2722); |
| 341 | |
| 342 | prints "FOUR TEARDROP-SPOKED ASTERISK". |
| 343 | |
| 344 | Returns undef if no name is known for the code. |
| 345 | |
| 346 | This works only for the standard names, and does not yet apply |
| 347 | to custom translators. |
| 348 | |
| 349 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK |
| 350 | SPACE", not "BYTE ORDER MARK". |
| 351 | |
| 352 | =head1 charnames::vianame(name) |
| 353 | |
| 354 | Returns the code point indicated by the name. |
| 355 | The example |
| 356 | |
| 357 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); |
| 358 | |
| 359 | prints "2722". |
| 360 | |
| 361 | Returns undef if the name is unknown. |
| 362 | |
| 363 | This works only for the standard names, and does not yet apply |
| 364 | to custom translators. |
| 365 | |
| 366 | =head1 ALIASES |
| 367 | |
| 368 | A few aliases have been defined for convenience: instead of having |
| 369 | to use the official names |
| 370 | |
| 371 | LINE FEED (LF) |
| 372 | FORM FEED (FF) |
| 373 | CARRIAGE RETURN (CR) |
| 374 | NEXT LINE (NEL) |
| 375 | |
| 376 | (yes, with parentheses) one can use |
| 377 | |
| 378 | LINE FEED |
| 379 | FORM FEED |
| 380 | CARRIAGE RETURN |
| 381 | NEXT LINE |
| 382 | LF |
| 383 | FF |
| 384 | CR |
| 385 | NEL |
| 386 | |
| 387 | One can also use |
| 388 | |
| 389 | BYTE ORDER MARK |
| 390 | BOM |
| 391 | |
| 392 | and |
| 393 | |
| 394 | ZWNJ |
| 395 | ZWJ |
| 396 | |
| 397 | for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. |
| 398 | |
| 399 | For backward compatibility one can use the old names for |
| 400 | certain C0 and C1 controls |
| 401 | |
| 402 | old new |
| 403 | |
| 404 | HORIZONTAL TABULATION CHARACTER TABULATION |
| 405 | VERTICAL TABULATION LINE TABULATION |
| 406 | FILE SEPARATOR INFORMATION SEPARATOR FOUR |
| 407 | GROUP SEPARATOR INFORMATION SEPARATOR THREE |
| 408 | RECORD SEPARATOR INFORMATION SEPARATOR TWO |
| 409 | UNIT SEPARATOR INFORMATION SEPARATOR ONE |
| 410 | PARTIAL LINE DOWN PARTIAL LINE FORWARD |
| 411 | PARTIAL LINE UP PARTIAL LINE BACKWARD |
| 412 | |
| 413 | but the old names in addition to giving the character |
| 414 | will also give a warning about being deprecated. |
| 415 | |
| 416 | =head1 ILLEGAL CHARACTERS |
| 417 | |
| 418 | If you ask for a character that does not exist, a warning is given |
| 419 | and the Unicode I<replacement character> "\x{FFFD}" is returned. |
| 420 | |
| 421 | =head1 BUGS |
| 422 | |
| 423 | Since evaluation of the translation function happens in a middle of |
| 424 | compilation (of a string literal), the translation function should not |
| 425 | do any C<eval>s or C<require>s. This restriction should be lifted in |
| 426 | a future version of Perl. |
| 427 | |
| 428 | =cut |