lib/charnames.pm

   1 package charnames;
   2 use strict;
   3 use warnings;
   4 use File::Spec;
   5 our $VERSION = '1.05';
   6
   7 use bytes ();           # for $bytes::hint_bits
   8 $charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
   9
  10 my %alias1 = (
  11                 # Icky 3.2 names with parentheses.
  12                 'LINE FEED'             => 'LINE FEED (LF)',
  13                 'FORM FEED'             => 'FORM FEED (FF)',
  14                 'CARRIAGE RETURN'       => 'CARRIAGE RETURN (CR)',
  15                 'NEXT LINE'             => 'NEXT LINE (NEL)',
  16                 # Convenience.
  17                 'LF'                    => 'LINE FEED (LF)',
  18                 'FF'                    => 'FORM FEED (FF)',
  19                 'CR'                    => 'CARRIAGE RETURN (CR)',
  20                 'NEL'                   => 'NEXT LINE (NEL)',
  21                 # More convenience.  For futher convencience,
  22                 # it is suggested some way using using the NamesList
  23                 # aliases is implemented.
  24                 'ZWNJ'                  => 'ZERO WIDTH NON-JOINER',
  25                 'ZWJ'                   => 'ZERO WIDTH JOINER',
  26                 'BOM'                   => 'BYTE ORDER MARK',
  27             );
  28
  29 my %alias2 = (
  30                 # Pre-3.2 compatibility (only for the first 256 characters).
  31                 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
  32                 'VERTICAL TABULATION'   => 'LINE TABULATION',
  33                 'FILE SEPARATOR'        => 'INFORMATION SEPARATOR FOUR',
  34                 'GROUP SEPARATOR'       => 'INFORMATION SEPARATOR THREE',
  35                 'RECORD SEPARATOR'      => 'INFORMATION SEPARATOR TWO',
  36                 'UNIT SEPARATOR'        => 'INFORMATION SEPARATOR ONE',
  37                 'PARTIAL LINE DOWN'     => 'PARTIAL LINE FORWARD',
  38                 'PARTIAL LINE UP'       => 'PARTIAL LINE BACKWARD',
  39             );
  40
  41 my %alias3 = (
  42                 # User defined aliasses. Even more convenient :)
  43             );
  44 my $txt;
  45
  46 sub croak
  47 {
  48   require Carp; goto &Carp::croak;
  49 } # croak
  50
  51 sub carp
  52 {
  53   require Carp; goto &Carp::carp;
  54 } # carp
  55
  56 sub alias (@)
  57 {
  58   @_ or return %alias3;
  59   my $alias = ref $_[0] ? $_[0] : { @_ };
  60   @alias3{keys %$alias} = values %$alias;
  61 } # alias
  62
  63 sub alias_file ($)
  64 {
  65   my ($arg, $file) = @_;
  66   if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
  67     $file = $arg;
  68   }
  69   elsif ($arg =~ m/^\w+$/) {
  70     $file = "unicore/${arg}_alias.pl";
  71   }
  72   else {
  73     croak "Charnames alias files can only have identifier characters";
  74   }
  75   if (my @alias = do $file) {
  76     @alias == 1 && !defined $alias[0] and
  77       croak "$file cannot be used as alias file for charnames";
  78     @alias % 2 and
  79       croak "$file did not return a (valid) list of alias pairs";
  80     alias (@alias);
  81     return (1);
  82   }
  83   0;
  84 } # alias_file
  85
  86 # This is not optimized in any way yet
  87 sub charnames
  88 {
  89   my $name = shift;
  90
  91   if (exists $alias1{$name}) {
  92     $name = $alias1{$name};
  93   }
  94   elsif (exists $alias2{$name}) {
  95     require warnings;
  96     warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
  97     $name = $alias2{$name};
  98   }
  99   elsif (exists $alias3{$name}) {
 100     $name = $alias3{$name};
 101   }
 102
 103   my $ord;
 104   my @off;
 105   my $fname;
 106
 107   if ($name eq "BYTE ORDER MARK") {
 108     $fname = $name;
 109     $ord = 0xFEFF;
 110   } else {
 111     ## Suck in the code/name list as a big string.
 112     ## Lines look like:
 113     ##     "0052\t\tLATIN CAPITAL LETTER R\n"
 114     $txt = do "unicore/Name.pl" unless $txt;
 115
 116     ## @off will hold the index into the code/name string of the start and
 117     ## end of the name as we find it.
 118
 119     ## If :full, look for the name exactly
 120     if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
 121       @off = ($-[0], $+[0]);
 122     }
 123
 124     ## If we didn't get above, and :short allowed, look for the short name.
 125     ## The short name is like "greek:Sigma"
 126     unless (@off) {
 127       if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
 128         my ($script, $cname) = ($1, $2);
 129         my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
 130         if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
 131           @off = ($-[0], $+[0]);
 132         }
 133       }
 134     }
 135
 136     ## If we still don't have it, check for the name among the loaded
 137     ## scripts.
 138     if (not @off) {
 139       my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
 140       for my $script (@{$^H{charnames_scripts}}) {
 141         if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
 142           @off = ($-[0], $+[0]);
 143           last;
 144         }
 145       }
 146     }
 147
 148     ## If we don't have it by now, give up.
 149     unless (@off) {
 150       carp "Unknown charname '$name'";
 151       return "\x{FFFD}";
 152     }
 153
 154     ##
 155     ## Now know where in the string the name starts.
 156     ## The code, in hex, is before that.
 157     ##
 158     ## The code can be 4-6 characters long, so we've got to sort of
 159     ## go look for it, just after the newline that comes before $off[0].
 160     ##
 161     ## This would be much easier if unicore/Name.pl had info in
 162     ## a name/code order, instead of code/name order.
 163     ##
 164     ## The +1 after the rindex() is to skip past the newline we're finding,
 165     ## or, if the rindex() fails, to put us to an offset of zero.
 166     ##
 167     my $hexstart = rindex($txt, "\n", $off[0]) + 1;
 168
 169     ## we know where it starts, so turn into number -
 170     ## the ordinal for the char.
 171     $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
 172   }
 173
 174   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
 175     use bytes;
 176     return chr $ord if $ord <= 255;
 177     my $hex = sprintf "%04x", $ord;
 178     if (not defined $fname) {
 179       $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
 180     }
 181     croak "Character 0x$hex with name '$fname' is above 0xFF";
 182   }
 183
 184   no warnings 'utf8'; # allow even illegal characters
 185   return pack "U", $ord;
 186 } # charnames
 187
 188 sub import
 189 {
 190   shift; ## ignore class name
 191
 192   if (not @_) {
 193     carp("`use charnames' needs explicit imports list");
 194   }
 195   $^H |= $charnames::hint_bits;
 196   $^H{charnames} = \&charnames ;
 197
 198   ##
 199   ## fill %h keys with our @_ args.
 200   ##
 201   my ($promote, %h, @args) = (0);
 202   while (my $arg = shift) {
 203     if ($arg eq ":alias") {
 204       @_ or
 205         croak ":alias needs an argument in charnames";
 206       my $alias = shift;
 207       if (ref $alias) {
 208         ref $alias eq "HASH" or
 209           croak "Only HASH reference supported as argument to :alias";
 210         alias ($alias);
 211         next;
 212       }
 213       if ($alias =~ m{:(\w+)$}) {
 214         $1 eq "full" || $1 eq "short" and
 215           croak ":alias cannot use existing pragma :$1 (reversed order?)";
 216         alias_file ($1) and $promote = 1;
 217         next;
 218       }
 219       alias_file ($alias);
 220       next;
 221     }
 222     if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
 223       warn "unsupported special '$arg' in charnames";
 224       next;
 225     }
 226     push @args, $arg;
 227   }
 228   @args == 0 && $promote and @args = (":full");
 229   @h{@args} = (1) x @args;
 230
 231   $^H{charnames_full} = delete $h{':full'};
 232   $^H{charnames_short} = delete $h{':short'};
 233   $^H{charnames_scripts} = [map uc, keys %h];
 234
 235   ##
 236   ## If utf8? warnings are enabled, and some scripts were given,
 237   ## see if at least we can find one letter of each script.
 238   ##
 239   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
 240     $txt = do "unicore/Name.pl" unless $txt;
 241
 242     for my $script (@{$^H{charnames_scripts}}) {
 243       if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
 244         warnings::warn('utf8',  "No such script: '$script'");
 245       }
 246     }
 247   }
 248 } # import
 249
 250 my %viacode;
 251
 252 sub viacode
 253 {
 254   if (@_ != 1) {
 255     carp "charnames::viacode() expects one argument";
 256     return;
 257   }
 258
 259   my $arg = shift;
 260
 261   # this comes actually from Unicode::UCD, where it is the named
 262   # function _getcode (), but it avoids the overhead of loading it
 263   my $hex;
 264   if ($arg =~ /^[1-9]\d*$/) {
 265     $hex = sprintf "%04X", $arg;
 266   } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 267     $hex = $1;
 268   } else {
 269     carp("unexpected arg \"$arg\" to charnames::viacode()");
 270     return;
 271   }
 272
 273   # checking the length first is slightly faster
 274   if (length($hex) > 5 && hex($hex) > 0x10FFFF) {
 275     carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
 276     return;
 277   }
 278
 279   return $viacode{$hex} if exists $viacode{$hex};
 280
 281   $txt = do "unicore/Name.pl" unless $txt;
 282
 283   return unless $txt =~ m/^$hex\t\t(.+)/m;
 284
 285   $viacode{$hex} = $1;
 286 } # viacode
 287
 288 my %vianame;
 289
 290 sub vianame
 291 {
 292   if (@_ != 1) {
 293     carp "charnames::vianame() expects one name argument";
 294     return ()
 295   }
 296
 297   my $arg = shift;
 298
 299   return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
 300
 301   return $vianame{$arg} if exists $vianame{$arg};
 302
 303   $txt = do "unicore/Name.pl" unless $txt;
 304
 305   my $pos = index $txt, "\t\t$arg\n";
 306   if ($[ <= $pos) {
 307     my $posLF = rindex $txt, "\n", $pos;
 308     (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
 309     return $vianame{$arg} = hex $code;
 310
 311     # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
 312     # then $posLF + 1 equals to $[ (at the beginning of $txt).
 313     # Otherwise $posLF is the position of "\n";
 314     # then $posLF + 1 must be the position of the next to "\n"
 315     # (the beginning of the line).
 316     # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
 317     # "10300\t", "100000", etc. So we can get the code via removing TAB.
 318   } else {
 319     return;
 320   }
 321 } # vianame
 322
 323
 324 1;
 325 __END__
 326
 327 =head1 NAME
 328
 329 charnames - define character names for C<\N{named}> string literal escapes
 330
 331 =head1 SYNOPSIS
 332
 333   use charnames ':full';
 334   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
 335
 336   use charnames ':short';
 337   print "\N{greek:Sigma} is an upper-case sigma.\n";
 338
 339   use charnames qw(cyrillic greek);
 340   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
 341
 342   use charnames ":full", ":alias" => {
 343     e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
 344   };
 345   print "\N{e_ACUTE} is a small letter e with an acute.\n";
 346
 347   use charnames ();
 348   print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
 349   printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
 350
 351 =head1 DESCRIPTION
 352
 353 Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
 354 names and customized aliases.  If C<:full> is present, for expansion of
 355 C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
 356 standard Unicode character names.  If C<:short> is present, and
 357 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
 358 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
 359 with script name arguments, then for C<\N{CHARNAME}> the name
 360 C<CHARNAME> is looked up as a letter in the given scripts (in the
 361 specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
 362
 363 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
 364 this pragma looks for the names
 365
 366   SCRIPTNAME CAPITAL LETTER CHARNAME
 367   SCRIPTNAME SMALL LETTER CHARNAME
 368   SCRIPTNAME LETTER CHARNAME
 369
 370 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 371 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
 372 is ignored.
 373
 374 Note that C<\N{...}> is compile-time, it's a special form of string
 375 constant used inside double-quoted strings: in other words, you cannot
 376 use variables inside the C<\N{...}>.  If you want similar run-time
 377 functionality, use charnames::vianame().
 378
 379 For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
 380 as of Unicode 3.1, there are no official Unicode names but you can use
 381 instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).  In
 382 Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
 383 has been updated, see L</ALIASES>.  Also note that the U+UU80, U+0081,
 384 U+0084, and U+0099 do not have names even in ISO 6429.
 385
 386 Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
 387 is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
 388
 389 =head1 CUSTOM TRANSLATORS
 390
 391 The mechanism of translation of C<\N{...}> escapes is general and not
 392 hardwired into F<charnames.pm>.  A module can install custom
 393 translations (inside the scope which C<use>s the module) with the
 394 following magic incantation:
 395
 396     use charnames ();           # for $charnames::hint_bits
 397     sub import {
 398         shift;
 399         $^H |= $charnames::hint_bits;
 400         $^H{charnames} = \&translator;
 401     }
 402
 403 Here translator() is a subroutine which takes C<CHARNAME> as an
 404 argument, and returns text to insert into the string instead of the
 405 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 406 in C<bytes> mode and out of it, the function should check the current
 407 state of C<bytes>-flag as in:
 408
 409     use bytes ();                       # for $bytes::hint_bits
 410     sub translator {
 411         if ($^H & $bytes::hint_bits) {
 412             return bytes_translator(@_);
 413         }
 414         else {
 415             return utf8_translator(@_);
 416         }
 417     }
 418
 419 =head1 CUSTOM ALIASES
 420
 421 This version of charnames supports three mechanisms of adding local
 422 or customized aliases to standard Unicode naming conventions (:full)
 423
 424 =head2 Anonymous hashes
 425
 426     use charnames ":full", ":alias" => {
 427         e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
 428         };
 429     my $str = "\N{e_ACUTE}";
 430
 431 =head2 Alias file
 432
 433     use charnames ":full", ":alias" => "pro";
 434
 435     will try to read "unicore/pro_alias.pl" from the @INC path. This
 436     file should return a list in plain perl:
 437
 438     (
 439     A_GRAVE         => "LATIN CAPITAL LETTER A WITH GRAVE",
 440     A_CIRCUM        => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
 441     A_DIAERES       => "LATIN CAPITAL LETTER A WITH DIAERESIS",
 442     A_TILDE         => "LATIN CAPITAL LETTER A WITH TILDE",
 443     A_BREVE         => "LATIN CAPITAL LETTER A WITH BREVE",
 444     A_RING          => "LATIN CAPITAL LETTER A WITH RING ABOVE",
 445     A_MACRON        => "LATIN CAPITAL LETTER A WITH MACRON",
 446     );
 447
 448 =head2 Alias shortcut
 449
 450     use charnames ":alias" => ":pro";
 451
 452     works exactly the same as the alias pairs, only this time,
 453     ":full" is inserted automatically as first argument (if no
 454     other argument is given).
 455
 456 =head1 charnames::viacode(code)
 457
 458 Returns the full name of the character indicated by the numeric code.
 459 The example
 460
 461     print charnames::viacode(0x2722);
 462
 463 prints "FOUR TEARDROP-SPOKED ASTERISK".
 464
 465 Returns undef if no name is known for the code.
 466
 467 This works only for the standard names, and does not yet apply
 468 to custom translators.
 469
 470 Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
 471 SPACE", not "BYTE ORDER MARK".
 472
 473 =head1 charnames::vianame(name)
 474
 475 Returns the code point indicated by the name.
 476 The example
 477
 478     printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
 479
 480 prints "2722".
 481
 482 Returns undef if the name is unknown.
 483
 484 This works only for the standard names, and does not yet apply
 485 to custom translators.
 486
 487 =head1 ALIASES
 488
 489 A few aliases have been defined for convenience: instead of having
 490 to use the official names
 491
 492     LINE FEED (LF)
 493     FORM FEED (FF)
 494     CARRIAGE RETURN (CR)
 495     NEXT LINE (NEL)
 496
 497 (yes, with parentheses) one can use
 498
 499     LINE FEED
 500     FORM FEED
 501     CARRIAGE RETURN
 502     NEXT LINE
 503     LF
 504     FF
 505     CR
 506     NEL
 507
 508 One can also use
 509
 510     BYTE ORDER MARK
 511     BOM
 512
 513 and
 514
 515     ZWNJ
 516     ZWJ
 517
 518 for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
 519
 520 For backward compatibility one can use the old names for
 521 certain C0 and C1 controls
 522
 523     old                         new
 524
 525     HORIZONTAL TABULATION       CHARACTER TABULATION
 526     VERTICAL TABULATION         LINE TABULATION
 527     FILE SEPARATOR              INFORMATION SEPARATOR FOUR
 528     GROUP SEPARATOR             INFORMATION SEPARATOR THREE
 529     RECORD SEPARATOR            INFORMATION SEPARATOR TWO
 530     UNIT SEPARATOR              INFORMATION SEPARATOR ONE
 531     PARTIAL LINE DOWN           PARTIAL LINE FORWARD
 532     PARTIAL LINE UP             PARTIAL LINE BACKWARD
 533
 534 but the old names in addition to giving the character
 535 will also give a warning about being deprecated.
 536
 537 =head1 ILLEGAL CHARACTERS
 538
 539 If you ask by name for a character that does not exist, a warning is
 540 given and the Unicode I<replacement character> "\x{FFFD}" is returned.
 541
 542 If you ask by code for a character that does not exist, no warning is
 543 given and C<undef> is returned.  (Though if you ask for a code point
 544 past U+10FFFF you do get a warning.)
 545
 546 =head1 BUGS
 547
 548 Since evaluation of the translation function happens in a middle of
 549 compilation (of a string literal), the translation function should not
 550 do any C<eval>s or C<require>s.  This restriction should be lifted in
 551 a future version of Perl.
 552
 553 =cut