lib/charnames.pm

   1 package charnames;
   2 use strict;
   3 use warnings;
   4 use Carp;
   5 our $VERSION = '1.01';
   6
   7 use bytes ();           # for $bytes::hint_bits
   8 $charnames::hint_bits = 0x20000;
   9
  10 my $txt;
  11
  12 # This is not optimized in any way yet
  13 sub charnames
  14 {
  15   my $name = shift;
  16
  17   ## Suck in the code/name list as a big string.
  18   ## Lines look like:
  19   ##     "0052\t\tLATIN CAPITAL LETTER R\n"
  20   $txt = do "unicore/Name.pl" unless $txt;
  21
  22   ## @off will hold the index into the code/name string of the start and
  23   ## end of the name as we find it.
  24   my @off;
  25
  26   ## If :full, look for the the name exactly
  27   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  28     @off = ($-[0], $+[0]);
  29   }
  30
  31   ## If we didn't get above, and :short allowed, look for the short name.
  32   ## The short name is like "greek:Sigma"
  33   unless (@off) {
  34     if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
  35       my ($script, $cname) = ($1,$2);
  36       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  37       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  38         @off = ($-[0], $+[0]);
  39       }
  40     }
  41   }
  42
  43   ## If we still don't have it, check for the name among the loaded
  44   ## scripts.
  45   if (not @off)
  46   {
  47       my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  48       for my $script ( @{$^H{charnames_scripts}} )
  49       {
  50           if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
  51               @off = ($-[0], $+[0]);
  52               last;
  53           }
  54       }
  55   }
  56
  57   ## If we don't have it by now, give up.
  58   unless (@off) {
  59       carp "Unknown charname '$name'";
  60       return "\x{FFFD}";
  61   }
  62
  63   ##
  64   ## Now know where in the string the name starts.
  65   ## The code, in hex, is befor that.
  66   ##
  67   ## The code can be 4-6 characters long, so we've got to sort of
  68   ## go look for it, just after the newline that comes before $off[0].
  69   ##
  70   ## This would be much easier if unicore/Name.pl had info in
  71   ## a name/code order, instead of code/name order.
  72   ##
  73   ## The +1 after the rindex() is to skip past the newline we're finding,
  74   ## or, if the rindex() fails, to put us to an offset of zero.
  75   ##
  76   my $hexstart = rindex($txt, "\n", $off[0]) + 1;
  77
  78   ## we know where it starts, so turn into number - the ordinal for the char.
  79   my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
  80
  81   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  82     use bytes;
  83     return chr $ord if $ord <= 255;
  84     my $hex = sprintf "%04x", $ord;
  85     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  86     croak "Character 0x$hex with name '$fname' is above 0xFF";
  87   }
  88
  89   return pack "U", $ord;
  90 }
  91
  92 sub import
  93 {
  94   shift; ## ignore class name
  95
  96   if (not @_)
  97   {
  98       carp("`use charnames' needs explicit imports list");
  99   }
 100   $^H |= $charnames::hint_bits;
 101   $^H{charnames} = \&charnames ;
 102
 103   ##
 104   ## fill %h keys with our @_ args.
 105   ##
 106   my %h;
 107   @h{@_} = (1) x @_;
 108
 109   $^H{charnames_full} = delete $h{':full'};
 110   $^H{charnames_short} = delete $h{':short'};
 111   $^H{charnames_scripts} = [map uc, keys %h];
 112
 113   ##
 114   ## If utf8? warnings are enabled, and some scripts were given,
 115   ## see if at least we can find one letter of each script.
 116   ##
 117   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
 118   {
 119       $txt = do "unicore/Name.pl" unless $txt;
 120
 121       for my $script (@{$^H{charnames_scripts}})
 122       {
 123           if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
 124               warnings::warn('utf8',  "No such script: '$script'");
 125           }
 126       }
 127   }
 128 }
 129
 130 require Unicode::UCD; # for Unicode::UCD::_getcode()
 131
 132 my %viacode;
 133
 134 sub viacode
 135 {
 136     if (@_ != 1) {
 137         carp "charnames::viacode() expects one numeric argument";
 138         return ()
 139     }
 140
 141     my $arg = shift;
 142     my $code = Unicode::UCD::_getcode($arg);
 143
 144     my $hex;
 145
 146     if (defined $code) {
 147         $hex = sprintf "%04X", $arg;
 148     } else {
 149         carp("unexpected arg \"$arg\" to charnames::viacode()");
 150         return;
 151     }
 152
 153     if ($code > 0x10FFFF) {
 154         carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
 155         return "\x{FFFD}";
 156     }
 157
 158     return $viacode{$hex} if exists $viacode{$hex};
 159
 160     $txt = do "unicore/Name.pl" unless $txt;
 161
 162     if ($txt =~ m/^$hex\t\t(.+)/m) {
 163         return $viacode{$hex} = $1;
 164     } else {
 165         return;
 166     }
 167 }
 168
 169 my %vianame;
 170
 171 sub vianame
 172 {
 173     if (@_ != 1) {
 174         carp "charnames::vianame() expects one name argument";
 175         return ()
 176     }
 177
 178     my $arg = shift;
 179
 180     return $vianame{$arg} if exists $vianame{$arg};
 181
 182     $txt = do "unicore/Name.pl" unless $txt;
 183
 184     if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
 185         return $vianame{$arg} = hex $1;
 186     } else {
 187         return;
 188     }
 189 }
 190
 191
 192 1;
 193 __END__
 194
 195 =head1 NAME
 196
 197 charnames - define character names for C<\N{named}> string literal escapes.
 198
 199 =head1 SYNOPSIS
 200
 201   use charnames ':full';
 202   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
 203
 204   use charnames ':short';
 205   print "\N{greek:Sigma} is an upper-case sigma.\n";
 206
 207   use charnames qw(cyrillic greek);
 208   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
 209
 210   print charname::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
 211   printf "%04X", charname::vianame("GOTHIC LETTER AHSA"); # prints "10330"
 212
 213 =head1 DESCRIPTION
 214
 215 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
 216 script names.  If C<:full> is present, for expansion of
 217 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
 218 standard Unicode names of chars.  If C<:short> is present, and
 219 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
 220 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
 221 with script name arguments, then for C<\N{CHARNAME}}> the name
 222 C<CHARNAME> is looked up as a letter in the given scripts (in the
 223 specified order).
 224
 225 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
 226 this pragma looks for the names
 227
 228   SCRIPTNAME CAPITAL LETTER CHARNAME
 229   SCRIPTNAME SMALL LETTER CHARNAME
 230   SCRIPTNAME LETTER CHARNAME
 231
 232 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 233 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
 234 is ignored.
 235
 236 Note that C<\N{...}> is compile-time, it's a special form of string
 237 constant used inside double-quoted strings: in other words, you cannot
 238 use variables inside the C<\N{...}>.  If you want similar run-time
 239 functionality, use charnames::vianame().
 240
 241 For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
 242 as of Unicode 3.1, there are no official Unicode names but you can
 243 use instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).
 244 In Unicode 3.2 some naming changes will happen since ISO 6429 has been
 245 updated.  Also note that the U+UU80, U+0081, U+0084, and U+0099
 246 do not have names even in ISO 6429.
 247
 248 =head1 CUSTOM TRANSLATORS
 249
 250 The mechanism of translation of C<\N{...}> escapes is general and not
 251 hardwired into F<charnames.pm>.  A module can install custom
 252 translations (inside the scope which C<use>s the module) with the
 253 following magic incantation:
 254
 255     use charnames ();           # for $charnames::hint_bits
 256     sub import {
 257         shift;
 258         $^H |= $charnames::hint_bits;
 259         $^H{charnames} = \&translator;
 260     }
 261
 262 Here translator() is a subroutine which takes C<CHARNAME> as an
 263 argument, and returns text to insert into the string instead of the
 264 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 265 in C<bytes> mode and out of it, the function should check the current
 266 state of C<bytes>-flag as in:
 267
 268     use bytes ();                       # for $bytes::hint_bits
 269     sub translator {
 270         if ($^H & $bytes::hint_bits) {
 271             return bytes_translator(@_);
 272         }
 273         else {
 274             return utf8_translator(@_);
 275         }
 276     }
 277
 278 =head1 charnames::viacode(code)
 279
 280 Returns the full name of the character indicated by the numeric code.
 281 The example
 282
 283     print charnames::viacode(0x2722);
 284
 285 prints "FOUR TEARDROP-SPOKED ASTERISK".
 286
 287 Returns undef if no name is known for the code.
 288
 289 This works only for the standard names, and does not yet aply
 290 to custom translators.
 291
 292 =head1 charnames::vianame(code)
 293
 294 Returns the code point indicated by the name.
 295 The example
 296
 297     printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
 298
 299 prints "2722".
 300
 301 Returns undef if no name is known for the name.
 302
 303 This works only for the standard names, and does not yet aply
 304 to custom translators.
 305
 306 =head1 ILLEGAL CHARACTERS
 307
 308 If you ask for a character that does not exist, a warning is given
 309 and the special Unicode I<replacement character> "\x{FFFD}" is returned.
 310
 311 =head1 BUGS
 312
 313 Since evaluation of the translation function happens in a middle of
 314 compilation (of a string literal), the translation function should not
 315 do any C<eval>s or C<require>s.  This restriction should be lifted in
 316 a future version of Perl.
 317
 318 =cut