lib/charnames.pm

   1 package charnames;
   2
   3 our $VERSION = '1.00';
   4
   5 use bytes ();           # for $bytes::hint_bits
   6 use warnings();
   7 $charnames::hint_bits = 0x20000;
   8
   9 my $txt;
  10
  11 # This is not optimized in any way yet
  12 sub charnames {
  13   $name = shift;
  14   $txt = do "unicode/Name.pl" unless $txt;
  15   my @off;
  16   if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
  17     @off = ($-[0], $+[0]);
  18   }
  19   unless (@off) {
  20     if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
  21       my ($script, $cname) = ($1,$2);
  22       my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  23       if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
  24         @off = ($-[0], $+[0]);
  25       }
  26     }
  27   }
  28   unless (@off) {
  29     my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
  30     for ( @{$^H{charnames_scripts}} ) {
  31       (@off = ($-[0], $+[0])), last
  32         if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
  33     }
  34   }
  35   die "Unknown charname '$name'" unless @off;
  36
  37   my $hexlen = 4; # Unicode guarantees 4-, 5-, or 6-digit format
  38   $hexlen++ while
  39       $hexlen < 6 && substr($txt, $off[0] - $hexlen - 1, 1) =~ /[0-9a-f]/;
  40   my $ord = hex substr $txt, $off[0] - $hexlen, $hexlen;
  41   if ($^H & $bytes::hint_bits) {        # "use bytes" in effect?
  42     use bytes;
  43     return chr $ord if $ord <= 255;
  44     my $hex = sprintf '%X=0%o', $ord, $ord;
  45     my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  46     die "Character 0x$hex with name '$fname' is above 0xFF";
  47   }
  48   return chr $ord;
  49 }
  50
  51 sub import {
  52   shift;
  53   die "`use charnames' needs explicit imports list" unless @_;
  54   $^H |= $charnames::hint_bits;
  55   $^H{charnames} = \&charnames ;
  56   my %h;
  57   @h{@_} = (1) x @_;
  58   $^H{charnames_full} = delete $h{':full'};
  59   $^H{charnames_short} = delete $h{':short'};
  60   $^H{charnames_scripts} = [map uc, keys %h];
  61   if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
  62         $txt = do "unicode/Name.pl" unless $txt;
  63     for (@{$^H{charnames_scripts}}) {
  64         warnings::warn('utf8',  "No such script: '$_'") unless
  65             $txt =~ m/\t\t$_ (?:CAPITAL |SMALL )?LETTER /;
  66         }
  67   }
  68 }
  69
  70
  71 1;
  72 __END__
  73
  74 =head1 NAME
  75
  76 charnames - define character names for C<\N{named}> string literal escape.
  77
  78 =head1 SYNOPSIS
  79
  80   use charnames ':full';
  81   print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
  82
  83   use charnames ':short';
  84   print "\N{greek:Sigma} is an upper-case sigma.\n";
  85
  86   use charnames qw(cyrillic greek);
  87   print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
  88
  89 =head1 DESCRIPTION
  90
  91 Pragma C<use charnames> supports arguments C<:full>, C<:short> and
  92 script names.  If C<:full> is present, for expansion of
  93 C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
  94 standard Unicode names of chars.  If C<:short> is present, and
  95 C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
  96 as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
  97 with script name arguments, then for C<\N{CHARNAME}}> the name
  98 C<CHARNAME> is looked up as a letter in the given scripts (in the
  99 specified order).
 100
 101 For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
 102 this pragma looks for the names
 103
 104   SCRIPTNAME CAPITAL LETTER CHARNAME
 105   SCRIPTNAME SMALL LETTER CHARNAME
 106   SCRIPTNAME LETTER CHARNAME
 107
 108 in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
 109 then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
 110 ignored.
 111
 112 =head1 CUSTOM TRANSLATORS
 113
 114 The mechanism of translation of C<\N{...}> escapes is general and not
 115 hardwired into F<charnames.pm>.  A module can install custom
 116 translations (inside the scope which C<use>s the module) with the
 117 following magic incantation:
 118
 119     use charnames ();           # for $charnames::hint_bits
 120     sub import {
 121         shift;
 122         $^H |= $charnames::hint_bits;
 123         $^H{charnames} = \&translator;
 124     }
 125
 126 Here translator() is a subroutine which takes C<CHARNAME> as an
 127 argument, and returns text to insert into the string instead of the
 128 C<\N{CHARNAME}> escape.  Since the text to insert should be different
 129 in C<bytes> mode and out of it, the function should check the current
 130 state of C<bytes>-flag as in:
 131
 132     use bytes ();                       # for $bytes::hint_bits
 133     sub translator {
 134         if ($^H & $bytes::hint_bits) {
 135             return bytes_translator(@_);
 136         }
 137         else {
 138             return utf8_translator(@_);
 139         }
 140     }
 141
 142 =head1 BUGS
 143
 144 Since evaluation of the translation function happens in a middle of
 145 compilation (of a string literal), the translation function should not
 146 do any C<eval>s or C<require>s.  This restriction should be lifted in
 147 a future version of Perl.
 148
 149 =cut