[perl5.git] / lib / charnames.pm

package charnames;

# Path of a Unicode data file.  NOTE(review): no code visible in this file
# reads this variable -- charnames() declares its own lexical $fname and
# loads "unicode/Name.pl" instead; confirm it is unused before removing.
my $fname = 'unicode/UnicodeData-Latest.txt';
# Cache for the name table returned by `do "unicode/Name.pl"'; filled in by
# charnames() the first time it is called.
my $txt;

# Translate the name written inside a \N{...} escape into the character it
# names.  import() installs this sub as $^H{charnames}.
#
# Argument: the name as written in the escape, e.g.
#           "GREEK SMALL LETTER SIGMA", "greek:Sigma" or "sigma".
# Returns:  a one-character string, chr() of the code found in the table.
# Dies:     if the name table cannot be loaded, if none of the lookup styles
#           enabled in %^H finds the name, or if the code is above 0xFF
#           while the 0x8 bit of $^H is off.
#
# The lookups below rely on each table line holding four hex digits, two
# tabs, and then the name running up to the end of the line.
#
# This is not optimized in any way yet
sub charnames {
  # Lexical, so that a lookup does not clobber a package global $name.
  my $name = shift;

  # Load the table once and keep it in the file-scoped $txt.
  unless ($txt) {
    $txt = do "unicode/Name.pl";
    # Report a load failure as such, not as a misleading "Unknown charname".
    die "Cannot load unicode/Name.pl: " . ($@ || $!) unless defined $txt;
  }

  # Everything taken from the escape or from the script list is quoted
  # before it goes into a pattern, so regex metacharacters match literally:
  # a '.' must not match an arbitrary character, and an unbalanced '(' must
  # not abort with a regex compile error.

  # @off receives the start and end offsets of the matching "\t\tNAME".
  my @off;

  # :full -- the name must equal a complete table name.
  if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
    @off = ($-[0], $+[0]);
  }

  # :short -- "SCRIPT:CNAME" is looked up as a letter of that script.
  unless (@off) {
    if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
      my ($script, $cname) = ($1,$2);
      # Any uppercase letter in CNAME selects the CAPITAL variant.
      my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
      my $uc_script = quotemeta uc $script;
      my $uc_cname  = quotemeta uc $cname;
      if ($txt =~ m/\t\t$uc_script (?:$case )?LETTER $uc_cname$/m) {
        @off = ($-[0], $+[0]);
      }
    }
  }

  # Script list -- try each script recorded by import(); first hit wins.
  unless (@off) {
    my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
    my $uc_name = quotemeta uc $name;
    for my $script ( @{ $^H{charnames_scripts} || [] } ) {
      if ($txt =~ m/\t\t\Q$script\E (?:$case )?LETTER $uc_name$/m) {
        @off = ($-[0], $+[0]);
        last;
      }
    }
  }
  die "Unknown charname '$name'" unless @off;

  # use caller 'encoding';	# Does not work at compile time?

  # The four characters just before the two tabs are the hex code.
  my $ord = hex substr $txt, $off[0] - 4, 4;
  # The POD of this file describes the 0x8 bit of $^H as the utf8 flag.
  if ($^H & 0x8) {
    # Lexically scoped to this block; presumably lets chr() produce
    # characters above 0xFF -- TODO confirm.
    use utf8;
    return chr $ord;
  }
  return chr $ord if $ord <= 255;

  # Outside utf8 mode a code above 0xFF cannot be returned; report it in
  # hex and octal together with the table name that matched.
  my $hex = sprintf '%X=0%o', $ord, $ord;
  my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
  die "Character 0x$hex with name '$fname' is above 0xFF";
}

# Handle `use charnames LIST': install charnames() as the \N{...}
# translator and record which lookup styles were requested in %^H.
#
# Arguments (after the class name): any mix of ':full', ':short' and script
# names.  Stores in %^H:
#   charnames          reference to the translator sub
#   charnames_full     1 if ':full' was given, undef otherwise
#   charnames_short    1 if ':short' was given, undef otherwise
#   charnames_scripts  array ref of the upper-cased script names, in the
#                      order they were given (exact duplicates dropped)
# Dies if LIST is empty.
sub import {
  shift;
  die "No scripts for `use charnames'" unless @_;
  # 0x20000 is the hint bit set together with %^H, as in the "magic
  # incantation" shown in this file's POD.
  $^H |= 0x20000;
  $^H{charnames} = \&charnames ;

  # Walk the arguments in order: the POD promises that scripts are searched
  # "in the specified order", which taking them from keys() of a hash
  # cannot guarantee.
  my (%seen, @scripts);
  for my $arg (@_) {
    next if $seen{$arg}++;
    next if $arg eq ':full' or $arg eq ':short';
    push @scripts, uc $arg;
  }
  $^H{charnames_full} = $seen{':full'} ? 1 : undef;
  $^H{charnames_short} = $seen{':short'} ? 1 : undef;
  $^H{charnames_scripts} = \@scripts;
}


1;
__END__

=head1 NAME

charnames - define character names for C<\N{named}> string literal escape.

=head1 SYNOPSIS

  use charnames ':full';
  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";

  use charnames ':short';
  print "\N{greek:Sigma} is an upper-case sigma.\n";

  use charnames qw(cyrillic greek);
  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";

=head1 DESCRIPTION

Pragma C<use charnames> supports arguments C<:full>, C<:short> and
script names.  If C<:full> is present, for expansion of
C<\N{CHARNAME}> string C<CHARNAME> is first looked up in the list of
standard Unicode names of chars.  If C<:short> is present, and
C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
with script name arguments, then for C<\N{CHARNAME}> the name
C<CHARNAME> is looked up as a letter in the given scripts (in the
specified order).

For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
F<charnames.pm> looks for the names

  SCRIPTNAME CAPITAL LETTER CHARNAME
  SCRIPTNAME SMALL LETTER CHARNAME
  SCRIPTNAME LETTER CHARNAME

in the table of standard Unicode names.  If C<CHARNAME> is all lowercase,
then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
ignored.

=head1 CUSTOM TRANSLATORS

The mechanism of translation of C<\N{...}> escapes is general and not
hardwired into F<charnames.pm>.  A module can install custom
translations (inside the scope which C<use>s the module) by the
following magic incantation:

  sub import {
    shift;
    $^H |= 0x20000;
    $^H{charnames} = \&translator;
  }

Here translator() is a subroutine which takes C<CHARNAME> as an
argument, and returns text to insert into the string instead of the
C<\N{CHARNAME}> escape.  Since the text to insert should be different
in C<utf8> mode and out of it, the function should check the current
state of C<utf8>-flag as in

  sub translator {
    if ($^H & 0x8) {
      return utf_translator(@_);
    } else {
      return no_utf_translator(@_);
    }
  }

=head1 BUGS

Since evaluation of the translation function happens in the middle of
compilation (of a string literal), the translation function should not
do any C<eval>s or C<require>s.  This restriction should be lifted in
a future version of Perl.

=cut
Commit	Line	Data
423cee85 JH	1	package charnames;
	2
	3	my $fname = 'unicode/UnicodeData-Latest.txt';
	4	my $txt;
	5
	6	# This is not optimized in any way yet
	7	sub charnames {
	8	$name = shift;
	9	$txt = do "unicode/Name.pl" unless $txt;
	10	my @off;
	11	if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
	12	@off = ($-[0], $+[0]);
	13	}
	14	unless (@off) {
	15	if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
	16	my ($script, $cname) = ($1,$2);
	17	my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
	18	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
	19	@off = ($-[0], $+[0]);
	20	}
	21	}
	22	}
	23	unless (@off) {
	24	my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
	25	for ( @{$^H{charnames_scripts}} ) {
	26	(@off = ($-[0], $+[0])), last
	27	if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
	28	}
	29	}
	30	die "Unknown charname '$name'" unless @off;
	31
	32	# use caller 'encoding'; # Does not work at compile time?
	33
	34	my $ord = hex substr $txt, $off[0] - 4, 4;
	35	if ($^H & 0x8) {
	36	use utf8;
	37	return chr $ord;
	38	}
	39	return chr $ord if $ord <= 255;
	40	my $hex = sprintf '%X=0%o', $ord, $ord;
	41	my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
	42	die "Character 0x$hex with name '$fname' is above 0xFF";
	43	}
	44
	45	sub import {
	46	shift;
	47	die "No scripts for `use charnames'" unless @_;
	48	$^H |= 0x20000;
	49	$^H{charnames} = \&charnames ;
	50	my %h;
	51	@h{@_} = (1) x @_;
	52	$^H{charnames_full} = delete $h{':full'};
	53	$^H{charnames_short} = delete $h{':short'};
	54	$^H{charnames_scripts} = [map uc, keys %h];
	55	}
	56
	57
	58	1;
	59	__END__
	60
	61	=head1 NAME
	62
4a2d328f	63	charnames - define character names for C<\N{named}> string literal escape.
423cee85 JH	64
	65	=head1 SYNOPSIS
	66
	67	use charnames ':full';
4a2d328f	68	print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85 JH	69
423cee85 JH	70	use charnames ':short';
4a2d328f	71	print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85 JH	72
423cee85 JH	73	use charnames qw(cyrillic greek);
4a2d328f	74	print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85 JH	75
	76	=head1 DESCRIPTION
	77
	78	Pragma C<use charnames> supports arguments C<:full>, C<:short> and
	79	script names. If C<:full> is present, for expansion of
4a2d328f	80	C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85 JH	81	standard Unicode names of chars. If C<:short> is present, and
	82	C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
	83	as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f	84	with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85 JH	85	C<CHARNAME> is looked up as a letter in the given scripts (in the
	86	specified order).
	87
	88	For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
	89	F<charcodes.pm> looks for the names
	90
	91	SCRIPTNAME CAPITAL LETTER CHARNAME
	92	SCRIPTNAME SMALL LETTER CHARNAME
	93	SCRIPTNAME LETTER CHARNAME
	94
	95	in the table of standard Unicode names. If C<CHARNAME> is lowercase,
	96	then the C<CAPITAL> variant is ignored, otherwise C<SMALL> variant is
	97	ignored.
	98
	99	=head1 CUSTOM TRANSLATORS
	100
4a2d328f	101	The mechanism of translation is C<\N{...}> escapes is general and not
423cee85 JH	102	hardwired into F<charnames.pm>. A module can install custom
	103	translations (inside the scope which C<use>s the module) by the
	104	following magic incantation:
	105
	106	sub import {
	107	shift;
	108	$^H |= 0x20000;
	109	$^H{charnames} = \&translator;
	110	}
	111
	112	Here translator() is a subroutine which takes C<CHARNAME> as an
	113	argument, and returns text to insert into the string instead of the
4a2d328f	114	C<\N{CHARNAME}> escape. Since the text to insert should be different
423cee85 JH	115	in C<utf8> mode and out of it, the function should check the current
	116	state of C<utf8>-flag as in
	117
	118	sub translator {
	119	if ($^H & 0x8) {
	120	return utf_translator(@_);
	121	} else {
	122	return no_utf_translator(@_);
	123	}
	124	}
	125
	126	=head1 BUGS
	127
	128	Since evaluation of the translation function happens in a middle of
	129	compilation (of a string literal), the translation function should not
	130	do any C<eval>s or C<require>s. This restriction should be lifted in
	131	a future version of Perl.
	132
	133	=cut
	134