This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] fix for charnames above FFFF
[perl5.git] / lib / charnames.pm
CommitLineData
423cee85 1package charnames;
b75c8c73
MS
2
3our $VERSION = '1.00';
4
d5448623 5use bytes (); # for $bytes::hint_bits
bd62941a 6use warnings();
d5448623 7$charnames::hint_bits = 0x20000;
423cee85 8
423cee85
JH
9my $txt;
10
11# This is not optimized in any way yet
12sub charnames {
13 $name = shift;
14 $txt = do "unicode/Name.pl" unless $txt;
15 my @off;
16 if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
17 @off = ($-[0], $+[0]);
18 }
19 unless (@off) {
20 if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
21 my ($script, $cname) = ($1,$2);
22 my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
23 if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
24 @off = ($-[0], $+[0]);
25 }
26 }
27 }
28 unless (@off) {
29 my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
30 for ( @{$^H{charnames_scripts}} ) {
31 (@off = ($-[0], $+[0])), last
32 if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
33 }
34 }
35 die "Unknown charname '$name'" unless @off;
b896c7a5 36
2f430fd2
A
37 my $hexlen = 4; # Unicode guarantees 4-, 5-, or 6-digit format
38 $hexlen++ while
39 $hexlen < 6 && substr($txt, $off[0] - $hexlen - 1, 1) =~ /[0-9a-f]/;
b896c7a5 40 my $ord = hex substr $txt, $off[0] - $hexlen, $hexlen;
d5448623 41 if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
8058d7ab 42 use bytes;
d41ff1b8
GS
43 return chr $ord if $ord <= 255;
44 my $hex = sprintf '%X=0%o', $ord, $ord;
45 my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
46 die "Character 0x$hex with name '$fname' is above 0xFF";
423cee85 47 }
d41ff1b8 48 return chr $ord;
423cee85
JH
49}
50
51sub import {
52 shift;
d5448623
GS
53 die "`use charnames' needs explicit imports list" unless @_;
54 $^H |= $charnames::hint_bits;
423cee85
JH
55 $^H{charnames} = \&charnames ;
56 my %h;
57 @h{@_} = (1) x @_;
58 $^H{charnames_full} = delete $h{':full'};
59 $^H{charnames_short} = delete $h{':short'};
60 $^H{charnames_scripts} = [map uc, keys %h];
bd62941a
DD
61 if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
62 $txt = do "unicode/Name.pl" unless $txt;
63 for (@{$^H{charnames_scripts}}) {
64 warnings::warn('utf8', "No such script: '$_'") unless
65 $txt =~ m/\t\t$_ (?:CAPITAL |SMALL )?LETTER /;
66 }
67 }
423cee85
JH
68}
69
70
711;
72__END__
73
74=head1 NAME
75
4a2d328f 76charnames - define character names for C<\N{named}> string literal escape.
423cee85
JH
77
78=head1 SYNOPSIS
79
80 use charnames ':full';
4a2d328f 81 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
423cee85
JH
82
83 use charnames ':short';
4a2d328f 84 print "\N{greek:Sigma} is an upper-case sigma.\n";
423cee85
JH
85
86 use charnames qw(cyrillic greek);
4a2d328f 87 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
423cee85
JH
88
89=head1 DESCRIPTION
90
91Pragma C<use charnames> supports arguments C<:full>, C<:short> and
92script names. If C<:full> is present, for expansion of
4a2d328f 93C<\N{CHARNAME}}> string C<CHARNAME> is first looked in the list of
423cee85
JH
94standard Unicode names of chars. If C<:short> is present, and
95C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
96as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
4a2d328f 97with script name arguments, then for C<\N{CHARNAME}}> the name
423cee85
JH
98C<CHARNAME> is looked up as a letter in the given scripts (in the
99specified order).
100
101For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
d5448623 102this pragma looks for the names
423cee85
JH
103
104 SCRIPTNAME CAPITAL LETTER CHARNAME
105 SCRIPTNAME SMALL LETTER CHARNAME
106 SCRIPTNAME LETTER CHARNAME
107
108in the table of standard Unicode names. If C<CHARNAME> is lowercase,
d5448623 109then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant is
423cee85
JH
110ignored.
111
112=head1 CUSTOM TRANSLATORS
113
d5448623 114The mechanism of translation of C<\N{...}> escapes is general and not
423cee85 115hardwired into F<charnames.pm>. A module can install custom
d5448623 116translations (inside the scope which C<use>s the module) with the
423cee85
JH
117following magic incantation:
118
d5448623
GS
119 use charnames (); # for $charnames::hint_bits
120 sub import {
121 shift;
122 $^H |= $charnames::hint_bits;
123 $^H{charnames} = \&translator;
124 }
423cee85
JH
125
126Here translator() is a subroutine which takes C<CHARNAME> as an
127argument, and returns text to insert into the string instead of the
4a2d328f 128C<\N{CHARNAME}> escape. Since the text to insert should be different
d5448623
GS
129in C<bytes> mode and out of it, the function should check the current
130state of C<bytes>-flag as in:
131
132 use bytes (); # for $bytes::hint_bits
133 sub translator {
134 if ($^H & $bytes::hint_bits) {
135 return bytes_translator(@_);
136 }
137 else {
138 return utf8_translator(@_);
139 }
423cee85 140 }
423cee85
JH
141
142=head1 BUGS
143
144Since evaluation of the translation function happens in a middle of
145compilation (of a string literal), the translation function should not
146do any C<eval>s or C<require>s. This restriction should be lifted in
147a future version of Perl.
148
149=cut