This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Todo updates.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed
JH
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6f50a187 6our $VERSION = '3.1.0';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
11our @EXPORT_OK = qw(charinfo charblock);
12
13use Carp;
14
15=head1 NAME
16
00f2772c 17Unicode::UCD - Unicode character database
561c79ed
JH
18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
26
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
29
30=head1 DESCRIPTION
31
32The Unicode module offers a simple interface to the Unicode Character
33Database.
34
35=cut
36
37my $UNICODE;
38my $BLOCKS;
39
40sub openunicode {
41 my ($rfh, @path) = @_;
42 my $f;
43 unless (defined $$rfh) {
44 for my $d (@INC) {
45 use File::Spec;
46 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 47 last if open($$rfh, $f);
561c79ed
JH
48 }
49 croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
50 unless defined $rfh;
51 }
52 return $f;
53}
54
55=head2 charinfo
56
57 use Unicode::UCD 'charinfo';
58
59 my %charinfo = charinfo(0x41);
60
61charinfo() returns a hash that has the following fields as defined
62by the Unicode standard:
63
64 key
65
66 code code point with at least four hexdigits
67 name name of the character IN UPPER CASE
68 category general category of the character
69 combining classes used in the Canonical Ordering Algorithm
70 bidi bidirectional category
71 decomposition character decomposition mapping
72 decimal if decimal digit this is the integer numeric value
73 digit if digit this is the numeric value
74 numeric if numeric is the integer or rational numeric value
75 mirrored if mirrored in bidirectional text
76 unicode10 Unicode 1.0 name if existed and different
77 comment ISO 10646 comment field
78 upper uppercase equivalent mapping
79 lower lowercase equivalent mapping
80 title titlecase equivalent mapping
81 block block the character belongs to (used in \p{In...})
82
83If no match is found, an empty hash is returned.
84
32c16050
JH
85The C<block> property is the same as as returned by charinfo(). It is
86not defined in the Unicode Character Database proper (Chapter 4 of the
87Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
88of TUS3).
89
90Note that you cannot do (de)composition and casing based solely on the
91above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
92you will need also the I<Composition Exclusions> and I<SpecialCasing>
93tables, available as files F<CompExcl.txt> and F<SpecCase.txt> in the
94Perl distribution.
561c79ed
JH
95
96=cut
97
98sub charinfo {
99 my $code = shift;
100 my $hexk = sprintf("%04X", $code);
101
102 openunicode(\$UNICODE, "Unicode.txt");
103 if (defined $UNICODE) {
104 use Search::Dict;
105 if (look($UNICODE, "$hexk;") >= 0) {
106 my $line = <$UNICODE>;
107 chomp $line;
108 my %prop;
109 @prop{qw(
110 code name category
111 combining bidi decomposition
112 decimal digit numeric
113 mirrored unicode10 comment
114 upper lower title
115 )} = split(/;/, $line, -1);
116 if ($prop{code} eq $hexk) {
117 $prop{block} = charblock($code);
118 return %prop;
119 }
120 }
121 }
122 return;
123}
124
354a27bf 125=head2 charblock
561c79ed
JH
126
127 use Unicode::UCD 'charblock';
128
129 my $charblock = charblock(0x41);
130
131charblock() returns the block the character belongs to, e.g.
132C<Basic Latin>. Note that not all the character positions within all
133block are defined.
134
135The name is the same name that is used in the C<\p{In...}> construct,
136for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
137away from the names for the C<\p{In...}>.
138
139=cut
140
141my @BLOCKS;
142
143sub _charblock {
144 my ($code, $lo, $hi) = @_;
145
146 return if $lo > $hi;
147
148 my $mid = int(($lo+$hi) / 2);
149
150 if ($BLOCKS[$mid]->[0] < $code) {
151 if ($BLOCKS[$mid]->[1] >= $code) {
152 return $BLOCKS[$mid]->[2];
153 } else {
154 _charblock($code, $mid + 1, $hi);
155 }
156 } elsif ($BLOCKS[$mid]->[0] > $code) {
157 _charblock($code, $lo, $mid - 1);
158 } else {
159 return $BLOCKS[$mid]->[2];
160 }
161}
162
163sub charblock {
164 my $code = shift;
165
166 unless (@BLOCKS) {
167 if (openunicode(\$BLOCKS, "Blocks.pl")) {
168 while (<$BLOCKS>) {
169 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
170 push @BLOCKS, [ hex($1), hex($2), $3 ];
171 }
172 }
173 close($BLOCKS);
174 }
175 }
176
177 _charblock($code, 0, $#BLOCKS);
178}
179
32c16050
JH
180=head1 NOTE
181
182The first use of L<charinfo> opens a read-only filehandle to the Unicode
183Character Database. The filehandle is kept open for further queries.
184
561c79ed
JH
185=head1 AUTHOR
186
187Jarkko Hietaniemi
188
189=cut
190
1911;