This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Unicode::UCD on \p{In...}.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed
JH
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6f50a187 6our $VERSION = '3.1.0';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
e882dd67 11our @EXPORT_OK = qw(charinfo charblock charscript);
561c79ed
JH
12
13use Carp;
14
15=head1 NAME
16
00f2772c 17Unicode::UCD - Unicode character database
561c79ed
JH
18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
e882dd67 25 my %charinfo = charinfo($codepoint);
561c79ed
JH
26
27 use Unicode::UCD 'charblock';
e882dd67
JH
28 my $charblock = charblock($codepoint);
29
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
561c79ed
JH
32
33=head1 DESCRIPTION
34
35The Unicode module offers a simple interface to the Unicode Character
36Database.
37
38=cut
39
40my $UNICODE;
41my $BLOCKS;
e882dd67 42my $SCRIPTS;
561c79ed
JH
43
44sub openunicode {
45 my ($rfh, @path) = @_;
46 my $f;
47 unless (defined $$rfh) {
48 for my $d (@INC) {
49 use File::Spec;
50 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 51 last if open($$rfh, $f);
e882dd67 52 undef $f;
561c79ed 53 }
e882dd67
JH
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
56 unless defined $f;
561c79ed
JH
57 }
58 return $f;
59}
60
61=head2 charinfo
62
63 use Unicode::UCD 'charinfo';
64
65 my %charinfo = charinfo(0x41);
66
67charinfo() returns a hash that has the following fields as defined
68by the Unicode standard:
69
70 key
71
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
e882dd67 87
561c79ed 88 block block the character belongs to (used in \p{In...})
e882dd67 89 script script the character belongs to
561c79ed
JH
90
91If no match is found, an empty hash is returned.
92
32c16050
JH
93The C<block> property is the same as as returned by charinfo(). It is
94not defined in the Unicode Character Database proper (Chapter 4 of the
95Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 96of TUS3). Similarly for the C<script> property.
32c16050
JH
97
98Note that you cannot do (de)composition and casing based solely on the
99above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67
JH
100you will need also the I<Composition Exclusions>, I<Case Folding>, and
101I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed
JH
103
104=cut
105
106sub charinfo {
107 my $code = shift;
108 my $hexk = sprintf("%04X", $code);
109
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
112 use Search::Dict;
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
115 chomp $line;
116 my %prop;
117 @prop{qw(
118 code name category
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
122 upper lower title
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
a196fbfd
JH
125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
561c79ed
JH
127 return %prop;
128 }
129 }
130 }
131 return;
132}
133
e882dd67
JH
134sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
136
137 return if $lo > $hi;
138
139 my $mid = int(($lo+$hi) / 2);
140
141 if ($table->[$mid]->[0] < $code) {
a196fbfd 142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
e882dd67
JH
143 return $table->[$mid]->[2];
144 } else {
145 _search($table, $mid + 1, $hi, $code);
146 }
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
149 } else {
150 return $table->[$mid]->[2];
151 }
152}
153
354a27bf 154=head2 charblock
561c79ed
JH
155
156 use Unicode::UCD 'charblock';
157
158 my $charblock = charblock(0x41);
159
160charblock() returns the block the character belongs to, e.g.
161C<Basic Latin>. Note that not all the character positions within all
e882dd67 162blocks are defined.
561c79ed 163
561c79ed
JH
164=cut
165
166my @BLOCKS;
167
561c79ed
JH
168sub charblock {
169 my $code = shift;
170
171 unless (@BLOCKS) {
2796c109 172 if (openunicode(\$BLOCKS, "Blocks.txt")) {
561c79ed 173 while (<$BLOCKS>) {
2796c109 174 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
561c79ed
JH
175 push @BLOCKS, [ hex($1), hex($2), $3 ];
176 }
177 }
178 close($BLOCKS);
179 }
180 }
181
e882dd67
JH
182 _search(\@BLOCKS, 0, $#BLOCKS, $code);
183}
184
185=head2 charscript
186
187 use Unicode::UCD 'charscript';
188
189 my $charscript = charscript(0x41);
190
191charscript() returns the script the character belongs to, e.g.
ad9cab37 192C<Latin>, C<Greek>, C<Han>.
e882dd67 193
e882dd67
JH
194=cut
195
196my @SCRIPTS;
197
198sub charscript {
199 my $code = shift;
200
201 unless (@SCRIPTS) {
202 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
203 while (<$SCRIPTS>) {
204 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
205 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
206 }
207 }
208 close($SCRIPTS);
209 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
210 }
211 }
212
213 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
561c79ed
JH
214}
215
ad9cab37
JH
216=head2 charblock versus charscript
217
218The difference between a character block and a script is that scripts
219are closer to the linguistic notion of a set of characters required to
220present languages, while block is more of an artifact of the Unicode
3aa957f9
JH
221character numbering and separation into blocks of 256 characters.
222
223For example the Latin B<script> is spread over several B<blocks>, such
224as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
225C<Latin Extended-B>. On the other hand, the Latin script does not
226contain all the characters of the C<Basic Latin> block (also known as
227the ASCII): it includes only the letters, not for example the digits
228or the punctuation.
ad9cab37 229
3aa957f9 230For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
231
232For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
233
3aa957f9
JH
234=head2 Matching Scripts and Blocks
235
236Both scripts and blocks can be matched using the regular expression
237construct C<\p{In...}> and its negation C<\P{In...}>.
238
239The name of the script or the block comes after the C<In>, for example
240C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
241squished away from the names for the C<\p{In...}>, for example
242C<LatinExtendedA> instead of C<Latin Extended-A>. There are a few
243cases where there exists both a script and a block by the same name,
244in these cases the block version has C<Block> appended: C<\p{InKatakana}>
245is the script, C<\p{InKatakanaBlock}> is the block.
246
247=head2 Implementation Note
32c16050 248
ad9cab37
JH
249The first use of charinfo() opens a read-only filehandle to the Unicode
250Character Database (the database is included in the Perl distribution).
251The filehandle is then kept open for further queries.
32c16050 252
561c79ed
JH
253=head1 AUTHOR
254
255Jarkko Hietaniemi
256
257=cut
258
2591;