This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH lib/ExtUtils.t] Extra Files for QNX
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed
JH
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6f50a187 6our $VERSION = '3.1.0';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
e882dd67 11our @EXPORT_OK = qw(charinfo charblock charscript);
561c79ed
JH
12
13use Carp;
14
15=head1 NAME
16
00f2772c 17Unicode::UCD - Unicode character database
561c79ed
JH
18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
e882dd67 25 my %charinfo = charinfo($codepoint);
561c79ed
JH
26
27 use Unicode::UCD 'charblock';
e882dd67
JH
28 my $charblock = charblock($codepoint);
29
30 use Unicode::UCD 'charscript';
31 my $charscript = charblock($codepoint);
561c79ed
JH
32
33=head1 DESCRIPTION
34
35The Unicode module offers a simple interface to the Unicode Character
36Database.
37
38=cut
39
40my $UNICODE;
41my $BLOCKS;
e882dd67 42my $SCRIPTS;
561c79ed
JH
43
44sub openunicode {
45 my ($rfh, @path) = @_;
46 my $f;
47 unless (defined $$rfh) {
48 for my $d (@INC) {
49 use File::Spec;
50 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 51 last if open($$rfh, $f);
e882dd67 52 undef $f;
561c79ed 53 }
e882dd67
JH
54 croak __PACKAGE__, ": failed to find ",
55 File::Spec->catfile(@path), " in @INC"
56 unless defined $f;
561c79ed
JH
57 }
58 return $f;
59}
60
61=head2 charinfo
62
63 use Unicode::UCD 'charinfo';
64
65 my %charinfo = charinfo(0x41);
66
67charinfo() returns a hash that has the following fields as defined
68by the Unicode standard:
69
70 key
71
72 code code point with at least four hexdigits
73 name name of the character IN UPPER CASE
74 category general category of the character
75 combining classes used in the Canonical Ordering Algorithm
76 bidi bidirectional category
77 decomposition character decomposition mapping
78 decimal if decimal digit this is the integer numeric value
79 digit if digit this is the numeric value
80 numeric if numeric is the integer or rational numeric value
81 mirrored if mirrored in bidirectional text
82 unicode10 Unicode 1.0 name if existed and different
83 comment ISO 10646 comment field
84 upper uppercase equivalent mapping
85 lower lowercase equivalent mapping
86 title titlecase equivalent mapping
e882dd67 87
561c79ed 88 block block the character belongs to (used in \p{In...})
e882dd67 89 script script the character belongs to
561c79ed
JH
90
91If no match is found, an empty hash is returned.
92
32c16050
JH
93The C<block> property is the same as as returned by charinfo(). It is
94not defined in the Unicode Character Database proper (Chapter 4 of the
95Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 96of TUS3). Similarly for the C<script> property.
32c16050
JH
97
98Note that you cannot do (de)composition and casing based solely on the
99above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
e882dd67
JH
100you will need also the I<Composition Exclusions>, I<Case Folding>, and
101I<SpecialCasing> tables, available as files F<CompExcl.txt>,
102F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
561c79ed
JH
103
104=cut
105
106sub charinfo {
107 my $code = shift;
108 my $hexk = sprintf("%04X", $code);
109
110 openunicode(\$UNICODE, "Unicode.txt");
111 if (defined $UNICODE) {
112 use Search::Dict;
113 if (look($UNICODE, "$hexk;") >= 0) {
114 my $line = <$UNICODE>;
115 chomp $line;
116 my %prop;
117 @prop{qw(
118 code name category
119 combining bidi decomposition
120 decimal digit numeric
121 mirrored unicode10 comment
122 upper lower title
123 )} = split(/;/, $line, -1);
124 if ($prop{code} eq $hexk) {
a196fbfd
JH
125 $prop{block} = charblock($code);
126 $prop{script} = charscript($code);
561c79ed
JH
127 return %prop;
128 }
129 }
130 }
131 return;
132}
133
e882dd67
JH
134sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
135 my ($table, $lo, $hi, $code) = @_;
136
137 return if $lo > $hi;
138
139 my $mid = int(($lo+$hi) / 2);
140
141 if ($table->[$mid]->[0] < $code) {
a196fbfd 142 if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
e882dd67
JH
143 return $table->[$mid]->[2];
144 } else {
145 _search($table, $mid + 1, $hi, $code);
146 }
147 } elsif ($table->[$mid]->[0] > $code) {
148 _search($table, $lo, $mid - 1, $code);
149 } else {
150 return $table->[$mid]->[2];
151 }
152}
153
354a27bf 154=head2 charblock
561c79ed
JH
155
156 use Unicode::UCD 'charblock';
157
158 my $charblock = charblock(0x41);
159
160charblock() returns the block the character belongs to, e.g.
161C<Basic Latin>. Note that not all the character positions within all
e882dd67 162blocks are defined.
561c79ed
JH
163
164The name is the same name that is used in the C<\p{In...}> construct,
165for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
e882dd67
JH
166away from the names for the C<\p{In...}>, for example C<LatinExtendedA>
167instead of C<Latin Extended-A>.
561c79ed
JH
168
169=cut
170
171my @BLOCKS;
172
561c79ed
JH
173sub charblock {
174 my $code = shift;
175
176 unless (@BLOCKS) {
2796c109 177 if (openunicode(\$BLOCKS, "Blocks.txt")) {
561c79ed 178 while (<$BLOCKS>) {
2796c109 179 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
561c79ed
JH
180 push @BLOCKS, [ hex($1), hex($2), $3 ];
181 }
182 }
183 close($BLOCKS);
184 }
185 }
186
e882dd67
JH
187 _search(\@BLOCKS, 0, $#BLOCKS, $code);
188}
189
190=head2 charscript
191
192 use Unicode::UCD 'charscript';
193
194 my $charscript = charscript(0x41);
195
196charscript() returns the script the character belongs to, e.g.
ad9cab37 197C<Latin>, C<Greek>, C<Han>.
e882dd67
JH
198
199Unfortunately, currently (Perl 5.8.0) there is no regular expression
200notation for matching scripts as there is for blocks (C<\p{In...}>.
201
202=cut
203
204my @SCRIPTS;
205
206sub charscript {
207 my $code = shift;
208
209 unless (@SCRIPTS) {
210 if (openunicode(\$SCRIPTS, "Scripts.txt")) {
211 while (<$SCRIPTS>) {
212 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
213 push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
214 }
215 }
216 close($SCRIPTS);
217 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
218 }
219 }
220
221 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
561c79ed
JH
222}
223
ad9cab37
JH
224=head2 charblock versus charscript
225
226The difference between a character block and a script is that scripts
227are closer to the linguistic notion of a set of characters required to
228present languages, while block is more of an artifact of the Unicode
229character numbering. For example the Latin B<script> is spread over
230several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>,
231C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the
232Latin script does not contain all the characters of the C<Basic Latin>
233block (also known as the ASCII): it includes only the letters, not for
234example the digits or the punctuation.
235
236For block see http://www.unicode.org/Public/UNIDATA/Blocks.txt
237
238For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
239
240Note also that the script names are all in uppercase, e.g. C<HEBREW>,
241while the block names are Capitalized and with intermixed spaces,
242e.g. C<Yi Syllables>.
243
2796c109
JH
244Greek
245Cyrillic
246Armenian
247Hebrew
248Arabic
249Syriac
250Thaana
251Devanagari
252Bengali
253Gurmukhi
254Gujarati
255Oriya
256Tamil
257Telugu
258Kannada
259Malayalam
260Sinhala
261Thai
262Lao
263Tibetan
264Myanmar
265Georgian
266Ethiopic
267Cherokee
268Ogham
269Runic
270Khmer
271Hiragana
272Katakana
273Bopomofo
274OldItalic
275Gothic
276Deseret
277
e882dd67 278=head1 IMPLEMENTATION NOTE
32c16050 279
ad9cab37
JH
280The first use of charinfo() opens a read-only filehandle to the Unicode
281Character Database (the database is included in the Perl distribution).
282The filehandle is then kept open for further queries.
32c16050 283
561c79ed
JH
284=head1 AUTHOR
285
286Jarkko Hietaniemi
287
288=cut
289
2901;