This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Locale::Codes 2.00.
[perl5.git] / lib / Locale / Script.pm
CommitLineData
6b6e008c
JH
1#-----------------------------------------------------------------------
2
3=head1 NAME
4
5Locale::Script - ISO codes for script identification (ISO 15924)
6
7=head1 SYNOPSIS
8
9 use Locale::Script;
10 use Locale::Constants;
11
12 $script = code2script('ph'); # 'Phoenician'
13 $code = script2code('Tibetan'); # 'bo'
14 $code3 = script2code('Tibetan',
15 LOCALE_CODE_ALPHA_3); # 'bod'
16 $codeN = script2code('Tibetan',
17 LOCALE_CODE_ALPHA_NUMERIC); # 330
18
19 @codes = all_script_codes();
20 @scripts = all_script_names();
21
22=cut
23
24#-----------------------------------------------------------------------
25
26package Locale::Script;
27use strict;
28require 5.002;
29
30#-----------------------------------------------------------------------
31
32=head1 DESCRIPTION
33
34The C<Locale::Script> module provides access to the ISO
35codes for identifying scripts, as defined in ISO 15924.
36For example, Egyptian hieroglyphs are denoted by the two-letter
37code 'eg', the three-letter code 'egy', and the numeric code 050.
38
39You can either access the codes via the conversion routines
40(described below), or with the two functions which return lists
41of all script codes or all script names.
42
43There are three different code sets you can use for identifying
44scripts:
45
46=over 4
47
48=item B<alpha-2>
49
50Two letter codes, such as 'bo' for Tibetan.
51This code set is identified with the symbol C<LOCALE_CODE_ALPHA_2>.
52
53=item B<alpha-3>
54
55Three letter codes, such as 'ell' for Greek.
56This code set is identified with the symbol C<LOCALE_CODE_ALPHA_3>.
57
58=item B<numeric>
59
60Numeric codes, such as 410 for Hiragana.
61This code set is identified with the symbol C<LOCALE_CODE_NUMERIC>.
62
63=back
64
65All of the routines take an optional additional argument
66which specifies the code set to use.
67If not specified, it defaults to the two-letter codes.
68This is partly for backwards compatibility (previous versions
69of Locale modules only supported the alpha-2 codes), and
70partly because they are the most widely used codes.
71
72The alpha-2 and alpha-3 codes are not case-dependent,
73so you can use 'BO', 'Bo', 'bO' or 'bo' for Tibetan.
74When a code is returned by one of the functions in
75this module, it will always be lower-case.
76
77=head2 SPECIAL CODES
78
79The standard defines various special codes.
80
81=over 4
82
83=item *
84
85The standard reserves codes in the ranges B<qa> - B<qt>,
86B<qaa> - B<qat>, and B<900> - B<919>, for private use.
87
88=item *
89
90B<zx>, B<zxx>, and B<997>, are the codes for unwritten languages.
91
92=item *
93
94B<zy>, B<zyy>, and B<998>, are the codes for an undetermined script.
95
96=item *
97
98B<zz>, B<zzz>, and B<999>, are the codes for an uncoded script.
99
100=back
101
102The private codes are not recognised by Locale::Script,
103but the others are.
104
105=cut
106
107#-----------------------------------------------------------------------
108
109require Exporter;
110use Carp;
111use Locale::Constants;
112
113
114#-----------------------------------------------------------------------
115# Public Global Variables
116#-----------------------------------------------------------------------
117use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
118$VERSION = sprintf("%d.%02d", q$Revision: 2.0 $ =~ /(\d+)\.(\d+)/);
119@ISA = qw(Exporter);
120@EXPORT = qw(code2script script2code
121 all_script_codes all_script_names
122 script_code2code
123 LOCALE_CODE_ALPHA_2 LOCALE_CODE_ALPHA_3 LOCALE_CODE_NUMERIC);
124
125#-----------------------------------------------------------------------
126# Private Global Variables
127#-----------------------------------------------------------------------
128my $CODES = [];
129my $COUNTRIES = [];
130
131
132#=======================================================================
133
134=head1 CONVERSION ROUTINES
135
136There are three conversion routines: C<code2script()>, C<script2code()>,
137and C<script_code2code()>.
138
139=over 8
140
141=item code2script( CODE, [ CODESET ] )
142
143This function takes a script code and returns a string
144which contains the name of the script identified.
145If the code is not a valid script code, as defined by ISO 15924,
146then C<undef> will be returned:
147
148 $script = code2script('cy'); # Cyrillic
149
150=item script2code( STRING, [ CODESET ] )
151
152This function takes a script name and returns the corresponding
153script code, if such exists.
154If the argument could not be identified as a script name,
155then C<undef> will be returned:
156
157 $code = script2code('Gothic', LOCALE_CODE_ALPHA_3);
158 # $code will now be 'gth'
159
160The case of the script name is not important.
161See the section L<KNOWN BUGS AND LIMITATIONS> below.
162
163=item script_code2code( CODE, CODESET, CODESET )
164
165This function takes a script code from one code set,
166and returns the corresponding code from another code set.
167
168 $alpha2 = script_code2code('jwi',
169 LOCALE_CODE_ALPHA_3 => LOCALE_CODE_ALPHA_2);
170 # $alpha2 will now be 'jw' (Javanese)
171
172If the code passed is not a valid script code in
173the first code set, or if there isn't a code for the
174corresponding script in the second code set,
175then C<undef> will be returned.
176
177=back
178
179=cut
180
181#=======================================================================
182sub code2script
183{
184 my $code = shift;
185 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
186
187
188 return undef unless defined $code;
189
190 #-------------------------------------------------------------------
191 # Make sure the code is in the right form before we use it
192 # to look up the corresponding script.
193 # We have to sprintf because the codes are given as 3-digits,
194 # with leading 0's. Eg 070 for Egyptian demotic.
195 #-------------------------------------------------------------------
196 if ($codeset == LOCALE_CODE_NUMERIC)
197 {
198 return undef if ($code =~ /\D/);
199 $code = sprintf("%.3d", $code);
200 }
201 else
202 {
203 $code = lc($code);
204 }
205
206 if (exists $CODES->[$codeset]->{$code})
207 {
208 return $CODES->[$codeset]->{$code};
209 }
210 else
211 {
212 #---------------------------------------------------------------
213 # no such script code!
214 #---------------------------------------------------------------
215 return undef;
216 }
217}
218
219sub script2code
220{
221 my $script = shift;
222 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
223
224
225 return undef unless defined $script;
226 $script = lc($script);
227 if (exists $COUNTRIES->[$codeset]->{$script})
228 {
229 return $COUNTRIES->[$codeset]->{$script};
230 }
231 else
232 {
233 #---------------------------------------------------------------
234 # no such script!
235 #---------------------------------------------------------------
236 return undef;
237 }
238}
239
240sub script_code2code
241{
242 (@_ == 3) or croak "script_code2code() takes 3 arguments!";
243
244 my $code = shift;
245 my $inset = shift;
246 my $outset = shift;
247 my $outcode = shift;
248 my $script;
249
250
251 return undef if $inset == $outset;
252 $script = code2script($code, $inset);
253 return undef if not defined $script;
254 $outcode = script2code($script, $outset);
255 return $outcode;
256}
257
258#=======================================================================
259
260=head1 QUERY ROUTINES
261
262There are two function which can be used to obtain a list of all codes,
263or all script names:
264
265=over 8
266
267=item C<all_script_codes ( [ CODESET ] )>
268
269Returns a list of all two-letter script codes.
270The codes are guaranteed to be all lower-case,
271and not in any particular order.
272
273=item C<all_script_names ( [ CODESET ] )>
274
275Returns a list of all script names for which there is a corresponding
276script code in the specified code set.
277The names are capitalised, and not returned in any particular order.
278
279=back
280
281=cut
282
283#=======================================================================
284sub all_script_codes
285{
286 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
287
288 return keys %{ $CODES->[$codeset] };
289}
290
291sub all_script_names
292{
293 my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
294
295 return values %{ $CODES->[$codeset] };
296}
297
298
299#-----------------------------------------------------------------------
300
301=head1 EXAMPLES
302
303The following example illustrates use of the C<code2script()> function.
304The user is prompted for a script code, and then told the corresponding
305script name:
306
307 $| = 1; # turn off buffering
308
309 print "Enter script code: ";
310 chop($code = <STDIN>);
311 $script = code2script($code, LOCALE_CODE_ALPHA_2);
312 if (defined $script)
313 {
314 print "$code = $script\n";
315 }
316 else
317 {
318 print "'$code' is not a valid script code!\n";
319 }
320
321
322=head1 KNOWN BUGS AND LIMITATIONS
323
324=over 4
325
326=item *
327
328When using C<script2code()>, the script name must currently appear
329exactly as it does in the source of the module. For example,
330
331 script2code('Egyptian hieroglyphs')
332
333will return B<eg>, as expected. But the following will all return C<undef>:
334
335 script2code('hieroglyphs')
336 script2code('Egyptian Hieroglypics')
337
338If there's need for it, a future version could have variants
339for script names.
340
341=item *
342
343In the current implementation, all data is read in when the
344module is loaded, and then held in memory.
345A lazy implementation would be more memory friendly.
346
347=back
348
349=head1 SEE ALSO
350
351=over 4
352
353=item Locale::Language
354
355ISO two letter codes for identification of language (ISO 639).
356
357=item Locale::Currency
358
359ISO three letter codes for identification of currencies
360and funds (ISO 4217).
361
362=item Locale::Country
363
364ISO three letter codes for identification of countries (ISO 3166)
365
366=item ISO 15924
367
368The ISO standard which defines these codes.
369
370=item http://www.evertype.com/standards/iso15924/
371
372Home page for ISO 15924.
373
374
375=back
376
377
378=head1 AUTHOR
379
380Neil Bowers E<lt>neil@bowers.comE<gt>
381
382=head1 COPYRIGHT
383
384Copyright (c) 2002 Neil Bowers.
385
386This module is free software; you can redistribute it and/or
387modify it under the same terms as Perl itself.
388
389=cut
390
391#-----------------------------------------------------------------------
392
393#=======================================================================
394# initialisation code - stuff the DATA into the ALPHA2 hash
395#=======================================================================
396{
397 my ($alpha2, $alpha3, $numeric);
398 my $script;
399
400
401 while (<DATA>)
402 {
403 next unless /\S/;
404 chop;
405 ($alpha2, $alpha3, $numeric, $script) = split(/:/, $_, 4);
406
407 $CODES->[LOCALE_CODE_ALPHA_2]->{$alpha2} = $script;
408 $COUNTRIES->[LOCALE_CODE_ALPHA_2]->{"\L$script"} = $alpha2;
409
410 if ($alpha3)
411 {
412 $CODES->[LOCALE_CODE_ALPHA_3]->{$alpha3} = $script;
413 $COUNTRIES->[LOCALE_CODE_ALPHA_3]->{"\L$script"} = $alpha3;
414 }
415
416 if ($numeric)
417 {
418 $CODES->[LOCALE_CODE_NUMERIC]->{$numeric} = $script;
419 $COUNTRIES->[LOCALE_CODE_NUMERIC]->{"\L$script"} = $numeric;
420 }
421
422 }
423}
424
4251;
426
427__DATA__
428am:ama:130:Aramaic
429ar:ara:160:Arabic
430av:ave:151:Avestan
431bh:bhm:300:Brahmi (Ashoka)
432bi:bid:372:Buhid
433bn:ben:325:Bengali
434bo:bod:330:Tibetan
435bp:bpm:285:Bopomofo
436br:brl:570:Braille
437bt:btk:365:Batak
438bu:bug:367:Buginese (Makassar)
439by:bys:550:Blissymbols
440ca:cam:358:Cham
441ch:chu:221:Old Church Slavonic
442ci:cir:291:Cirth
443cm:cmn:402:Cypro-Minoan
444co:cop:205:Coptic
445cp:cpr:403:Cypriote syllabary
446cy:cyr:220:Cyrillic
447ds:dsr:250:Deserel (Mormon)
448dv:dvn:315:Devanagari (Nagari)
449ed:egd:070:Egyptian demotic
450eg:egy:050:Egyptian hieroglyphs
451eh:egh:060:Egyptian hieratic
452el:ell:200:Greek
453eo:eos:210:Etruscan and Oscan
454et:eth:430:Ethiopic
455gl:glg:225:Glagolitic
456gm:gmu:310:Gurmukhi
457gt:gth:206:Gothic
458gu:guj:320:Gujarati
459ha:han:500:Han ideographs
460he:heb:125:Hebrew
461hg:hgl:420:Hangul
462hm:hmo:450:Pahawh Hmong
463ho:hoo:371:Hanunoo
464hr:hrg:410:Hiragana
465hu:hun:176:Old Hungarian runic
466hv:hvn:175:Kok Turki runic
467hy:hye:230:Armenian
468iv:ivl:610:Indus Valley
469ja:jap:930:(alias for Han + Hiragana + Katakana)
470jl:jlg:445:Cherokee syllabary
471jw:jwi:360:Javanese
472ka:kam:241:Georgian (Mxedruli)
473kh:khn:931:(alias for Hangul + Han)
474kk:kkn:411:Katakana
475km:khm:354:Khmer
476kn:kan:345:Kannada
477kr:krn:357:Karenni (Kayah Li)
478ks:kst:305:Kharoshthi
479kx:kax:240:Georgian (Xucuri)
480la:lat:217:Latin
481lf:laf:215:Latin (Fraktur variant)
482lg:lag:216:Latin (Gaelic variant)
483lo:lao:356:Lao
484lp:lpc:335:Lepcha (Rong)
485md:mda:140:Mandaean
486me:mer:100:Meroitic
487mh:may:090:Mayan hieroglyphs
488ml:mlm:347:Malayalam
489mn:mon:145:Mongolian
490my:mya:350:Burmese
491na:naa:400:Linear A
492nb:nbb:401:Linear B
493og:ogm:212:Ogham
494or:ory:327:Oriya
495os:osm:260:Osmanya
496ph:phx:115:Phoenician
497ph:pah:150:Pahlavi
498pl:pld:282:Pollard Phonetic
499pq:pqd:295:Klingon plQaD
500pr:prm:227:Old Permic
501ps:pst:600:Phaistos Disk
502rn:rnr:211:Runic (Germanic)
503rr:rro:620:Rongo-rongo
504sa:sar:110:South Arabian
505si:sin:348:Sinhala
506sj:syj:137:Syriac (Jacobite variant)
507sl:slb:440:Unified Canadian Aboriginal Syllabics
508sn:syn:136:Syriac (Nestorian variant)
509sw:sww:281:Shavian (Shaw)
510sy:syr:135:Syriac (Estrangelo)
511ta:tam:346:Tamil
512tb:tbw:373:Tagbanwa
513te:tel:340:Telugu
514tf:tfn:120:Tifnagh
515tg:tag:370:Tagalog
516th:tha:352:Thai
517tn:tna:170:Thaana
518tw:twr:290:Tengwar
519va:vai:470:Vai
520vs:vsp:280:Visible Speech
521xa:xas:000:Cuneiform, Sumero-Akkadian
522xf:xfa:105:Cuneiform, Old Persian
523xk:xkn:412:(alias for Hiragana + Katakana)
524xu:xug:106:Cuneiform, Ugaritic
525yi:yii:460:Yi
526zx:zxx:997:Unwritten language
527zy:zyy:998:Undetermined script
528zz:zzz:999:Uncoded script