This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Paranoia regen_headers; no changes but at least
[perl5.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67
JH
1package Encode::Alias;
2use strict;
5129552c 3use Encode;
126bf8bf 4our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
5d030b67 5our $DEBUG = 0;
5d030b67 6
10c5ecbb 7use base qw(Exporter);
5d030b67
JH
8
9# Public, encouraged API is exported by default
5129552c 10
fcb875d4 11our @EXPORT =
5129552c
JH
12 qw (
13 define_alias
14 find_alias
15 );
5d030b67
JH
16
17our @Alias; # ordered matching list
18our %Alias; # cached known aliases
19
5129552c 20sub find_alias
5d030b67
JH
21{
22 my $class = shift;
23 local $_ = shift;
24 unless (exists $Alias{$_})
25 {
eaac0a15 26 $Alias{$_} = undef; # Recursion guard
5d030b67
JH
27 for (my $i=0; $i < @Alias; $i += 2)
28 {
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
31 my $new;
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
33 {
a63c962f 34 $DEBUG and warn "eval $val";
5d030b67
JH
35 $new = eval $val;
36 # $@ and warn "$val, $@";
37 }
38 elsif (ref($alias) eq 'CODE')
39 {
a63c962f 40 $DEBUG and warn "$alias", "->", "($val)";
5d030b67
JH
41 $new = $alias->($val);
42 }
43 elsif (lc($_) eq lc($alias))
44 {
45 $new = $val;
46 }
47 if (defined($new))
48 {
49 next if $new eq $_; # avoid (direct) recursion on bugs
a63c962f 50 $DEBUG and warn "$alias, $new";
5d030b67
JH
51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
52 if ($enc)
53 {
54 $Alias{$_} = $enc;
55 last;
56 }
57 }
58 }
59 }
a63c962f
JH
60 if ($DEBUG){
61 my $name;
62 if (my $e = $Alias{$_}){
63 $name = $e->name;
64 }else{
65 $name = "";
66 }
67 warn "find_alias($class, $_)->name = $name";
68 }
5d030b67
JH
69 return $Alias{$_};
70}
71
72sub define_alias
73{
74 while (@_)
75 {
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
79 if (ref($alias)){
5129552c
JH
80 my @a = keys %Alias;
81 for my $k (@a){
5d030b67
JH
82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
83 {
a63c962f 84 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67
JH
85 delete $Alias{$k};
86 }
87 elsif (ref($alias) eq 'CODE')
88 {
a63c962f 89 $DEBUG and warn "delete \$Alias\{$k\}";
5d030b67
JH
90 delete $Alias{$alias->($name)};
91 }
92 }
93 }else{
a63c962f 94 $DEBUG and warn "delete \$Alias\{$alias\}";
5d030b67
JH
95 delete $Alias{$alias};
96 }
97 }
98}
99
5d030b67
JH
100# Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
5d030b67
JH
103# Allow winlatin1 style names as well
104our %Winlatin2cp = (
105 'latin1' => 1252,
106 'latin2' => 1250,
107 'cyrillic' => 1251,
108 'greek' => 1253,
109 'turkish' => 1254,
110 'hebrew' => 1255,
111 'arabic' => 1256,
112 'baltic' => 1257,
113 'vietnamese' => 1258,
114 );
115
5129552c
JH
116init_aliases();
117
118sub undef_aliases{
119 @Alias = ();
120 %Alias = ();
121}
122
123sub init_aliases
124{
125 undef_aliases();
a999c27c 126
f2a2953c 127 # Try all-lower-case version should all else fails
a999c27c
JH
128 define_alias( qr/^(.*)$/ => '"\L$1"' );
129
f2a2953c 130 # UTF/UCS stuff
11067275
JH
131 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
132 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
126bf8bf 133 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
11067275
JH
134 qr/^iso-10646-1$/i => '"UCS-2BE"' );
135 define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
136 qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
137 qr/^UTF(16|32)$/i => '"UTF-$1"',
f2a2953c
JH
138 );
139 # ASCII
a999c27c 140 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
a999c27c 141 define_alias('C' => 'ascii');
67d7b5ef
JH
142 # Allow variants of iso-8859-1 etc.
143 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 144
67d7b5ef
JH
145 # At least HP-UX has these.
146 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 147
67d7b5ef
JH
148 # More HP stuff.
149 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
5129552c 150
67d7b5ef
JH
151 # The Official name of ASCII.
152 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 153
67d7b5ef
JH
154 # This is a font issue, not an encoding issue.
155 # (The currency symbol of the Latin 1 upper half
156 # has been redefined as the euro symbol.)
5129552c
JH
157 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
158
fcb875d4 159 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
5129552c
JH
160 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
161
67d7b5ef 162 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
fcb875d4 163 hebrew|arabic|baltic|vietnamese)$/ix =>
5129552c 164 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
5d030b67 165
67d7b5ef 166 # Common names for non-latin prefered MIME names
5129552c
JH
167 define_alias( 'ascii' => 'US-ascii',
168 'cyrillic' => 'iso-8859-5',
169 'arabic' => 'iso-8859-6',
170 'greek' => 'iso-8859-7',
171 'hebrew' => 'iso-8859-8',
172 'thai' => 'iso-8859-11',
173 'tis620' => 'iso-8859-11',
174 );
5d030b67 175
67d7b5ef
JH
176 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
177 # And Microsoft has their own naming (again, surprisingly).
fcb875d4 178 # And windows-* is registered in IANA!
67d7b5ef 179 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
5d030b67 180
67d7b5ef 181 # Sometimes seen with a leading zero.
c731e18e 182 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 183
3ef515df 184 # Mac Mappings
a999c27c
JH
185 # predefined in *.ucm; unneeded
186 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
3ef515df 187 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
a999c27c
JH
188 # Ououououou. gone. They are differente!
189 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
fcb875d4 190
f2a2953c 191 # Standardize on the dashed versions.
67d7b5ef
JH
192 # define_alias( qr/\butf8$/i => 'utf-8' );
193 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
194 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
5129552c 195
a63c962f
JH
196 unless ($Encode::ON_EBCDIC){
197 # for Encode::CN
67d7b5ef
JH
198 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
199 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
200 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
201 # CP936 doesn't have vendor-addon for GBK, so they're identical.
202 define_alias( qr/^gbk$/i => '"cp936"');
203 # This fixes gb2312 vs. euc-cn confusion, practically
204 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
205 # for Encode::JP
206 define_alias( qr/\bjis$/i => '"7bit-jis"' );
207 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
208 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
209 define_alias( qr/\bujis$/i => '"euc-jp"' );
210 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
211 define_alias( qr/\bsjis$/i => '"shiftjis"' );
a63c962f 212 # for Encode::KR
67d7b5ef
JH
213 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
214 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
215 # This fixes ksc5601 vs. euc-kr confusion, practically
216 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
217 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
218 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
a63c962f 219 # for Encode::TW
b0b300a3
JH
220 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
221 define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' );
222 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
a63c962f 223 }
f2a2953c
JH
224 # utf8 is blessed :)
225 define_alias( qr/^UTF-8$/i => '"utf8"',);
67d7b5ef 226 # At last, Map white space and _ to '-'
5129552c
JH
227 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
228}
229
2301;
231__END__
5d030b67
JH
232
233# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
234# TODO: HP-UX '15' encodings japanese15 korean15 roi15
235# TODO: Cyrillic encoding ISO-IR-111 (useful?)
236# TODO: Armenian encoding ARMSCII-8
237# TODO: Hebrew encoding ISO-8859-8-1
238# TODO: Thai encoding TCVN
5d030b67
JH
239# TODO: Vietnamese encodings VPS
240# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
241# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
242# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
243# Kannada Khmer Korean Laotian Malayalam Mongolian
244# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
245
5d030b67
JH
246=head1 NAME
247
ce912cd4 248Encode::Alias - alias definitions to encodings
5d030b67
JH
249
250=head1 SYNOPSIS
251
5129552c
JH
252 use Encode;
253 use Encode::Alias;
5d030b67
JH
254 define_alias( newName => ENCODING);
255
256=head1 DESCRIPTION
257
3ef515df 258Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 259either the name of an encoding or an encoding object (as described
3ef515df 260in L<Encode>).
5d030b67
JH
261
262Currently I<newName> can be specified in the following ways:
263
264=over 4
265
266=item As a simple string.
267
268=item As a qr// compiled regular expression, e.g.:
269
270 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
271
0ab8f81e
JH
272In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
273in order to allow C<$1> etc. to be substituted. The example is one
274way to alias names as used in X11 fonts to the MIME names for the
275iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 276
3ef515df
JH
277If you are using a regex here, you have to use the quotes as shown or
278it won't work. Also note that regex handling is tricky even for the
279experienced. Use it with caution.
5d030b67
JH
280
281=item As a code reference, e.g.:
282
283 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
284
0ab8f81e 285In this case, C<$_> will be set to the name that is being looked up and
5d030b67 286I<ENCODING> is passed to the sub as its first argument. The example
3ef515df
JH
287is another way to alias names as used in X11 fonts to the MIME names
288for the iso-8859-* family.
5d030b67 289
5129552c
JH
290=back
291
0ab8f81e 292=head2 Alias overloading
5d030b67 293
3ef515df 294You can override predefined aliases by simply applying define_alias().
0ab8f81e
JH
295The new alias is always evaluated first, and when neccessary,
296define_alias() flushes the internal cache to make the new definition
297available.
5d030b67 298
0ab8f81e 299 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67
JH
300 # superset of SHIFT_JIS
301
5129552c
JH
302 define_alias( qr/shift.*jis$/i => '"cp932"' );
303 define_alias( qr/sjis$/i => '"cp932"' );
304
0ab8f81e 305If you want to zap all predefined aliases, you can use
5129552c
JH
306
307 Encode::Alias->undef_aliases;
308
309to do so. And
310
311 Encode::Alias->init_aliases;
312
0ab8f81e 313gets the factory settings back.
5d030b67
JH
314
315=head1 SEE ALSO
316
317L<Encode>, L<Encode::Supported>
318
5129552c 319=cut
5d030b67 320