This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
"646"
[perl5.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67
JH
1package Encode::Alias;
2use strict;
656ebd29 3use warnings;
1485817e 4no warnings 'redefine';
ba40575b 5our $VERSION = '2.8_01';# do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67
JH
9
10# Public, encouraged API is exported by default
5129552c 11
d1256cb1
RGS
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16);
5d030b67 17
d1256cb1
RGS
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
5d030b67 20
d1256cb1 21sub find_alias {
c0abe5aa 22 require Encode;
5d030b67 23 my $class = shift;
d1256cb1
RGS
24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
5d030b67 66 }
d1256cb1
RGS
67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
a63c962f 76 }
151b5d36 77 return $Alias{$find};
5d030b67
JH
78}
79
d1256cb1
RGS
80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
5d030b67
JH
103 }
104}
105
5d030b67 106# Allow latin-1 style names as well
8f1ed24a 107# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 109
5d030b67 110# Allow winlatin1 style names as well
d1256cb1
RGS
111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
5d030b67 122
5129552c
JH
123init_aliases();
124
d1256cb1 125sub undef_aliases {
5129552c
JH
126 @Alias = ();
127 %Alias = ();
128}
129
d1256cb1 130sub init_aliases {
c0abe5aa 131 require Encode;
5129552c 132 undef_aliases();
d1256cb1 133
f2a2953c 134 # Try all-lower-case version should all else fails
a999c27c
JH
135 define_alias( qr/^(.*)$/ => '"\L$1"' );
136
f2a2953c 137 # UTF/UCS stuff
d1256cb1
RGS
138 define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
139 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
140 define_alias(
141 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
142 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
143 qr/^iso-10646-1$/i => '"UCS-2BE"'
144 );
145 define_alias(
146 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
147 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
148 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
149 );
150
f2a2953c 151 # ASCII
d1256cb1
RGS
152 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
153 define_alias( 'C' => 'ascii' );
ba40575b 154 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
d1256cb1 155
67d7b5ef
JH
156 # Allow variants of iso-8859-1 etc.
157 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 158
67d7b5ef
JH
159 # At least HP-UX has these.
160 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 161
67d7b5ef 162 # More HP stuff.
d1256cb1
RGS
163 define_alias(
164 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
165 '"${1}8"' );
5129552c 166
67d7b5ef
JH
167 # The Official name of ASCII.
168 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 169
67d7b5ef
JH
170 # This is a font issue, not an encoding issue.
171 # (The currency symbol of the Latin 1 upper half
172 # has been redefined as the euro symbol.)
5129552c
JH
173 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
174
d1256cb1
RGS
175 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
176'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
177 );
5129552c 178
d1256cb1
RGS
179 define_alias(
180 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
181 hebrew|arabic|baltic|vietnamese)$/ix =>
182 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
183 );
5d030b67 184
3c4b39be 185 # Common names for non-latin preferred MIME names
d1256cb1
RGS
186 define_alias(
187 'ascii' => 'US-ascii',
188 'cyrillic' => 'iso-8859-5',
189 'arabic' => 'iso-8859-6',
190 'greek' => 'iso-8859-7',
191 'hebrew' => 'iso-8859-8',
192 'thai' => 'iso-8859-11',
d1256cb1 193 );
51e4e64d
NC
194 # RT #20781
195 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
5d030b67 196
67d7b5ef
JH
197 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
198 # And Microsoft has their own naming (again, surprisingly).
d1256cb1
RGS
199 # And windows-* is registered in IANA!
200 define_alias(
201 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 202
67d7b5ef 203 # Sometimes seen with a leading zero.
c731e18e 204 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 205
3ef515df 206 # Mac Mappings
a999c27c
JH
207 # predefined in *.ucm; unneeded
208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1
RGS
209 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
210
a999c27c
JH
211 # Ououououou. gone. They are differente!
212 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 213
f2a2953c 214 # Standardize on the dashed versions.
cf9f87ce 215 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 216
d1256cb1
RGS
217 unless ($Encode::ON_EBCDIC) {
218
a63c962f 219 # for Encode::CN
d1256cb1
RGS
220 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
221 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
222
223 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
224 # CP936 doesn't have vendor-addon for GBK, so they're identical.
225 define_alias( qr/^gbk$/i => '"cp936"' );
226
227 # This fixes gb2312 vs. euc-cn confusion, practically
228 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
229
230 # for Encode::JP
231 define_alias( qr/\bjis$/i => '"7bit-jis"' );
232 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
233 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
234 define_alias( qr/\bujis$/i => '"euc-jp"' );
235 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
236 define_alias( qr/\bsjis$/i => '"shiftjis"' );
237 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
238
a63c962f 239 # for Encode::KR
d1256cb1
RGS
240 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
241 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
242
243 # This fixes ksc5601 vs. euc-kr confusion, practically
244 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
245 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
246 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
247
a63c962f 248 # for Encode::TW
d1256cb1
RGS
249 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
250 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
251 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
252 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
253 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 254 }
d1256cb1 255
f2a2953c 256 # utf8 is blessed :)
d1256cb1
RGS
257 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
258
67d7b5ef 259 # At last, Map white space and _ to '-'
5129552c
JH
260 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
261}
262
2631;
264__END__
5d030b67
JH
265
266# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
267# TODO: HP-UX '15' encodings japanese15 korean15 roi15
268# TODO: Cyrillic encoding ISO-IR-111 (useful?)
269# TODO: Armenian encoding ARMSCII-8
270# TODO: Hebrew encoding ISO-8859-8-1
271# TODO: Thai encoding TCVN
5d030b67
JH
272# TODO: Vietnamese encodings VPS
273# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
274# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
275# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
276# Kannada Khmer Korean Laotian Malayalam Mongolian
277# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
278
5d030b67
JH
279=head1 NAME
280
ce912cd4 281Encode::Alias - alias definitions to encodings
5d030b67
JH
282
283=head1 SYNOPSIS
284
5129552c
JH
285 use Encode;
286 use Encode::Alias;
5d030b67
JH
287 define_alias( newName => ENCODING);
288
289=head1 DESCRIPTION
290
3ef515df 291Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 292either the name of an encoding or an encoding object (as described
3ef515df 293in L<Encode>).
5d030b67
JH
294
295Currently I<newName> can be specified in the following ways:
296
297=over 4
298
299=item As a simple string.
300
301=item As a qr// compiled regular expression, e.g.:
302
303 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
304
0ab8f81e
JH
305In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
306in order to allow C<$1> etc. to be substituted. The example is one
307way to alias names as used in X11 fonts to the MIME names for the
308iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 309
151b5d36
JH
310(or, you don't have to do this yourself because this example is predefined)
311
3ef515df
JH
312If you are using a regex here, you have to use the quotes as shown or
313it won't work. Also note that regex handling is tricky even for the
151b5d36 314experienced. Use this feature with caution.
5d030b67
JH
315
316=item As a code reference, e.g.:
317
151b5d36 318 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 319
151b5d36
JH
320The same effect as the example above in a different way. The coderef
321takes the alias name as an argument and returns a canonical name on
322success or undef if not. Note the second argument is not required.
323Use this with even more caution than the regex version.
5d030b67 324
5129552c
JH
325=back
326
151b5d36
JH
327=head3 Changes in code reference aliasing
328
329As of Encode 1.87, the older form
330
331 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
332
333no longer works.
334
335Encode up to 1.86 internally used "local $_" to implement ths older
336form. But consider the code below;
337
338 use Encode;
339 $_ = "eeeee" ;
340 while (/(e)/g) {
341 my $utf = decode('aliased-encoding-name', $1);
342 print "position:",pos,"\n";
343 }
344
345Prior to Encode 1.86 this fails because of "local $_".
346
0ab8f81e 347=head2 Alias overloading
5d030b67 348
3ef515df 349You can override predefined aliases by simply applying define_alias().
3c4b39be 350The new alias is always evaluated first, and when necessary,
0ab8f81e
JH
351define_alias() flushes the internal cache to make the new definition
352available.
5d030b67 353
0ab8f81e 354 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67
JH
355 # superset of SHIFT_JIS
356
5129552c
JH
357 define_alias( qr/shift.*jis$/i => '"cp932"' );
358 define_alias( qr/sjis$/i => '"cp932"' );
359
0ab8f81e 360If you want to zap all predefined aliases, you can use
5129552c
JH
361
362 Encode::Alias->undef_aliases;
363
364to do so. And
365
366 Encode::Alias->init_aliases;
367
0ab8f81e 368gets the factory settings back.
5d030b67
JH
369
370=head1 SEE ALSO
371
372L<Encode>, L<Encode::Supported>
373
5129552c 374=cut
5d030b67 375