This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Encode-2.27
[perl5.git] / ext / Encode / lib / Encode / Alias.pm
CommitLineData
5d030b67
JH
1package Encode::Alias;
2use strict;
656ebd29 3use warnings;
1485817e 4no warnings 'redefine';
64bc6d54 5our $VERSION = do { my @r = ( q$Revision: 2.11 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
8f139f4c 6sub DEBUG () { 0 }
5d030b67 7
10c5ecbb 8use base qw(Exporter);
5d030b67
JH
9
10# Public, encouraged API is exported by default
5129552c 11
d1256cb1
RGS
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16);
5d030b67 17
d1256cb1
RGS
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
5d030b67 20
d1256cb1 21sub find_alias {
c0abe5aa 22 require Encode;
5d030b67 23 my $class = shift;
d1256cb1
RGS
24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
5d030b67 66 }
d1256cb1
RGS
67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
a63c962f 76 }
151b5d36 77 return $Alias{$find};
5d030b67
JH
78}
79
d1256cb1
RGS
80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
5d030b67
JH
103 }
104}
105
5d030b67 106# Allow latin-1 style names as well
8f1ed24a 107# 0 1 2 3 4 5 6 7 8 9 10
5d030b67 108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
d1256cb1 109
5d030b67 110# Allow winlatin1 style names as well
d1256cb1
RGS
111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
5d030b67 122
5129552c
JH
123init_aliases();
124
d1256cb1 125sub undef_aliases {
5129552c
JH
126 @Alias = ();
127 %Alias = ();
128}
129
d1256cb1 130sub init_aliases {
c0abe5aa 131 require Encode;
5129552c 132 undef_aliases();
d1256cb1 133
f2a2953c 134 # Try all-lower-case version should all else fails
a999c27c
JH
135 define_alias( qr/^(.*)$/ => '"\L$1"' );
136
f2a2953c 137 # UTF/UCS stuff
64bc6d54 138 define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
d1256cb1
RGS
139 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
140 define_alias(
141 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
142 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
143 qr/^iso-10646-1$/i => '"UCS-2BE"'
144 );
145 define_alias(
146 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
147 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
148 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
149 );
150
f2a2953c 151 # ASCII
d1256cb1
RGS
152 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
153 define_alias( 'C' => 'ascii' );
ba40575b 154 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
d1256cb1 155
67d7b5ef
JH
156 # Allow variants of iso-8859-1 etc.
157 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
5129552c 158
67d7b5ef
JH
159 # At least HP-UX has these.
160 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
5129552c 161
67d7b5ef 162 # More HP stuff.
d1256cb1
RGS
163 define_alias(
164 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
165 '"${1}8"' );
5129552c 166
67d7b5ef
JH
167 # The Official name of ASCII.
168 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
5129552c 169
67d7b5ef
JH
170 # This is a font issue, not an encoding issue.
171 # (The currency symbol of the Latin 1 upper half
172 # has been redefined as the euro symbol.)
5129552c
JH
173 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
174
d1256cb1
RGS
175 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
176'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
177 );
5129552c 178
d1256cb1
RGS
179 define_alias(
180 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
181 hebrew|arabic|baltic|vietnamese)$/ix =>
182 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
183 );
5d030b67 184
3c4b39be 185 # Common names for non-latin preferred MIME names
d1256cb1
RGS
186 define_alias(
187 'ascii' => 'US-ascii',
188 'cyrillic' => 'iso-8859-5',
189 'arabic' => 'iso-8859-6',
190 'greek' => 'iso-8859-7',
191 'hebrew' => 'iso-8859-8',
192 'thai' => 'iso-8859-11',
d1256cb1 193 );
51e4e64d
NC
194 # RT #20781
195 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
5d030b67 196
67d7b5ef
JH
197 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
198 # And Microsoft has their own naming (again, surprisingly).
d1256cb1
RGS
199 # And windows-* is registered in IANA!
200 define_alias(
201 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
5d030b67 202
67d7b5ef 203 # Sometimes seen with a leading zero.
c731e18e 204 # define_alias( qr/\bcp037\b/i => '"cp37"');
5d030b67 205
3ef515df 206 # Mac Mappings
a999c27c
JH
207 # predefined in *.ucm; unneeded
208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
d1256cb1 209 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
2fd0906e
SH
210 # http://rt.cpan.org/Ticket/Display.html?id=36326
211 define_alias( qr/^macintosh$/i => '"MacRoman"' );
d1256cb1 212
a999c27c
JH
213 # Ououououou. gone. They are differente!
214 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
d1256cb1 215
f2a2953c 216 # Standardize on the dashed versions.
cf9f87ce 217 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
5129552c 218
d1256cb1
RGS
219 unless ($Encode::ON_EBCDIC) {
220
a63c962f 221 # for Encode::CN
d1256cb1
RGS
222 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
223 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
224
225 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
226 # CP936 doesn't have vendor-addon for GBK, so they're identical.
227 define_alias( qr/^gbk$/i => '"cp936"' );
228
229 # This fixes gb2312 vs. euc-cn confusion, practically
230 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
231
232 # for Encode::JP
233 define_alias( qr/\bjis$/i => '"7bit-jis"' );
234 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
235 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
236 define_alias( qr/\bujis$/i => '"euc-jp"' );
237 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
238 define_alias( qr/\bsjis$/i => '"shiftjis"' );
239 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
240
a63c962f 241 # for Encode::KR
d1256cb1
RGS
242 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
243 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
244
245 # This fixes ksc5601 vs. euc-kr confusion, practically
246 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
247 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
248 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
249
a63c962f 250 # for Encode::TW
d1256cb1
RGS
251 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
252 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
253 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
254 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
255 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
a63c962f 256 }
d1256cb1 257
f2a2953c 258 # utf8 is blessed :)
d1256cb1
RGS
259 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
260
67d7b5ef 261 # At last, Map white space and _ to '-'
5129552c
JH
262 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
263}
264
2651;
266__END__
5d030b67
JH
267
268# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
269# TODO: HP-UX '15' encodings japanese15 korean15 roi15
270# TODO: Cyrillic encoding ISO-IR-111 (useful?)
271# TODO: Armenian encoding ARMSCII-8
272# TODO: Hebrew encoding ISO-8859-8-1
273# TODO: Thai encoding TCVN
5d030b67
JH
274# TODO: Vietnamese encodings VPS
275# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
276# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
277# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
278# Kannada Khmer Korean Laotian Malayalam Mongolian
279# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
280
5d030b67
JH
281=head1 NAME
282
ce912cd4 283Encode::Alias - alias definitions to encodings
5d030b67
JH
284
285=head1 SYNOPSIS
286
5129552c
JH
287 use Encode;
288 use Encode::Alias;
5d030b67
JH
289 define_alias( newName => ENCODING);
290
291=head1 DESCRIPTION
292
3ef515df 293Allows newName to be used as an alias for ENCODING. ENCODING may be
fcb875d4 294either the name of an encoding or an encoding object (as described
3ef515df 295in L<Encode>).
5d030b67
JH
296
297Currently I<newName> can be specified in the following ways:
298
299=over 4
300
301=item As a simple string.
302
303=item As a qr// compiled regular expression, e.g.:
304
305 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
306
0ab8f81e
JH
307In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
308in order to allow C<$1> etc. to be substituted. The example is one
309way to alias names as used in X11 fonts to the MIME names for the
310iso-8859-* family. Note the double quotes inside the single quotes.
5d030b67 311
151b5d36
JH
312(or, you don't have to do this yourself because this example is predefined)
313
3ef515df
JH
314If you are using a regex here, you have to use the quotes as shown or
315it won't work. Also note that regex handling is tricky even for the
151b5d36 316experienced. Use this feature with caution.
5d030b67
JH
317
318=item As a code reference, e.g.:
319
151b5d36 320 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
5d030b67 321
151b5d36
JH
322The same effect as the example above in a different way. The coderef
323takes the alias name as an argument and returns a canonical name on
324success or undef if not. Note the second argument is not required.
325Use this with even more caution than the regex version.
5d030b67 326
5129552c
JH
327=back
328
151b5d36
JH
329=head3 Changes in code reference aliasing
330
331As of Encode 1.87, the older form
332
333 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
334
335no longer works.
336
337Encode up to 1.86 internally used "local $_" to implement ths older
338form. But consider the code below;
339
340 use Encode;
341 $_ = "eeeee" ;
342 while (/(e)/g) {
343 my $utf = decode('aliased-encoding-name', $1);
344 print "position:",pos,"\n";
345 }
346
347Prior to Encode 1.86 this fails because of "local $_".
348
0ab8f81e 349=head2 Alias overloading
5d030b67 350
3ef515df 351You can override predefined aliases by simply applying define_alias().
3c4b39be 352The new alias is always evaluated first, and when necessary,
0ab8f81e
JH
353define_alias() flushes the internal cache to make the new definition
354available.
5d030b67 355
0ab8f81e 356 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
5d030b67
JH
357 # superset of SHIFT_JIS
358
5129552c
JH
359 define_alias( qr/shift.*jis$/i => '"cp932"' );
360 define_alias( qr/sjis$/i => '"cp932"' );
361
0ab8f81e 362If you want to zap all predefined aliases, you can use
5129552c
JH
363
364 Encode::Alias->undef_aliases;
365
366to do so. And
367
368 Encode::Alias->init_aliases;
369
0ab8f81e 370gets the factory settings back.
5d030b67
JH
371
372=head1 SEE ALSO
373
374L<Encode>, L<Encode::Supported>
375
5129552c 376=cut
5d030b67 377