This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Encode-2.27
[perl5.git] / ext / Encode / lib / Encode / Alias.pm
... / ...
CommitLineData
1package Encode::Alias;
2use strict;
3use warnings;
4no warnings 'redefine';
5our $VERSION = do { my @r = ( q$Revision: 2.11 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
6sub DEBUG () { 0 }
7
8use base qw(Exporter);
9
10# Public, encouraged API is exported by default
11
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16);
17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
21sub find_alias {
22 require Encode;
23 my $class = shift;
24 my $find = shift;
25 unless ( exists $Alias{$find} ) {
26 $Alias{$find} = undef; # Recursion guard
27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
28 my $alias = $Alias[$i];
29 my $val = $Alias[ $i + 1 ];
30 my $new;
31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
32 DEBUG and warn "eval $val";
33 $new = eval $val;
34 DEBUG and $@ and warn "$val, $@";
35 }
36 elsif ( ref($alias) eq 'CODE' ) {
37 DEBUG and warn "$alias", "->", "($find)";
38 $new = $alias->($find);
39 }
40 elsif ( lc($find) eq lc($alias) ) {
41 $new = $val;
42 }
43 if ( defined($new) ) {
44 next if $new eq $find; # avoid (direct) recursion on bugs
45 DEBUG and warn "$alias, $new";
46 my $enc =
47 ( ref($new) ) ? $new : Encode::find_encoding($new);
48 if ($enc) {
49 $Alias{$find} = $enc;
50 last;
51 }
52 }
53 }
54
55 # case insensitive search when canonical is not in all lowercase
56 # RT ticket #7835
57 unless ( $Alias{$find} ) {
58 my $lcfind = lc($find);
59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
60 {
61 $lcfind eq lc($name) or next;
62 $Alias{$find} = Encode::find_encoding($name);
63 DEBUG and warn "$find => $name";
64 }
65 }
66 }
67 if (DEBUG) {
68 my $name;
69 if ( my $e = $Alias{$find} ) {
70 $name = $e->name;
71 }
72 else {
73 $name = "";
74 }
75 warn "find_alias($class, $find)->name = $name";
76 }
77 return $Alias{$find};
78}
79
80sub define_alias {
81 while (@_) {
82 my ( $alias, $name ) = splice( @_, 0, 2 );
83 unshift( @Alias, $alias => $name ); # newer one has precedence
84 if ( ref($alias) ) {
85
86 # clear %Alias cache to allow overrides
87 my @a = keys %Alias;
88 for my $k (@a) {
89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
90 DEBUG and warn "delete \$Alias\{$k\}";
91 delete $Alias{$k};
92 }
93 elsif ( ref($alias) eq 'CODE' ) {
94 DEBUG and warn "delete \$Alias\{$k\}";
95 delete $Alias{ $alias->($name) };
96 }
97 }
98 }
99 else {
100 DEBUG and warn "delete \$Alias\{$alias\}";
101 delete $Alias{$alias};
102 }
103 }
104}
105
106# Allow latin-1 style names as well
107# 0 1 2 3 4 5 6 7 8 9 10
108our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
109
110# Allow winlatin1 style names as well
111our %Winlatin2cp = (
112 'latin1' => 1252,
113 'latin2' => 1250,
114 'cyrillic' => 1251,
115 'greek' => 1253,
116 'turkish' => 1254,
117 'hebrew' => 1255,
118 'arabic' => 1256,
119 'baltic' => 1257,
120 'vietnamese' => 1258,
121);
122
123init_aliases();
124
125sub undef_aliases {
126 @Alias = ();
127 %Alias = ();
128}
129
130sub init_aliases {
131 require Encode;
132 undef_aliases();
133
134 # Try all-lower-case version should all else fails
135 define_alias( qr/^(.*)$/ => '"\L$1"' );
136
137 # UTF/UCS stuff
138 define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
139 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
140 define_alias(
141 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
142 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
143 qr/^iso-10646-1$/i => '"UCS-2BE"'
144 );
145 define_alias(
146 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
147 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
148 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
149 );
150
151 # ASCII
152 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
153 define_alias( 'C' => 'ascii' );
154 define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
155
156 # Allow variants of iso-8859-1 etc.
157 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
158
159 # At least HP-UX has these.
160 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
161
162 # More HP stuff.
163 define_alias(
164 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
165 '"${1}8"' );
166
167 # The Official name of ASCII.
168 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
169
170 # This is a font issue, not an encoding issue.
171 # (The currency symbol of the Latin 1 upper half
172 # has been redefined as the euro symbol.)
173 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
174
175 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
176'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
177 );
178
179 define_alias(
180 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
181 hebrew|arabic|baltic|vietnamese)$/ix =>
182 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
183 );
184
185 # Common names for non-latin preferred MIME names
186 define_alias(
187 'ascii' => 'US-ascii',
188 'cyrillic' => 'iso-8859-5',
189 'arabic' => 'iso-8859-6',
190 'greek' => 'iso-8859-7',
191 'hebrew' => 'iso-8859-8',
192 'thai' => 'iso-8859-11',
193 );
194 # RT #20781
195 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
196
197 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
198 # And Microsoft has their own naming (again, surprisingly).
199 # And windows-* is registered in IANA!
200 define_alias(
201 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
202
203 # Sometimes seen with a leading zero.
204 # define_alias( qr/\bcp037\b/i => '"cp37"');
205
206 # Mac Mappings
207 # predefined in *.ucm; unneeded
208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
209 define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
210 # http://rt.cpan.org/Ticket/Display.html?id=36326
211 define_alias( qr/^macintosh$/i => '"MacRoman"' );
212
213 # Ououououou. gone. They are differente!
214 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
215
216 # Standardize on the dashed versions.
217 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
218
219 unless ($Encode::ON_EBCDIC) {
220
221 # for Encode::CN
222 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
223 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
224
225 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
226 # CP936 doesn't have vendor-addon for GBK, so they're identical.
227 define_alias( qr/^gbk$/i => '"cp936"' );
228
229 # This fixes gb2312 vs. euc-cn confusion, practically
230 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
231
232 # for Encode::JP
233 define_alias( qr/\bjis$/i => '"7bit-jis"' );
234 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
235 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
236 define_alias( qr/\bujis$/i => '"euc-jp"' );
237 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
238 define_alias( qr/\bsjis$/i => '"shiftjis"' );
239 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
240
241 # for Encode::KR
242 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
243 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
244
245 # This fixes ksc5601 vs. euc-kr confusion, practically
246 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
247 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
248 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
249
250 # for Encode::TW
251 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
252 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
253 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
254 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
255 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
256 }
257
258 # utf8 is blessed :)
259 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
260
261 # At last, Map white space and _ to '-'
262 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
263}
264
2651;
266__END__
267
268# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
269# TODO: HP-UX '15' encodings japanese15 korean15 roi15
270# TODO: Cyrillic encoding ISO-IR-111 (useful?)
271# TODO: Armenian encoding ARMSCII-8
272# TODO: Hebrew encoding ISO-8859-8-1
273# TODO: Thai encoding TCVN
274# TODO: Vietnamese encodings VPS
275# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
276# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
277# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
278# Kannada Khmer Korean Laotian Malayalam Mongolian
279# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
280
281=head1 NAME
282
283Encode::Alias - alias definitions to encodings
284
285=head1 SYNOPSIS
286
287 use Encode;
288 use Encode::Alias;
289 define_alias( newName => ENCODING);
290
291=head1 DESCRIPTION
292
293Allows newName to be used as an alias for ENCODING. ENCODING may be
294either the name of an encoding or an encoding object (as described
295in L<Encode>).
296
297Currently I<newName> can be specified in the following ways:
298
299=over 4
300
301=item As a simple string.
302
303=item As a qr// compiled regular expression, e.g.:
304
305 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
306
307In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
308in order to allow C<$1> etc. to be substituted. The example is one
309way to alias names as used in X11 fonts to the MIME names for the
310iso-8859-* family. Note the double quotes inside the single quotes.
311
312(or, you don't have to do this yourself because this example is predefined)
313
314If you are using a regex here, you have to use the quotes as shown or
315it won't work. Also note that regex handling is tricky even for the
316experienced. Use this feature with caution.
317
318=item As a code reference, e.g.:
319
320 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
321
322The same effect as the example above in a different way. The coderef
323takes the alias name as an argument and returns a canonical name on
324success or undef if not. Note the second argument is not required.
325Use this with even more caution than the regex version.
326
327=back
328
329=head3 Changes in code reference aliasing
330
331As of Encode 1.87, the older form
332
333 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
334
335no longer works.
336
337Encode up to 1.86 internally used "local $_" to implement ths older
338form. But consider the code below;
339
340 use Encode;
341 $_ = "eeeee" ;
342 while (/(e)/g) {
343 my $utf = decode('aliased-encoding-name', $1);
344 print "position:",pos,"\n";
345 }
346
347Prior to Encode 1.86 this fails because of "local $_".
348
349=head2 Alias overloading
350
351You can override predefined aliases by simply applying define_alias().
352The new alias is always evaluated first, and when necessary,
353define_alias() flushes the internal cache to make the new definition
354available.
355
356 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
357 # superset of SHIFT_JIS
358
359 define_alias( qr/shift.*jis$/i => '"cp932"' );
360 define_alias( qr/sjis$/i => '"cp932"' );
361
362If you want to zap all predefined aliases, you can use
363
364 Encode::Alias->undef_aliases;
365
366to do so. And
367
368 Encode::Alias->init_aliases;
369
370gets the factory settings back.
371
372=head1 SEE ALSO
373
374L<Encode>, L<Encode::Supported>
375
376=cut
377