4 our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
8 our @ISA = qw(Exporter);
10 # Public, encouraged API is exported by default
18 our @Alias; # ordered matching list
19 our %Alias; # cached known aliases
25 unless (exists $Alias{$_})
27 for (my $i=0; $i < @Alias; $i += 2)
29 my $alias = $Alias[$i];
30 my $val = $Alias[$i+1];
32 if (ref($alias) eq 'Regexp' && $_ =~ $alias)
34 $DEBUG and warn "eval $val";
36 # $@ and warn "$val, $@";
38 elsif (ref($alias) eq 'CODE')
40 $DEBUG and warn "$alias", "->", "($val)";
41 $new = $alias->($val);
43 elsif (lc($_) eq lc($alias))
49 next if $new eq $_; # avoid (direct) recursion on bugs
50 $DEBUG and warn "$alias, $new";
51 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
62 if (my $e = $Alias{$_}){
67 warn "find_alias($class, $_)->name = $name";
76 my ($alias,$name) = splice(@_,0,2);
77 unshift(@Alias, $alias => $name); # newer one has precedence
78 # clear %Alias cache to allow overrides
82 if (ref($alias) eq 'Regexp' && $k =~ $alias)
84 $DEBUG and warn "delete \$Alias\{$k\}";
87 elsif (ref($alias) eq 'CODE')
89 $DEBUG and warn "delete \$Alias\{$k\}";
90 delete $Alias{$alias->($name)};
94 $DEBUG and warn "delete \$Alias\{$alias\}";
95 delete $Alias{$alias};
100 # Allow latin-1 style names as well
101 # 0 1 2 3 4 5 6 7 8 9 10
102 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
103 # Allow winlatin1 style names as well
113 'vietnamese' => 1258,
127 # Try all-lower-case version should anything fails
128 define_alias( qr/^(.*)$/ => '"\L$1"' );
130 # Moved from Encode::utf8
131 define_alias( qr/^UTF-8$/i => '"utf8"',);
132 # Moved from Encode::ucs2_le
133 define_alias( qr/^UCS-2LE$/i => '"UTF-16LE"',
134 qr/^UCS2-le$/i => '"UCS-2"', );
135 # Moved from Encode::10464_1
136 define_alias( qr/^UTF-16BE$/i => '"UCS-2"',
137 qr/^UCS2$/i => '"UCS-2"',
138 qr/^iso-10646-1$/i => '"UCS-2"', );
139 # Moved from ascii.ucm
140 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
141 # 'C' => 'US-ascii' so you can feed default locale directly.
142 define_alias('C' => 'ascii');
143 # Allow variants of iso-8859-1 etc.
144 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
146 # At least HP-UX has these.
147 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
150 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
152 # The Official name of ASCII.
153 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
155 # This is a font issue, not an encoding issue.
156 # (The currency symbol of the Latin 1 upper half
157 # has been redefined as the euro symbol.)
158 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
160 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
161 => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' );
163 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
164 hebrew|arabic|baltic|vietnamese)$/ix =>
165 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
167 # Common names for non-latin prefered MIME names
168 define_alias( 'ascii' => 'US-ascii',
169 'cyrillic' => 'iso-8859-5',
170 'arabic' => 'iso-8859-6',
171 'greek' => 'iso-8859-7',
172 'hebrew' => 'iso-8859-8',
173 'thai' => 'iso-8859-11',
174 'tis620' => 'iso-8859-11',
177 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
178 # And Microsoft has their own naming (again, surprisingly).
179 # And windows-* is registered in IANA!
180 define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"');
182 # Sometimes seen with a leading zero.
183 define_alias( qr/\bcp037\b/i => '"cp37"');
186 # predefined in *.ucm; unneeded
187 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
188 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
189 # Ououououou. gone. They are differente!
190 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
192 # Standardize on the dashed versions.
193 # define_alias( qr/\butf8$/i => 'utf-8' );
194 define_alias( qr/\bkoi8r$/i => 'koi8-r' );
195 define_alias( qr/\bkoi8u$/i => 'koi8-u' );
197 unless ($Encode::ON_EBCDIC){
199 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
200 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
201 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
202 # CP936 doesn't have vendor-addon for GBK, so they're identical.
203 define_alias( qr/^gbk$/i => '"cp936"');
204 # This fixes gb2312 vs. euc-cn confusion, practically
205 define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
207 define_alias( qr/\bjis$/i => '"7bit-jis"' );
208 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
209 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
210 define_alias( qr/\bujis$/i => '"euc-jp"' );
211 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
212 define_alias( qr/\bsjis$/i => '"shiftjis"' );
214 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
215 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
216 # This fixes ksc5601 vs. euc-kr confusion, practically
217 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
218 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
219 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
221 define_alias( qr/\bbig-?5$/i => '"big5"' );
222 define_alias( qr/\bbig5-hk(?:scs)?$/i => '"big5-hkscs"' );
224 # At last, Map white space and _ to '-'
225 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
231 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
232 # TODO: HP-UX '15' encodings japanese15 korean15 roi15
233 # TODO: Cyrillic encoding ISO-IR-111 (useful?)
234 # TODO: Armenian encoding ARMSCII-8
235 # TODO: Hebrew encoding ISO-8859-8-1
236 # TODO: Thai encoding TCVN
237 # TODO: Vietnamese encodings VPS
238 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
239 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
240 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
241 # Kannada Khmer Korean Laotian Malayalam Mongolian
242 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
246 Encode::Alias - alias defintions to encodings
252 define_alias( newName => ENCODING);
256 Allows newName to be used as an alias for ENCODING. ENCODING may be
257 either the name of an encoding or an encoding object (as described
260 Currently I<newName> can be specified in the following ways:
264 =item As a simple string.
266 =item As a qr// compiled regular expression, e.g.:
268 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
270 In this case if I<ENCODING> is not a reference it is C<eval>-ed to
271 allow C<$1> etc. to be substituted. The example is one way to alias
272 names as used in X11 fonts to the MIME names for the iso-8859-*
273 family. Note the double quote inside the single quote.
275 If you are using a regex here, you have to use the quotes as shown or
276 it won't work. Also note that regex handling is tricky even for the
277 experienced. Use it with caution.
279 =item As a code reference, e.g.:
281 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
284 In this case C<$_> will be set to the name that is being looked up and
285 I<ENCODING> is passed to the sub as its first argument. The example
286 is another way to alias names as used in X11 fonts to the MIME names
287 for the iso-8859-* family.
291 =head2 Alias overloading
293 You can override predefined aliases by simply applying define_alias().
294 New alias is always evaluated first and when neccessary define_alias()
295 flushes internal cache to make new definition available.
297 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
298 # superset of SHIFT_JIS
300 define_alias( qr/shift.*jis$/i => '"cp932"' );
301 define_alias( qr/sjis$/i => '"cp932"' );
303 If you want to zap all predefined aliases, you can
305 Encode::Alias->undef_aliases;
309 Encode::Alias->init_aliases;
311 gets factory setting back.
316 L<Encode>, L<Encode::Supported>