Commit | Line | Data |
---|---|---|
5d030b67 JH |
1 | package Encode::Alias; |
2 | use strict; | |
5129552c | 3 | use Encode; |
126bf8bf | 4 | our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
5d030b67 | 5 | our $DEBUG = 0; |
5d030b67 | 6 | |
10c5ecbb | 7 | use base qw(Exporter); |
5d030b67 JH |
8 | |
9 | # Public, encouraged API is exported by default | |
5129552c | 10 | |
fcb875d4 | 11 | our @EXPORT = |
5129552c JH |
12 | qw ( |
13 | define_alias | |
14 | find_alias | |
15 | ); | |
5d030b67 JH |
16 | |
17 | our @Alias; # ordered matching list | |
18 | our %Alias; # cached known aliases | |
19 | ||
5129552c | 20 | sub find_alias |
5d030b67 JH |
21 | { |
22 | my $class = shift; | |
23 | local $_ = shift; | |
24 | unless (exists $Alias{$_}) | |
25 | { | |
eaac0a15 | 26 | $Alias{$_} = undef; # Recursion guard |
5d030b67 JH |
27 | for (my $i=0; $i < @Alias; $i += 2) |
28 | { | |
29 | my $alias = $Alias[$i]; | |
30 | my $val = $Alias[$i+1]; | |
31 | my $new; | |
32 | if (ref($alias) eq 'Regexp' && $_ =~ $alias) | |
33 | { | |
a63c962f | 34 | $DEBUG and warn "eval $val"; |
5d030b67 JH |
35 | $new = eval $val; |
36 | # $@ and warn "$val, $@"; | |
37 | } | |
38 | elsif (ref($alias) eq 'CODE') | |
39 | { | |
a63c962f | 40 | $DEBUG and warn "$alias", "->", "($val)"; |
5d030b67 JH |
41 | $new = $alias->($val); |
42 | } | |
43 | elsif (lc($_) eq lc($alias)) | |
44 | { | |
45 | $new = $val; | |
46 | } | |
47 | if (defined($new)) | |
48 | { | |
49 | next if $new eq $_; # avoid (direct) recursion on bugs | |
a63c962f | 50 | $DEBUG and warn "$alias, $new"; |
5d030b67 JH |
51 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new); |
52 | if ($enc) | |
53 | { | |
54 | $Alias{$_} = $enc; | |
55 | last; | |
56 | } | |
57 | } | |
58 | } | |
59 | } | |
a63c962f JH |
60 | if ($DEBUG){ |
61 | my $name; | |
62 | if (my $e = $Alias{$_}){ | |
63 | $name = $e->name; | |
64 | }else{ | |
65 | $name = ""; | |
66 | } | |
67 | warn "find_alias($class, $_)->name = $name"; | |
68 | } | |
5d030b67 JH |
69 | return $Alias{$_}; |
70 | } | |
71 | ||
72 | sub define_alias | |
73 | { | |
74 | while (@_) | |
75 | { | |
76 | my ($alias,$name) = splice(@_,0,2); | |
77 | unshift(@Alias, $alias => $name); # newer one has precedence | |
78 | # clear %Alias cache to allow overrides | |
79 | if (ref($alias)){ | |
5129552c JH |
80 | my @a = keys %Alias; |
81 | for my $k (@a){ | |
5d030b67 JH |
82 | if (ref($alias) eq 'Regexp' && $k =~ $alias) |
83 | { | |
a63c962f | 84 | $DEBUG and warn "delete \$Alias\{$k\}"; |
5d030b67 JH |
85 | delete $Alias{$k}; |
86 | } | |
87 | elsif (ref($alias) eq 'CODE') | |
88 | { | |
a63c962f | 89 | $DEBUG and warn "delete \$Alias\{$k\}"; |
5d030b67 JH |
90 | delete $Alias{$alias->($name)}; |
91 | } | |
92 | } | |
93 | }else{ | |
a63c962f | 94 | $DEBUG and warn "delete \$Alias\{$alias\}"; |
5d030b67 JH |
95 | delete $Alias{$alias}; |
96 | } | |
97 | } | |
98 | } | |
99 | ||
5d030b67 JH |
100 | # Allow latin-1 style names as well |
101 | # 0 1 2 3 4 5 6 7 8 9 10 | |
102 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); | |
5d030b67 JH |
103 | # Allow winlatin1 style names as well |
104 | our %Winlatin2cp = ( | |
105 | 'latin1' => 1252, | |
106 | 'latin2' => 1250, | |
107 | 'cyrillic' => 1251, | |
108 | 'greek' => 1253, | |
109 | 'turkish' => 1254, | |
110 | 'hebrew' => 1255, | |
111 | 'arabic' => 1256, | |
112 | 'baltic' => 1257, | |
113 | 'vietnamese' => 1258, | |
114 | ); | |
115 | ||
5129552c JH |
116 | init_aliases(); |
117 | ||
118 | sub undef_aliases{ | |
119 | @Alias = (); | |
120 | %Alias = (); | |
121 | } | |
122 | ||
123 | sub init_aliases | |
124 | { | |
125 | undef_aliases(); | |
a999c27c | 126 | |
f2a2953c | 127 | # Try all-lower-case version should all else fails |
a999c27c JH |
128 | define_alias( qr/^(.*)$/ => '"\L$1"' ); |
129 | ||
f2a2953c | 130 | # UTF/UCS stuff |
11067275 JH |
131 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); |
132 | define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', | |
126bf8bf | 133 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', |
11067275 JH |
134 | qr/^iso-10646-1$/i => '"UCS-2BE"' ); |
135 | define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"', | |
136 | qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"', | |
137 | qr/^UTF(16|32)$/i => '"UTF-$1"', | |
f2a2953c JH |
138 | ); |
139 | # ASCII | |
a999c27c | 140 | define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); |
a999c27c | 141 | define_alias('C' => 'ascii'); |
67d7b5ef JH |
142 | # Allow variants of iso-8859-1 etc. |
143 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); | |
5129552c | 144 | |
67d7b5ef JH |
145 | # At least HP-UX has these. |
146 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); | |
5129552c | 147 | |
67d7b5ef JH |
148 | # More HP stuff. |
149 | define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); | |
5129552c | 150 | |
67d7b5ef JH |
151 | # The Official name of ASCII. |
152 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); | |
5129552c | 153 | |
67d7b5ef JH |
154 | # This is a font issue, not an encoding issue. |
155 | # (The currency symbol of the Latin 1 upper half | |
156 | # has been redefined as the euro symbol.) | |
5129552c JH |
157 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); |
158 | ||
fcb875d4 | 159 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i |
5129552c JH |
160 | => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' ); |
161 | ||
67d7b5ef | 162 | define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| |
fcb875d4 | 163 | hebrew|arabic|baltic|vietnamese)$/ix => |
5129552c | 164 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); |
5d030b67 | 165 | |
67d7b5ef | 166 | # Common names for non-latin prefered MIME names |
5129552c JH |
167 | define_alias( 'ascii' => 'US-ascii', |
168 | 'cyrillic' => 'iso-8859-5', | |
169 | 'arabic' => 'iso-8859-6', | |
170 | 'greek' => 'iso-8859-7', | |
171 | 'hebrew' => 'iso-8859-8', | |
172 | 'thai' => 'iso-8859-11', | |
173 | 'tis620' => 'iso-8859-11', | |
174 | ); | |
5d030b67 | 175 | |
67d7b5ef JH |
176 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. |
177 | # And Microsoft has their own naming (again, surprisingly). | |
fcb875d4 | 178 | # And windows-* is registered in IANA! |
67d7b5ef | 179 | define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"'); |
5d030b67 | 180 | |
67d7b5ef | 181 | # Sometimes seen with a leading zero. |
c731e18e | 182 | # define_alias( qr/\bcp037\b/i => '"cp37"'); |
5d030b67 | 183 | |
3ef515df | 184 | # Mac Mappings |
a999c27c JH |
185 | # predefined in *.ucm; unneeded |
186 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); | |
3ef515df | 187 | define_alias( qr/^mac_(.*)$/i => '"mac$1"'); |
a999c27c JH |
188 | # Ououououou. gone. They are differente! |
189 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); | |
fcb875d4 | 190 | |
f2a2953c | 191 | # Standardize on the dashed versions. |
67d7b5ef JH |
192 | # define_alias( qr/\butf8$/i => 'utf-8' ); |
193 | define_alias( qr/\bkoi8r$/i => 'koi8-r' ); | |
194 | define_alias( qr/\bkoi8u$/i => 'koi8-u' ); | |
5129552c | 195 | |
a63c962f JH |
196 | unless ($Encode::ON_EBCDIC){ |
197 | # for Encode::CN | |
67d7b5ef JH |
198 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); |
199 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); | |
200 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) | |
201 | # CP936 doesn't have vendor-addon for GBK, so they're identical. | |
202 | define_alias( qr/^gbk$/i => '"cp936"'); | |
203 | # This fixes gb2312 vs. euc-cn confusion, practically | |
204 | define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' ); | |
205 | # for Encode::JP | |
206 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); | |
207 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); | |
208 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); | |
209 | define_alias( qr/\bujis$/i => '"euc-jp"' ); | |
210 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); | |
211 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); | |
a63c962f | 212 | # for Encode::KR |
67d7b5ef JH |
213 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); |
214 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); | |
215 | # This fixes ksc5601 vs. euc-kr confusion, practically | |
216 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); | |
217 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); | |
218 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); | |
a63c962f | 219 | # for Encode::TW |
b0b300a3 JH |
220 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); |
221 | define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' ); | |
222 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); | |
a63c962f | 223 | } |
f2a2953c JH |
224 | # utf8 is blessed :) |
225 | define_alias( qr/^UTF-8$/i => '"utf8"',); | |
67d7b5ef | 226 | # At last, Map white space and _ to '-' |
5129552c JH |
227 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); |
228 | } | |
229 | ||
230 | 1; | |
231 | __END__ | |
5d030b67 JH |
232 | |
233 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 | |
234 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 | |
235 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) | |
236 | # TODO: Armenian encoding ARMSCII-8 | |
237 | # TODO: Hebrew encoding ISO-8859-8-1 | |
238 | # TODO: Thai encoding TCVN | |
5d030b67 JH |
239 | # TODO: Vietnamese encodings VPS |
240 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese | |
241 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic | |
242 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese | |
243 | # Kannada Khmer Korean Laotian Malayalam Mongolian | |
244 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese | |
245 | ||
5d030b67 JH |
246 | =head1 NAME |
247 | ||
ce912cd4 | 248 | Encode::Alias - alias definitions to encodings |
5d030b67 JH |
249 | |
250 | =head1 SYNOPSIS | |
251 | ||
5129552c JH |
252 | use Encode; |
253 | use Encode::Alias; | |
5d030b67 JH |
254 | define_alias( newName => ENCODING); |
255 | ||
256 | =head1 DESCRIPTION | |
257 | ||
3ef515df | 258 | Allows newName to be used as an alias for ENCODING. ENCODING may be |
fcb875d4 | 259 | either the name of an encoding or an encoding object (as described |
3ef515df | 260 | in L<Encode>). |
5d030b67 JH |
261 | |
262 | Currently I<newName> can be specified in the following ways: | |
263 | ||
264 | =over 4 | |
265 | ||
266 | =item As a simple string. | |
267 | ||
268 | =item As a qr// compiled regular expression, e.g.: | |
269 | ||
270 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); | |
271 | ||
0ab8f81e JH |
272 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed |
273 | in order to allow C<$1> etc. to be substituted. The example is one | |
274 | way to alias names as used in X11 fonts to the MIME names for the | |
275 | iso-8859-* family. Note the double quotes inside the single quotes. | |
5d030b67 | 276 | |
3ef515df JH |
277 | If you are using a regex here, you have to use the quotes as shown or |
278 | it won't work. Also note that regex handling is tricky even for the | |
279 | experienced. Use it with caution. | |
5d030b67 JH |
280 | |
281 | =item As a code reference, e.g.: | |
282 | ||
283 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , ''); | |
284 | ||
0ab8f81e | 285 | In this case, C<$_> will be set to the name that is being looked up and |
5d030b67 | 286 | I<ENCODING> is passed to the sub as its first argument. The example |
3ef515df JH |
287 | is another way to alias names as used in X11 fonts to the MIME names |
288 | for the iso-8859-* family. | |
5d030b67 | 289 | |
5129552c JH |
290 | =back |
291 | ||
0ab8f81e | 292 | =head2 Alias overloading |
5d030b67 | 293 | |
3ef515df | 294 | You can override predefined aliases by simply applying define_alias(). |
0ab8f81e JH |
295 | The new alias is always evaluated first, and when neccessary, |
296 | define_alias() flushes the internal cache to make the new definition | |
297 | available. | |
5d030b67 | 298 | |
0ab8f81e | 299 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a |
5d030b67 JH |
300 | # superset of SHIFT_JIS |
301 | ||
5129552c JH |
302 | define_alias( qr/shift.*jis$/i => '"cp932"' ); |
303 | define_alias( qr/sjis$/i => '"cp932"' ); | |
304 | ||
0ab8f81e | 305 | If you want to zap all predefined aliases, you can use |
5129552c JH |
306 | |
307 | Encode::Alias->undef_aliases; | |
308 | ||
309 | to do so. And | |
310 | ||
311 | Encode::Alias->init_aliases; | |
312 | ||
0ab8f81e | 313 | gets the factory settings back. |
5d030b67 JH |
314 | |
315 | =head1 SEE ALSO | |
316 | ||
317 | L<Encode>, L<Encode::Supported> | |
318 | ||
5129552c | 319 | =cut |
5d030b67 | 320 |