Commit | Line | Data |
---|---|---|
5d030b67 JH |
1 | package Encode::Alias; |
2 | use strict; | |
656ebd29 | 3 | use warnings; |
1485817e | 4 | no warnings 'redefine'; |
64bc6d54 | 5 | our $VERSION = do { my @r = ( q$Revision: 2.11 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
8f139f4c | 6 | sub DEBUG () { 0 } |
5d030b67 | 7 | |
10c5ecbb | 8 | use base qw(Exporter); |
5d030b67 JH |
9 | |
10 | # Public, encouraged API is exported by default | |
5129552c | 11 | |
d1256cb1 RGS |
12 | our @EXPORT = |
13 | qw ( | |
14 | define_alias | |
15 | find_alias | |
16 | ); | |
5d030b67 | 17 | |
d1256cb1 RGS |
18 | our @Alias; # ordered matching list |
19 | our %Alias; # cached known aliases | |
5d030b67 | 20 | |
d1256cb1 | 21 | sub find_alias { |
c0abe5aa | 22 | require Encode; |
5d030b67 | 23 | my $class = shift; |
d1256cb1 RGS |
24 | my $find = shift; |
25 | unless ( exists $Alias{$find} ) { | |
26 | $Alias{$find} = undef; # Recursion guard | |
27 | for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { | |
28 | my $alias = $Alias[$i]; | |
29 | my $val = $Alias[ $i + 1 ]; | |
30 | my $new; | |
31 | if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { | |
32 | DEBUG and warn "eval $val"; | |
33 | $new = eval $val; | |
34 | DEBUG and $@ and warn "$val, $@"; | |
35 | } | |
36 | elsif ( ref($alias) eq 'CODE' ) { | |
37 | DEBUG and warn "$alias", "->", "($find)"; | |
38 | $new = $alias->($find); | |
39 | } | |
40 | elsif ( lc($find) eq lc($alias) ) { | |
41 | $new = $val; | |
42 | } | |
43 | if ( defined($new) ) { | |
44 | next if $new eq $find; # avoid (direct) recursion on bugs | |
45 | DEBUG and warn "$alias, $new"; | |
46 | my $enc = | |
47 | ( ref($new) ) ? $new : Encode::find_encoding($new); | |
48 | if ($enc) { | |
49 | $Alias{$find} = $enc; | |
50 | last; | |
51 | } | |
52 | } | |
53 | } | |
54 | ||
55 | # case insensitive search when canonical is not in all lowercase | |
56 | # RT ticket #7835 | |
57 | unless ( $Alias{$find} ) { | |
58 | my $lcfind = lc($find); | |
59 | for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) | |
60 | { | |
61 | $lcfind eq lc($name) or next; | |
62 | $Alias{$find} = Encode::find_encoding($name); | |
63 | DEBUG and warn "$find => $name"; | |
64 | } | |
65 | } | |
5d030b67 | 66 | } |
d1256cb1 RGS |
67 | if (DEBUG) { |
68 | my $name; | |
69 | if ( my $e = $Alias{$find} ) { | |
70 | $name = $e->name; | |
71 | } | |
72 | else { | |
73 | $name = ""; | |
74 | } | |
75 | warn "find_alias($class, $find)->name = $name"; | |
a63c962f | 76 | } |
151b5d36 | 77 | return $Alias{$find}; |
5d030b67 JH |
78 | } |
79 | ||
d1256cb1 RGS |
80 | sub define_alias { |
81 | while (@_) { | |
82 | my ( $alias, $name ) = splice( @_, 0, 2 ); | |
83 | unshift( @Alias, $alias => $name ); # newer one has precedence | |
84 | if ( ref($alias) ) { | |
85 | ||
86 | # clear %Alias cache to allow overrides | |
87 | my @a = keys %Alias; | |
88 | for my $k (@a) { | |
89 | if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { | |
90 | DEBUG and warn "delete \$Alias\{$k\}"; | |
91 | delete $Alias{$k}; | |
92 | } | |
93 | elsif ( ref($alias) eq 'CODE' ) { | |
94 | DEBUG and warn "delete \$Alias\{$k\}"; | |
95 | delete $Alias{ $alias->($name) }; | |
96 | } | |
97 | } | |
98 | } | |
99 | else { | |
100 | DEBUG and warn "delete \$Alias\{$alias\}"; | |
101 | delete $Alias{$alias}; | |
102 | } | |
5d030b67 JH |
103 | } |
104 | } | |
105 | ||
5d030b67 | 106 | # Allow latin-1 style names as well |
8f1ed24a | 107 | # 0 1 2 3 4 5 6 7 8 9 10 |
5d030b67 | 108 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); |
d1256cb1 | 109 | |
5d030b67 | 110 | # Allow winlatin1 style names as well |
d1256cb1 RGS |
111 | our %Winlatin2cp = ( |
112 | 'latin1' => 1252, | |
113 | 'latin2' => 1250, | |
114 | 'cyrillic' => 1251, | |
115 | 'greek' => 1253, | |
116 | 'turkish' => 1254, | |
117 | 'hebrew' => 1255, | |
118 | 'arabic' => 1256, | |
119 | 'baltic' => 1257, | |
120 | 'vietnamese' => 1258, | |
121 | ); | |
5d030b67 | 122 | |
5129552c JH |
123 | init_aliases(); |
124 | ||
d1256cb1 | 125 | sub undef_aliases { |
5129552c JH |
126 | @Alias = (); |
127 | %Alias = (); | |
128 | } | |
129 | ||
d1256cb1 | 130 | sub init_aliases { |
c0abe5aa | 131 | require Encode; |
5129552c | 132 | undef_aliases(); |
d1256cb1 | 133 | |
f2a2953c | 134 | # Try all-lower-case version should all else fails |
a999c27c JH |
135 | define_alias( qr/^(.*)$/ => '"\L$1"' ); |
136 | ||
f2a2953c | 137 | # UTF/UCS stuff |
64bc6d54 | 138 | define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' ); |
d1256cb1 RGS |
139 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); |
140 | define_alias( | |
141 | qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', | |
142 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', | |
143 | qr/^iso-10646-1$/i => '"UCS-2BE"' | |
144 | ); | |
145 | define_alias( | |
146 | qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', | |
147 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', | |
148 | qr/^UTF-?(16|32)$/i => '"UTF-$1"', | |
149 | ); | |
150 | ||
f2a2953c | 151 | # ASCII |
d1256cb1 RGS |
152 | define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); |
153 | define_alias( 'C' => 'ascii' ); | |
ba40575b | 154 | define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' ); |
d1256cb1 | 155 | |
67d7b5ef JH |
156 | # Allow variants of iso-8859-1 etc. |
157 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); | |
5129552c | 158 | |
67d7b5ef JH |
159 | # At least HP-UX has these. |
160 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); | |
5129552c | 161 | |
67d7b5ef | 162 | # More HP stuff. |
d1256cb1 RGS |
163 | define_alias( |
164 | qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => | |
165 | '"${1}8"' ); | |
5129552c | 166 | |
67d7b5ef JH |
167 | # The Official name of ASCII. |
168 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); | |
5129552c | 169 | |
67d7b5ef JH |
170 | # This is a font issue, not an encoding issue. |
171 | # (The currency symbol of the Latin 1 upper half | |
172 | # has been redefined as the euro symbol.) | |
5129552c JH |
173 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); |
174 | ||
d1256cb1 RGS |
175 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => |
176 | 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' | |
177 | ); | |
5129552c | 178 | |
d1256cb1 RGS |
179 | define_alias( |
180 | qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| | |
181 | hebrew|arabic|baltic|vietnamese)$/ix => | |
182 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' | |
183 | ); | |
5d030b67 | 184 | |
3c4b39be | 185 | # Common names for non-latin preferred MIME names |
d1256cb1 RGS |
186 | define_alias( |
187 | 'ascii' => 'US-ascii', | |
188 | 'cyrillic' => 'iso-8859-5', | |
189 | 'arabic' => 'iso-8859-6', | |
190 | 'greek' => 'iso-8859-7', | |
191 | 'hebrew' => 'iso-8859-8', | |
192 | 'thai' => 'iso-8859-11', | |
d1256cb1 | 193 | ); |
51e4e64d NC |
194 | # RT #20781 |
195 | define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); | |
5d030b67 | 196 | |
67d7b5ef JH |
197 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. |
198 | # And Microsoft has their own naming (again, surprisingly). | |
d1256cb1 RGS |
199 | # And windows-* is registered in IANA! |
200 | define_alias( | |
201 | qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); | |
5d030b67 | 202 | |
67d7b5ef | 203 | # Sometimes seen with a leading zero. |
c731e18e | 204 | # define_alias( qr/\bcp037\b/i => '"cp37"'); |
5d030b67 | 205 | |
3ef515df | 206 | # Mac Mappings |
a999c27c JH |
207 | # predefined in *.ucm; unneeded |
208 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); | |
d1256cb1 | 209 | define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); |
2fd0906e SH |
210 | # http://rt.cpan.org/Ticket/Display.html?id=36326 |
211 | define_alias( qr/^macintosh$/i => '"MacRoman"' ); | |
d1256cb1 | 212 | |
a999c27c JH |
213 | # Ououououou. gone. They are differente! |
214 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); | |
d1256cb1 | 215 | |
f2a2953c | 216 | # Standardize on the dashed versions. |
cf9f87ce | 217 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); |
5129552c | 218 | |
d1256cb1 RGS |
219 | unless ($Encode::ON_EBCDIC) { |
220 | ||
a63c962f | 221 | # for Encode::CN |
d1256cb1 RGS |
222 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); |
223 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); | |
224 | ||
225 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) | |
226 | # CP936 doesn't have vendor-addon for GBK, so they're identical. | |
227 | define_alias( qr/^gbk$/i => '"cp936"' ); | |
228 | ||
229 | # This fixes gb2312 vs. euc-cn confusion, practically | |
230 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); | |
231 | ||
232 | # for Encode::JP | |
233 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); | |
234 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); | |
235 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); | |
236 | define_alias( qr/\bujis$/i => '"euc-jp"' ); | |
237 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); | |
238 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); | |
239 | define_alias( qr/\bwindows-31j$/i => '"cp932"' ); | |
240 | ||
a63c962f | 241 | # for Encode::KR |
d1256cb1 RGS |
242 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); |
243 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); | |
244 | ||
245 | # This fixes ksc5601 vs. euc-kr confusion, practically | |
246 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); | |
247 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); | |
248 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); | |
249 | ||
a63c962f | 250 | # for Encode::TW |
d1256cb1 RGS |
251 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); |
252 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); | |
253 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); | |
254 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); | |
255 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); | |
a63c962f | 256 | } |
d1256cb1 | 257 | |
f2a2953c | 258 | # utf8 is blessed :) |
d1256cb1 RGS |
259 | define_alias( qr/^UTF-8$/i => '"utf-8-strict"' ); |
260 | ||
67d7b5ef | 261 | # At last, Map white space and _ to '-' |
5129552c JH |
262 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); |
263 | } | |
264 | ||
265 | 1; | |
266 | __END__ | |
5d030b67 JH |
267 | |
268 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 | |
269 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 | |
270 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) | |
271 | # TODO: Armenian encoding ARMSCII-8 | |
272 | # TODO: Hebrew encoding ISO-8859-8-1 | |
273 | # TODO: Thai encoding TCVN | |
5d030b67 JH |
274 | # TODO: Vietnamese encodings VPS |
275 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese | |
276 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic | |
277 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese | |
278 | # Kannada Khmer Korean Laotian Malayalam Mongolian | |
279 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese | |
280 | ||
5d030b67 JH |
281 | =head1 NAME |
282 | ||
ce912cd4 | 283 | Encode::Alias - alias definitions to encodings |
5d030b67 JH |
284 | |
285 | =head1 SYNOPSIS | |
286 | ||
5129552c JH |
287 | use Encode; |
288 | use Encode::Alias; | |
5d030b67 JH |
289 | define_alias( newName => ENCODING); |
290 | ||
291 | =head1 DESCRIPTION | |
292 | ||
3ef515df | 293 | Allows newName to be used as an alias for ENCODING. ENCODING may be |
fcb875d4 | 294 | either the name of an encoding or an encoding object (as described |
3ef515df | 295 | in L<Encode>). |
5d030b67 JH |
296 | |
297 | Currently I<newName> can be specified in the following ways: | |
298 | ||
299 | =over 4 | |
300 | ||
301 | =item As a simple string. | |
302 | ||
303 | =item As a qr// compiled regular expression, e.g.: | |
304 | ||
305 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); | |
306 | ||
0ab8f81e JH |
307 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed |
308 | in order to allow C<$1> etc. to be substituted. The example is one | |
309 | way to alias names as used in X11 fonts to the MIME names for the | |
310 | iso-8859-* family. Note the double quotes inside the single quotes. | |
5d030b67 | 311 | |
151b5d36 JH |
312 | (or, you don't have to do this yourself because this example is predefined) |
313 | ||
3ef515df JH |
314 | If you are using a regex here, you have to use the quotes as shown or |
315 | it won't work. Also note that regex handling is tricky even for the | |
151b5d36 | 316 | experienced. Use this feature with caution. |
5d030b67 JH |
317 | |
318 | =item As a code reference, e.g.: | |
319 | ||
151b5d36 | 320 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); |
5d030b67 | 321 | |
151b5d36 JH |
322 | The same effect as the example above in a different way. The coderef |
323 | takes the alias name as an argument and returns a canonical name on | |
324 | success or undef if not. Note the second argument is not required. | |
325 | Use this with even more caution than the regex version. | |
5d030b67 | 326 | |
5129552c JH |
327 | =back |
328 | ||
151b5d36 JH |
329 | =head3 Changes in code reference aliasing |
330 | ||
331 | As of Encode 1.87, the older form | |
332 | ||
333 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); | |
334 | ||
335 | no longer works. | |
336 | ||
337 | Encode up to 1.86 internally used "local $_" to implement ths older | |
338 | form. But consider the code below; | |
339 | ||
340 | use Encode; | |
341 | $_ = "eeeee" ; | |
342 | while (/(e)/g) { | |
343 | my $utf = decode('aliased-encoding-name', $1); | |
344 | print "position:",pos,"\n"; | |
345 | } | |
346 | ||
347 | Prior to Encode 1.86 this fails because of "local $_". | |
348 | ||
0ab8f81e | 349 | =head2 Alias overloading |
5d030b67 | 350 | |
3ef515df | 351 | You can override predefined aliases by simply applying define_alias(). |
3c4b39be | 352 | The new alias is always evaluated first, and when necessary, |
0ab8f81e JH |
353 | define_alias() flushes the internal cache to make the new definition |
354 | available. | |
5d030b67 | 355 | |
0ab8f81e | 356 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a |
5d030b67 JH |
357 | # superset of SHIFT_JIS |
358 | ||
5129552c JH |
359 | define_alias( qr/shift.*jis$/i => '"cp932"' ); |
360 | define_alias( qr/sjis$/i => '"cp932"' ); | |
361 | ||
0ab8f81e | 362 | If you want to zap all predefined aliases, you can use |
5129552c JH |
363 | |
364 | Encode::Alias->undef_aliases; | |
365 | ||
366 | to do so. And | |
367 | ||
368 | Encode::Alias->init_aliases; | |
369 | ||
0ab8f81e | 370 | gets the factory settings back. |
5d030b67 JH |
371 | |
372 | =head1 SEE ALSO | |
373 | ||
374 | L<Encode>, L<Encode::Supported> | |
375 | ||
5129552c | 376 | =cut |
5d030b67 | 377 |