This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move Encode from ext/ to cpan/
[perl5.git] / cpan / Encode / lib / Encode / GSM0338.pm
CommitLineData
44b3b9c7 1#
2fd0906e 2# $Id: GSM0338.pm,v 2.1 2008/05/07 20:56:05 dankogai Exp $
44b3b9c7
SP
3#
4package Encode::GSM0338;
5
6use strict;
7use warnings;
8use Carp;
9
10use vars qw($VERSION);
0263186c 11$VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
44b3b9c7
SP
12
13use Encode qw(:fallbacks);
14
15use base qw(Encode::Encoding);
16__PACKAGE__->Define('gsm0338');
17
18sub needs_lines { 1 }
19sub perlio_ok { 0 }
20
21use utf8;
22our %UNI2GSM = (
23 "\x{0040}" => "\x00", # COMMERCIAL AT
24 "\x{000A}" => "\x0A", # LINE FEED
25 "\x{000C}" => "\x1B\x0A", # FORM FEED
26 "\x{000D}" => "\x0D", # CARRIAGE RETURN
27 "\x{0020}" => "\x20", # SPACE
28 "\x{0021}" => "\x21", # EXCLAMATION MARK
29 "\x{0022}" => "\x22", # QUOTATION MARK
30 "\x{0023}" => "\x23", # NUMBER SIGN
31 "\x{0024}" => "\x02", # DOLLAR SIGN
32 "\x{0025}" => "\x25", # PERCENT SIGN
33 "\x{0026}" => "\x26", # AMPERSAND
34 "\x{0027}" => "\x27", # APOSTROPHE
35 "\x{0028}" => "\x28", # LEFT PARENTHESIS
36 "\x{0029}" => "\x29", # RIGHT PARENTHESIS
37 "\x{002A}" => "\x2A", # ASTERISK
38 "\x{002B}" => "\x2B", # PLUS SIGN
39 "\x{002C}" => "\x2C", # COMMA
40 "\x{002D}" => "\x2D", # HYPHEN-MINUS
41 "\x{002E}" => "\x2E", # FULL STOP
42 "\x{002F}" => "\x2F", # SOLIDUS
43 "\x{0030}" => "\x30", # DIGIT ZERO
44 "\x{0031}" => "\x31", # DIGIT ONE
45 "\x{0032}" => "\x32", # DIGIT TWO
46 "\x{0033}" => "\x33", # DIGIT THREE
47 "\x{0034}" => "\x34", # DIGIT FOUR
48 "\x{0035}" => "\x35", # DIGIT FIVE
49 "\x{0036}" => "\x36", # DIGIT SIX
50 "\x{0037}" => "\x37", # DIGIT SEVEN
51 "\x{0038}" => "\x38", # DIGIT EIGHT
52 "\x{0039}" => "\x39", # DIGIT NINE
53 "\x{003A}" => "\x3A", # COLON
54 "\x{003B}" => "\x3B", # SEMICOLON
55 "\x{003C}" => "\x3C", # LESS-THAN SIGN
56 "\x{003D}" => "\x3D", # EQUALS SIGN
57 "\x{003E}" => "\x3E", # GREATER-THAN SIGN
58 "\x{003F}" => "\x3F", # QUESTION MARK
59 "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A
60 "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B
61 "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C
62 "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D
63 "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E
64 "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F
65 "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G
66 "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H
67 "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I
68 "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J
69 "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K
70 "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L
71 "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M
72 "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N
73 "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O
74 "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P
75 "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q
76 "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R
77 "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S
78 "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T
79 "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U
80 "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V
81 "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W
82 "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X
83 "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y
84 "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z
85 "\x{005F}" => "\x11", # LOW LINE
86 "\x{0061}" => "\x61", # LATIN SMALL LETTER A
87 "\x{0062}" => "\x62", # LATIN SMALL LETTER B
88 "\x{0063}" => "\x63", # LATIN SMALL LETTER C
89 "\x{0064}" => "\x64", # LATIN SMALL LETTER D
90 "\x{0065}" => "\x65", # LATIN SMALL LETTER E
91 "\x{0066}" => "\x66", # LATIN SMALL LETTER F
92 "\x{0067}" => "\x67", # LATIN SMALL LETTER G
93 "\x{0068}" => "\x68", # LATIN SMALL LETTER H
94 "\x{0069}" => "\x69", # LATIN SMALL LETTER I
95 "\x{006A}" => "\x6A", # LATIN SMALL LETTER J
96 "\x{006B}" => "\x6B", # LATIN SMALL LETTER K
97 "\x{006C}" => "\x6C", # LATIN SMALL LETTER L
98 "\x{006D}" => "\x6D", # LATIN SMALL LETTER M
99 "\x{006E}" => "\x6E", # LATIN SMALL LETTER N
100 "\x{006F}" => "\x6F", # LATIN SMALL LETTER O
101 "\x{0070}" => "\x70", # LATIN SMALL LETTER P
102 "\x{0071}" => "\x71", # LATIN SMALL LETTER Q
103 "\x{0072}" => "\x72", # LATIN SMALL LETTER R
104 "\x{0073}" => "\x73", # LATIN SMALL LETTER S
105 "\x{0074}" => "\x74", # LATIN SMALL LETTER T
106 "\x{0075}" => "\x75", # LATIN SMALL LETTER U
107 "\x{0076}" => "\x76", # LATIN SMALL LETTER V
108 "\x{0077}" => "\x77", # LATIN SMALL LETTER W
109 "\x{0078}" => "\x78", # LATIN SMALL LETTER X
110 "\x{0079}" => "\x79", # LATIN SMALL LETTER Y
111 "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z
112 "\x{000C}" => "\x1B\x0A", # FORM FEED
113 "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
114 "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
115 "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
116 "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
117 "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET
118 "\x{007C}" => "\x1B\x40", # VERTICAL LINE
119 "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET
120 "\x{007E}" => "\x1B\x3D", # TILDE
121 "\x{00A0}" => "\x1B", # NO-BREAK SPACE
122 "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK
123 "\x{00A3}" => "\x01", # POUND SIGN
124 "\x{00A4}" => "\x24", # CURRENCY SIGN
125 "\x{00A5}" => "\x03", # YEN SIGN
126 "\x{00A7}" => "\x5F", # SECTION SIGN
127 "\x{00BF}" => "\x60", # INVERTED QUESTION MARK
128 "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS
129 "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE
130 "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE
131 "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE
132 "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE
133 "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS
134 "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE
135 "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS
136 "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S
137 "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE
138 "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS
139 "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE
140 "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE
141 "\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA
142 "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE
143 "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE
144 "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE
145 "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE
146 "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE
147 "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS
148 "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE
149 "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE
150 "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS
151 "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA
152 "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA
153 "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA
154 "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA
155 "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI
156 "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI
157 "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA
158 "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI
159 "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI
160 "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA
161 "\x{20AC}" => "\x1B\x65", # EURO SIGN
162);
163our %GSM2UNI = reverse %UNI2GSM;
164our $ESC = "\x1b";
165our $ATMARK = "\x40";
166our $FBCHAR = "\x3F";
167our $NBSP = "\x{00A0}";
168
169#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
170
171sub decode ($$;$) {
172 my ( $obj, $bytes, $chk ) = @_;
173 my $str;
174 while ( length $bytes ) {
175 my $c = substr( $bytes, 0, 1, '' );
176 my $u;
177 if ( $c eq "\x00" ) {
178 my $c2 = substr( $bytes, 0, 1, '' );
179 $u =
180 !length $c2 ? $ATMARK
181 : $c2 eq "\x00" ? "\x{0000}"
182 : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2}
183 : $chk
184 ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
185 ord($c), ord($c2) )
186 : $ATMARK . $FBCHAR;
187
188 }
189 elsif ( $c eq $ESC ) {
190 my $c2 = substr( $bytes, 0, 1, '' );
191 $u =
192 exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 }
193 : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2}
194 : $chk
195 ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
196 ord($c), ord($c2) )
197 : $NBSP . $FBCHAR;
198 }
199 else {
200 $u =
0263186c
NC
201 exists $GSM2UNI{$c}
202 ? $GSM2UNI{$c}
203 : $chk ? ref $chk eq 'CODE'
204 ? $chk->( ord $c )
205 : croak sprintf( "\\x%02X does not map to Unicode", ord($c) )
44b3b9c7
SP
206 : $FBCHAR;
207 }
208 $str .= $u;
209 }
210 $_[1] = $bytes if $chk;
211 return $str;
212}
213
214#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
215
216sub encode($$;$) {
217 my ( $obj, $str, $chk ) = @_;
218 my $bytes;
219 while ( length $str ) {
220 my $u = substr( $str, 0, 1, '' );
221 my $c;
222 $bytes .=
0263186c
NC
223 exists $UNI2GSM{$u}
224 ? $UNI2GSM{$u}
225 : $chk ? ref $chk eq 'CODE'
226 ? $chk->( ord($u) )
227 : croak sprintf( "\\x{%04x} does not map to %s",
228 ord($u), $obj->name )
44b3b9c7
SP
229 : $FBCHAR;
230 }
231 $_[1] = $str if $chk;
232 return $bytes;
233}
234
2351;
236__END__
237
238=head1 NAME
239
240Encode::GSM0338 -- ESTI GSM 03.38 Encoding
241
242=head1 SYNOPSIS
243
244 use Encode qw/encode decode/;
245 $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly
246 $utf8 = decode("gsm0338", $gsm0338); # ditto
247
248=head1 DESCRIPTION
249
250GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
251control character ranges and other parts are mapped very differently,
252mainly to store Greek characters. There are also escape sequences
253(starting with 0x1B) to cover e.g. the Euro sign.
254
255This was once handled by L<Encode::Bytes> but because of all those
256unusual specifications, Encode 2.20 has relocated the support to
257this module.
258
259=head1 NOTES
260
261Unlike most other encodings, the following aways croaks on error
262for any $chk that evaluates to true.
263
264 $gsm0338 = encode("gsm0338", $utf8 $chk);
265 $utf8 = decode("gsm0338", $gsm0338, $chk);
266
267So if you want to check the validity of the encoding, surround the
268expression with C<eval {}> block as follows;
269
270 eval {
271 $utf8 = decode("gsm0338", $gsm0338, $chk);
272 };
273 if ($@){
274 # handle exception here
275 }
276
277=head1 BUGS
278
279ESTI GSM 03.38 Encoding itself.
280
281Mapping \x00 to '@' causes too much pain everywhere.
282
283Its use of \x1b (escape) is also very questionable.
284
285Because of those two, the code paging approach used use in ucm-based
286Encoding SOMETIMES fails so this module was written.
287
288=head1 SEE ALSO
289
290L<Encode>
291
292=cut