| 1 | # |
| 2 | # $Id: GSM0338.pm,v 2.1 2008/05/07 20:56:05 dankogai Exp $ |
| 3 | # |
| 4 | package Encode::GSM0338; |
| 5 | |
| 6 | use strict; |
| 7 | use warnings; |
| 8 | use Carp; |
| 9 | |
| 10 | use vars qw($VERSION); |
| 11 | $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
| 12 | |
| 13 | use Encode qw(:fallbacks); |
| 14 | |
| 15 | use base qw(Encode::Encoding); |
| 16 | __PACKAGE__->Define('gsm0338'); |
| 17 | |
| 18 | sub needs_lines { 1 } |
| 19 | sub perlio_ok { 0 } |
| 20 | |
| 21 | use utf8; |
| 22 | our %UNI2GSM = ( |
| 23 | "\x{0040}" => "\x00", # COMMERCIAL AT |
| 24 | "\x{000A}" => "\x0A", # LINE FEED |
| 25 | "\x{000C}" => "\x1B\x0A", # FORM FEED |
| 26 | "\x{000D}" => "\x0D", # CARRIAGE RETURN |
| 27 | "\x{0020}" => "\x20", # SPACE |
| 28 | "\x{0021}" => "\x21", # EXCLAMATION MARK |
| 29 | "\x{0022}" => "\x22", # QUOTATION MARK |
| 30 | "\x{0023}" => "\x23", # NUMBER SIGN |
| 31 | "\x{0024}" => "\x02", # DOLLAR SIGN |
| 32 | "\x{0025}" => "\x25", # PERCENT SIGN |
| 33 | "\x{0026}" => "\x26", # AMPERSAND |
| 34 | "\x{0027}" => "\x27", # APOSTROPHE |
| 35 | "\x{0028}" => "\x28", # LEFT PARENTHESIS |
| 36 | "\x{0029}" => "\x29", # RIGHT PARENTHESIS |
| 37 | "\x{002A}" => "\x2A", # ASTERISK |
| 38 | "\x{002B}" => "\x2B", # PLUS SIGN |
| 39 | "\x{002C}" => "\x2C", # COMMA |
| 40 | "\x{002D}" => "\x2D", # HYPHEN-MINUS |
| 41 | "\x{002E}" => "\x2E", # FULL STOP |
| 42 | "\x{002F}" => "\x2F", # SOLIDUS |
| 43 | "\x{0030}" => "\x30", # DIGIT ZERO |
| 44 | "\x{0031}" => "\x31", # DIGIT ONE |
| 45 | "\x{0032}" => "\x32", # DIGIT TWO |
| 46 | "\x{0033}" => "\x33", # DIGIT THREE |
| 47 | "\x{0034}" => "\x34", # DIGIT FOUR |
| 48 | "\x{0035}" => "\x35", # DIGIT FIVE |
| 49 | "\x{0036}" => "\x36", # DIGIT SIX |
| 50 | "\x{0037}" => "\x37", # DIGIT SEVEN |
| 51 | "\x{0038}" => "\x38", # DIGIT EIGHT |
| 52 | "\x{0039}" => "\x39", # DIGIT NINE |
| 53 | "\x{003A}" => "\x3A", # COLON |
| 54 | "\x{003B}" => "\x3B", # SEMICOLON |
| 55 | "\x{003C}" => "\x3C", # LESS-THAN SIGN |
| 56 | "\x{003D}" => "\x3D", # EQUALS SIGN |
| 57 | "\x{003E}" => "\x3E", # GREATER-THAN SIGN |
| 58 | "\x{003F}" => "\x3F", # QUESTION MARK |
| 59 | "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A |
| 60 | "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B |
| 61 | "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C |
| 62 | "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D |
| 63 | "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E |
| 64 | "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F |
| 65 | "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G |
| 66 | "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H |
| 67 | "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I |
| 68 | "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J |
| 69 | "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K |
| 70 | "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L |
| 71 | "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M |
| 72 | "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N |
| 73 | "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O |
| 74 | "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P |
| 75 | "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q |
| 76 | "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R |
| 77 | "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S |
| 78 | "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T |
| 79 | "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U |
| 80 | "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V |
| 81 | "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W |
| 82 | "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X |
| 83 | "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y |
| 84 | "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z |
| 85 | "\x{005F}" => "\x11", # LOW LINE |
| 86 | "\x{0061}" => "\x61", # LATIN SMALL LETTER A |
| 87 | "\x{0062}" => "\x62", # LATIN SMALL LETTER B |
| 88 | "\x{0063}" => "\x63", # LATIN SMALL LETTER C |
| 89 | "\x{0064}" => "\x64", # LATIN SMALL LETTER D |
| 90 | "\x{0065}" => "\x65", # LATIN SMALL LETTER E |
| 91 | "\x{0066}" => "\x66", # LATIN SMALL LETTER F |
| 92 | "\x{0067}" => "\x67", # LATIN SMALL LETTER G |
| 93 | "\x{0068}" => "\x68", # LATIN SMALL LETTER H |
| 94 | "\x{0069}" => "\x69", # LATIN SMALL LETTER I |
| 95 | "\x{006A}" => "\x6A", # LATIN SMALL LETTER J |
| 96 | "\x{006B}" => "\x6B", # LATIN SMALL LETTER K |
| 97 | "\x{006C}" => "\x6C", # LATIN SMALL LETTER L |
| 98 | "\x{006D}" => "\x6D", # LATIN SMALL LETTER M |
| 99 | "\x{006E}" => "\x6E", # LATIN SMALL LETTER N |
| 100 | "\x{006F}" => "\x6F", # LATIN SMALL LETTER O |
| 101 | "\x{0070}" => "\x70", # LATIN SMALL LETTER P |
| 102 | "\x{0071}" => "\x71", # LATIN SMALL LETTER Q |
| 103 | "\x{0072}" => "\x72", # LATIN SMALL LETTER R |
| 104 | "\x{0073}" => "\x73", # LATIN SMALL LETTER S |
| 105 | "\x{0074}" => "\x74", # LATIN SMALL LETTER T |
| 106 | "\x{0075}" => "\x75", # LATIN SMALL LETTER U |
| 107 | "\x{0076}" => "\x76", # LATIN SMALL LETTER V |
| 108 | "\x{0077}" => "\x77", # LATIN SMALL LETTER W |
| 109 | "\x{0078}" => "\x78", # LATIN SMALL LETTER X |
| 110 | "\x{0079}" => "\x79", # LATIN SMALL LETTER Y |
| 111 | "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z |
| 112 | "\x{000C}" => "\x1B\x0A", # FORM FEED |
| 113 | "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET |
| 114 | "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS |
| 115 | "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET |
| 116 | "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT |
| 117 | "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET |
| 118 | "\x{007C}" => "\x1B\x40", # VERTICAL LINE |
| 119 | "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET |
| 120 | "\x{007E}" => "\x1B\x3D", # TILDE |
| 121 | "\x{00A0}" => "\x1B", # NO-BREAK SPACE |
| 122 | "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK |
| 123 | "\x{00A3}" => "\x01", # POUND SIGN |
| 124 | "\x{00A4}" => "\x24", # CURRENCY SIGN |
| 125 | "\x{00A5}" => "\x03", # YEN SIGN |
| 126 | "\x{00A7}" => "\x5F", # SECTION SIGN |
| 127 | "\x{00BF}" => "\x60", # INVERTED QUESTION MARK |
| 128 | "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS |
| 129 | "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE |
| 130 | "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE |
| 131 | "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE |
| 132 | "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE |
| 133 | "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS |
| 134 | "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE |
| 135 | "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS |
| 136 | "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S |
| 137 | "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE |
| 138 | "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS |
| 139 | "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE |
| 140 | "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE |
| 141 | "\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA |
| 142 | "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE |
| 143 | "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE |
| 144 | "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE |
| 145 | "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE |
| 146 | "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE |
| 147 | "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS |
| 148 | "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE |
| 149 | "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE |
| 150 | "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS |
| 151 | "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA |
| 152 | "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA |
| 153 | "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA |
| 154 | "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA |
| 155 | "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI |
| 156 | "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI |
| 157 | "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA |
| 158 | "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI |
| 159 | "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI |
| 160 | "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA |
| 161 | "\x{20AC}" => "\x1B\x65", # EURO SIGN |
| 162 | ); |
| 163 | our %GSM2UNI = reverse %UNI2GSM; |
| 164 | our $ESC = "\x1b"; |
| 165 | our $ATMARK = "\x40"; |
| 166 | our $FBCHAR = "\x3F"; |
| 167 | our $NBSP = "\x{00A0}"; |
| 168 | |
| 169 | #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" |
| 170 | |
| 171 | sub decode ($$;$) { |
| 172 | my ( $obj, $bytes, $chk ) = @_; |
| 173 | my $str; |
| 174 | while ( length $bytes ) { |
| 175 | my $c = substr( $bytes, 0, 1, '' ); |
| 176 | my $u; |
| 177 | if ( $c eq "\x00" ) { |
| 178 | my $c2 = substr( $bytes, 0, 1, '' ); |
| 179 | $u = |
| 180 | !length $c2 ? $ATMARK |
| 181 | : $c2 eq "\x00" ? "\x{0000}" |
| 182 | : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2} |
| 183 | : $chk |
| 184 | ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", |
| 185 | ord($c), ord($c2) ) |
| 186 | : $ATMARK . $FBCHAR; |
| 187 | |
| 188 | } |
| 189 | elsif ( $c eq $ESC ) { |
| 190 | my $c2 = substr( $bytes, 0, 1, '' ); |
| 191 | $u = |
| 192 | exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 } |
| 193 | : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2} |
| 194 | : $chk |
| 195 | ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", |
| 196 | ord($c), ord($c2) ) |
| 197 | : $NBSP . $FBCHAR; |
| 198 | } |
| 199 | else { |
| 200 | $u = |
| 201 | exists $GSM2UNI{$c} |
| 202 | ? $GSM2UNI{$c} |
| 203 | : $chk ? ref $chk eq 'CODE' |
| 204 | ? $chk->( ord $c ) |
| 205 | : croak sprintf( "\\x%02X does not map to Unicode", ord($c) ) |
| 206 | : $FBCHAR; |
| 207 | } |
| 208 | $str .= $u; |
| 209 | } |
| 210 | $_[1] = $bytes if $chk; |
| 211 | return $str; |
| 212 | } |
| 213 | |
| 214 | #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" |
| 215 | |
| 216 | sub encode($$;$) { |
| 217 | my ( $obj, $str, $chk ) = @_; |
| 218 | my $bytes; |
| 219 | while ( length $str ) { |
| 220 | my $u = substr( $str, 0, 1, '' ); |
| 221 | my $c; |
| 222 | $bytes .= |
| 223 | exists $UNI2GSM{$u} |
| 224 | ? $UNI2GSM{$u} |
| 225 | : $chk ? ref $chk eq 'CODE' |
| 226 | ? $chk->( ord($u) ) |
| 227 | : croak sprintf( "\\x{%04x} does not map to %s", |
| 228 | ord($u), $obj->name ) |
| 229 | : $FBCHAR; |
| 230 | } |
| 231 | $_[1] = $str if $chk; |
| 232 | return $bytes; |
| 233 | } |
| 234 | |
| 235 | 1; |
| 236 | __END__ |
| 237 | |
| 238 | =head1 NAME |
| 239 | |
| 240 | Encode::GSM0338 -- ESTI GSM 03.38 Encoding |
| 241 | |
| 242 | =head1 SYNOPSIS |
| 243 | |
| 244 | use Encode qw/encode decode/; |
| 245 | $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly |
| 246 | $utf8 = decode("gsm0338", $gsm0338); # ditto |
| 247 | |
| 248 | =head1 DESCRIPTION |
| 249 | |
| 250 | GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, |
| 251 | control character ranges and other parts are mapped very differently, |
| 252 | mainly to store Greek characters. There are also escape sequences |
| 253 | (starting with 0x1B) to cover e.g. the Euro sign. |
| 254 | |
| 255 | This was once handled by L<Encode::Bytes> but because of all those |
| 256 | unusual specifications, Encode 2.20 has relocated the support to |
| 257 | this module. |
| 258 | |
| 259 | =head1 NOTES |
| 260 | |
| 261 | Unlike most other encodings, the following aways croaks on error |
| 262 | for any $chk that evaluates to true. |
| 263 | |
| 264 | $gsm0338 = encode("gsm0338", $utf8 $chk); |
| 265 | $utf8 = decode("gsm0338", $gsm0338, $chk); |
| 266 | |
| 267 | So if you want to check the validity of the encoding, surround the |
| 268 | expression with C<eval {}> block as follows; |
| 269 | |
| 270 | eval { |
| 271 | $utf8 = decode("gsm0338", $gsm0338, $chk); |
| 272 | }; |
| 273 | if ($@){ |
| 274 | # handle exception here |
| 275 | } |
| 276 | |
| 277 | =head1 BUGS |
| 278 | |
| 279 | ESTI GSM 03.38 Encoding itself. |
| 280 | |
| 281 | Mapping \x00 to '@' causes too much pain everywhere. |
| 282 | |
| 283 | Its use of \x1b (escape) is also very questionable. |
| 284 | |
| 285 | Because of those two, the code paging approach used use in ucm-based |
| 286 | Encoding SOMETIMES fails so this module was written. |
| 287 | |
| 288 | =head1 SEE ALSO |
| 289 | |
| 290 | L<Encode> |
| 291 | |
| 292 | =cut |