Commit | Line | Data |
---|---|---|
44b3b9c7 | 1 | # |
7167e196 | 2 | # $Id: GSM0338.pm,v 2.10 2021/05/24 10:56:53 dankogai Exp $ |
44b3b9c7 SP |
3 | # |
4 | package Encode::GSM0338; | |
5 | ||
6 | use strict; | |
7 | use warnings; | |
8 | use Carp; | |
9 | ||
10 | use vars qw($VERSION); | |
7167e196 | 11 | $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
44b3b9c7 SP |
12 | |
13 | use Encode qw(:fallbacks); | |
14 | ||
369b9ffe | 15 | use parent qw(Encode::Encoding); |
44b3b9c7 SP |
16 | __PACKAGE__->Define('gsm0338'); |
17 | ||
44b3b9c7 | 18 | use utf8; |
27ee53f9 SH |
19 | |
20 | # Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07) | |
21 | # https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22) | |
44b3b9c7 | 22 | our %UNI2GSM = ( |
44b3b9c7 SP |
23 | "\x{000A}" => "\x0A", # LINE FEED |
24 | "\x{000C}" => "\x1B\x0A", # FORM FEED | |
25 | "\x{000D}" => "\x0D", # CARRIAGE RETURN | |
26 | "\x{0020}" => "\x20", # SPACE | |
27 | "\x{0021}" => "\x21", # EXCLAMATION MARK | |
28 | "\x{0022}" => "\x22", # QUOTATION MARK | |
29 | "\x{0023}" => "\x23", # NUMBER SIGN | |
30 | "\x{0024}" => "\x02", # DOLLAR SIGN | |
31 | "\x{0025}" => "\x25", # PERCENT SIGN | |
32 | "\x{0026}" => "\x26", # AMPERSAND | |
33 | "\x{0027}" => "\x27", # APOSTROPHE | |
34 | "\x{0028}" => "\x28", # LEFT PARENTHESIS | |
35 | "\x{0029}" => "\x29", # RIGHT PARENTHESIS | |
36 | "\x{002A}" => "\x2A", # ASTERISK | |
37 | "\x{002B}" => "\x2B", # PLUS SIGN | |
38 | "\x{002C}" => "\x2C", # COMMA | |
39 | "\x{002D}" => "\x2D", # HYPHEN-MINUS | |
40 | "\x{002E}" => "\x2E", # FULL STOP | |
41 | "\x{002F}" => "\x2F", # SOLIDUS | |
42 | "\x{0030}" => "\x30", # DIGIT ZERO | |
43 | "\x{0031}" => "\x31", # DIGIT ONE | |
44 | "\x{0032}" => "\x32", # DIGIT TWO | |
45 | "\x{0033}" => "\x33", # DIGIT THREE | |
46 | "\x{0034}" => "\x34", # DIGIT FOUR | |
47 | "\x{0035}" => "\x35", # DIGIT FIVE | |
48 | "\x{0036}" => "\x36", # DIGIT SIX | |
49 | "\x{0037}" => "\x37", # DIGIT SEVEN | |
50 | "\x{0038}" => "\x38", # DIGIT EIGHT | |
51 | "\x{0039}" => "\x39", # DIGIT NINE | |
52 | "\x{003A}" => "\x3A", # COLON | |
53 | "\x{003B}" => "\x3B", # SEMICOLON | |
54 | "\x{003C}" => "\x3C", # LESS-THAN SIGN | |
55 | "\x{003D}" => "\x3D", # EQUALS SIGN | |
56 | "\x{003E}" => "\x3E", # GREATER-THAN SIGN | |
57 | "\x{003F}" => "\x3F", # QUESTION MARK | |
27ee53f9 | 58 | "\x{0040}" => "\x00", # COMMERCIAL AT |
44b3b9c7 SP |
59 | "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A |
60 | "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B | |
61 | "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C | |
62 | "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D | |
63 | "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E | |
64 | "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F | |
65 | "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G | |
66 | "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H | |
67 | "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I | |
68 | "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J | |
69 | "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K | |
70 | "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L | |
71 | "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M | |
72 | "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N | |
73 | "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O | |
74 | "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P | |
75 | "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q | |
76 | "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R | |
77 | "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S | |
78 | "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T | |
79 | "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U | |
80 | "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V | |
81 | "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W | |
82 | "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X | |
83 | "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y | |
84 | "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z | |
27ee53f9 SH |
85 | "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET |
86 | "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS | |
87 | "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET | |
88 | "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT | |
44b3b9c7 SP |
89 | "\x{005F}" => "\x11", # LOW LINE |
90 | "\x{0061}" => "\x61", # LATIN SMALL LETTER A | |
91 | "\x{0062}" => "\x62", # LATIN SMALL LETTER B | |
92 | "\x{0063}" => "\x63", # LATIN SMALL LETTER C | |
93 | "\x{0064}" => "\x64", # LATIN SMALL LETTER D | |
94 | "\x{0065}" => "\x65", # LATIN SMALL LETTER E | |
95 | "\x{0066}" => "\x66", # LATIN SMALL LETTER F | |
96 | "\x{0067}" => "\x67", # LATIN SMALL LETTER G | |
97 | "\x{0068}" => "\x68", # LATIN SMALL LETTER H | |
98 | "\x{0069}" => "\x69", # LATIN SMALL LETTER I | |
99 | "\x{006A}" => "\x6A", # LATIN SMALL LETTER J | |
100 | "\x{006B}" => "\x6B", # LATIN SMALL LETTER K | |
101 | "\x{006C}" => "\x6C", # LATIN SMALL LETTER L | |
102 | "\x{006D}" => "\x6D", # LATIN SMALL LETTER M | |
103 | "\x{006E}" => "\x6E", # LATIN SMALL LETTER N | |
104 | "\x{006F}" => "\x6F", # LATIN SMALL LETTER O | |
105 | "\x{0070}" => "\x70", # LATIN SMALL LETTER P | |
106 | "\x{0071}" => "\x71", # LATIN SMALL LETTER Q | |
107 | "\x{0072}" => "\x72", # LATIN SMALL LETTER R | |
108 | "\x{0073}" => "\x73", # LATIN SMALL LETTER S | |
109 | "\x{0074}" => "\x74", # LATIN SMALL LETTER T | |
110 | "\x{0075}" => "\x75", # LATIN SMALL LETTER U | |
111 | "\x{0076}" => "\x76", # LATIN SMALL LETTER V | |
112 | "\x{0077}" => "\x77", # LATIN SMALL LETTER W | |
113 | "\x{0078}" => "\x78", # LATIN SMALL LETTER X | |
114 | "\x{0079}" => "\x79", # LATIN SMALL LETTER Y | |
115 | "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z | |
44b3b9c7 SP |
116 | "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET |
117 | "\x{007C}" => "\x1B\x40", # VERTICAL LINE | |
118 | "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET | |
119 | "\x{007E}" => "\x1B\x3D", # TILDE | |
44b3b9c7 SP |
120 | "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK |
121 | "\x{00A3}" => "\x01", # POUND SIGN | |
122 | "\x{00A4}" => "\x24", # CURRENCY SIGN | |
123 | "\x{00A5}" => "\x03", # YEN SIGN | |
124 | "\x{00A7}" => "\x5F", # SECTION SIGN | |
125 | "\x{00BF}" => "\x60", # INVERTED QUESTION MARK | |
126 | "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS | |
127 | "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE | |
128 | "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE | |
27ee53f9 | 129 | "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA |
44b3b9c7 SP |
130 | "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE |
131 | "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE | |
132 | "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS | |
133 | "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE | |
134 | "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS | |
135 | "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S | |
136 | "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE | |
137 | "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS | |
138 | "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE | |
139 | "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE | |
44b3b9c7 SP |
140 | "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE |
141 | "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE | |
142 | "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE | |
143 | "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE | |
144 | "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE | |
145 | "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS | |
146 | "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE | |
147 | "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE | |
148 | "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS | |
149 | "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA | |
150 | "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA | |
151 | "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA | |
152 | "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA | |
153 | "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI | |
154 | "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI | |
155 | "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA | |
156 | "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI | |
157 | "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI | |
158 | "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA | |
159 | "\x{20AC}" => "\x1B\x65", # EURO SIGN | |
160 | ); | |
161 | our %GSM2UNI = reverse %UNI2GSM; | |
7167e196 | 162 | our $ESC = "\x1b"; |
44b3b9c7 SP |
163 | |
164 | sub decode ($$;$) { | |
165 | my ( $obj, $bytes, $chk ) = @_; | |
3f60a930 | 166 | return undef unless defined $bytes; |
7167e196 | 167 | my $str = substr( $bytes, 0, 0 ); # to propagate taintedness; |
44b3b9c7 | 168 | while ( length $bytes ) { |
27ee53f9 SH |
169 | my $seq = ''; |
170 | my $c; | |
171 | do { | |
172 | $c = substr( $bytes, 0, 1, '' ); | |
173 | $seq .= $c; | |
174 | } while ( length $bytes and $c eq $ESC ); | |
175 | my $u = | |
7167e196 RS |
176 | exists $GSM2UNI{$seq} ? $GSM2UNI{$seq} |
177 | : ( $chk && ref $chk eq 'CODE' ) ? $chk->( unpack 'C*', $seq ) | |
178 | : "\x{FFFD}"; | |
27ee53f9 | 179 | if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) { |
7167e196 RS |
180 | if ( substr( $seq, 0, 1 ) eq $ESC |
181 | and ( $chk & Encode::STOP_AT_PARTIAL ) ) | |
182 | { | |
0aee9517 DK |
183 | $bytes .= $seq; |
184 | last; | |
185 | } | |
7167e196 RS |
186 | croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) |
187 | . ' does not map to Unicode' | |
188 | if $chk & Encode::DIE_ON_ERR; | |
189 | carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) | |
190 | . ' does not map to Unicode' | |
191 | if $chk & Encode::WARN_ON_ERR; | |
192 | if ( $chk & Encode::RETURN_ON_ERR ) { | |
27ee53f9 SH |
193 | $bytes .= $seq; |
194 | last; | |
195 | } | |
44b3b9c7 SP |
196 | } |
197 | $str .= $u; | |
198 | } | |
7167e196 | 199 | $_[1] = $bytes if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC ); |
44b3b9c7 SP |
200 | return $str; |
201 | } | |
202 | ||
44b3b9c7 SP |
203 | sub encode($$;$) { |
204 | my ( $obj, $str, $chk ) = @_; | |
3f60a930 | 205 | return undef unless defined $str; |
7167e196 | 206 | my $bytes = substr( $str, 0, 0 ); # to propagate taintedness |
44b3b9c7 SP |
207 | while ( length $str ) { |
208 | my $u = substr( $str, 0, 1, '' ); | |
209 | my $c; | |
27ee53f9 | 210 | my $seq = |
7167e196 RS |
211 | exists $UNI2GSM{$u} ? $UNI2GSM{$u} |
212 | : ( $chk && ref $chk eq 'CODE' ) ? $chk->( ord($u) ) | |
213 | : $UNI2GSM{'?'}; | |
27ee53f9 | 214 | if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) { |
7167e196 RS |
215 | croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) |
216 | if $chk & Encode::DIE_ON_ERR; | |
217 | carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) | |
218 | if $chk & Encode::WARN_ON_ERR; | |
219 | if ( $chk & Encode::RETURN_ON_ERR ) { | |
27ee53f9 SH |
220 | $str .= $u; |
221 | last; | |
222 | } | |
223 | } | |
224 | $bytes .= $seq; | |
44b3b9c7 | 225 | } |
7167e196 | 226 | $_[1] = $str if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC ); |
44b3b9c7 SP |
227 | return $bytes; |
228 | } | |
229 | ||
230 | 1; | |
231 | __END__ | |
232 | ||
233 | =head1 NAME | |
234 | ||
27ee53f9 | 235 | Encode::GSM0338 -- ETSI GSM 03.38 Encoding |
44b3b9c7 SP |
236 | |
237 | =head1 SYNOPSIS | |
238 | ||
27ee53f9 SH |
239 | use Encode qw/encode decode/; |
240 | $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly | |
241 | $unicode = decode("gsm0338", $gsm0338); # ditto | |
44b3b9c7 SP |
242 | |
243 | =head1 DESCRIPTION | |
244 | ||
245 | GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, | |
246 | control character ranges and other parts are mapped very differently, | |
247 | mainly to store Greek characters. There are also escape sequences | |
248 | (starting with 0x1B) to cover e.g. the Euro sign. | |
249 | ||
250 | This was once handled by L<Encode::Bytes> but because of all those | |
251 | unusual specifications, Encode 2.20 has relocated the support to | |
252 | this module. | |
253 | ||
27ee53f9 SH |
254 | This module implements only I<GSM 7 bit Default Alphabet> and |
255 | I<GSM 7 bit default alphabet extension table> according to standard | |
256 | 3GPP TS 23.038 version 16. Therefore I<National Language Single Shift> | |
257 | and I<National Language Locking Shift> are not implemented nor supported. | |
44b3b9c7 | 258 | |
27ee53f9 | 259 | =head2 Septets |
44b3b9c7 | 260 | |
27ee53f9 SH |
261 | This modules operates with octets (like any other Encode module) and not |
262 | with packed septets (unlike other GSM standards). Therefore for processing | |
263 | binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do | |
264 | conversion between octets and packed septets. For this purpose perl's C<pack> | |
265 | and C<unpack> functions may be useful: | |
44b3b9c7 | 266 | |
27ee53f9 SH |
267 | $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets); |
268 | $unicode = decode('GSM0338', $bytes); | |
44b3b9c7 | 269 | |
27ee53f9 SH |
270 | $bytes = encode('GSM0338', $unicode); |
271 | $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes; | |
272 | $num_of_septets = length $bytes; | |
44b3b9c7 | 273 | |
27ee53f9 SH |
274 | Please note that for correct decoding of packed septets it is required to |
275 | know number of septets packed in binary buffer as binary buffer is always | |
276 | padded with zero bits and 7 zero bits represents character C<@>. Number | |
277 | of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040. | |
44b3b9c7 | 278 | |
27ee53f9 | 279 | =head1 BUGS |
44b3b9c7 | 280 | |
27ee53f9 SH |
281 | Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly |
282 | handled zero bytes (character C<@>). This was fixed in Encode::GSM0338 | |
283 | version 2.8 (part of Encode 3.07). | |
44b3b9c7 | 284 | |
27ee53f9 | 285 | =head1 SEE ALSO |
44b3b9c7 | 286 | |
27ee53f9 | 287 | L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm> |
44b3b9c7 | 288 | |
27ee53f9 | 289 | L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf> |
44b3b9c7 SP |
290 | |
291 | L<Encode> | |
292 | ||
293 | =cut |