2 # $Id: GSM0338.pm,v 2.10 2021/05/24 10:56:53 dankogai Exp $
4 package Encode::GSM0338;
10 use vars qw($VERSION);
11 $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
13 use Encode qw(:fallbacks);
15 use parent qw(Encode::Encoding);
16 __PACKAGE__->Define('gsm0338');
20 # Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07)
21 # https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22)
23 "\x{000A}" => "\x0A", # LINE FEED
24 "\x{000C}" => "\x1B\x0A", # FORM FEED
25 "\x{000D}" => "\x0D", # CARRIAGE RETURN
26 "\x{0020}" => "\x20", # SPACE
27 "\x{0021}" => "\x21", # EXCLAMATION MARK
28 "\x{0022}" => "\x22", # QUOTATION MARK
29 "\x{0023}" => "\x23", # NUMBER SIGN
30 "\x{0024}" => "\x02", # DOLLAR SIGN
31 "\x{0025}" => "\x25", # PERCENT SIGN
32 "\x{0026}" => "\x26", # AMPERSAND
33 "\x{0027}" => "\x27", # APOSTROPHE
34 "\x{0028}" => "\x28", # LEFT PARENTHESIS
35 "\x{0029}" => "\x29", # RIGHT PARENTHESIS
36 "\x{002A}" => "\x2A", # ASTERISK
37 "\x{002B}" => "\x2B", # PLUS SIGN
38 "\x{002C}" => "\x2C", # COMMA
39 "\x{002D}" => "\x2D", # HYPHEN-MINUS
40 "\x{002E}" => "\x2E", # FULL STOP
41 "\x{002F}" => "\x2F", # SOLIDUS
42 "\x{0030}" => "\x30", # DIGIT ZERO
43 "\x{0031}" => "\x31", # DIGIT ONE
44 "\x{0032}" => "\x32", # DIGIT TWO
45 "\x{0033}" => "\x33", # DIGIT THREE
46 "\x{0034}" => "\x34", # DIGIT FOUR
47 "\x{0035}" => "\x35", # DIGIT FIVE
48 "\x{0036}" => "\x36", # DIGIT SIX
49 "\x{0037}" => "\x37", # DIGIT SEVEN
50 "\x{0038}" => "\x38", # DIGIT EIGHT
51 "\x{0039}" => "\x39", # DIGIT NINE
52 "\x{003A}" => "\x3A", # COLON
53 "\x{003B}" => "\x3B", # SEMICOLON
54 "\x{003C}" => "\x3C", # LESS-THAN SIGN
55 "\x{003D}" => "\x3D", # EQUALS SIGN
56 "\x{003E}" => "\x3E", # GREATER-THAN SIGN
57 "\x{003F}" => "\x3F", # QUESTION MARK
58 "\x{0040}" => "\x00", # COMMERCIAL AT
59 "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A
60 "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B
61 "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C
62 "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D
63 "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E
64 "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F
65 "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G
66 "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H
67 "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I
68 "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J
69 "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K
70 "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L
71 "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M
72 "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N
73 "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O
74 "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P
75 "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q
76 "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R
77 "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S
78 "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T
79 "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U
80 "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V
81 "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W
82 "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X
83 "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y
84 "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z
85 "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
86 "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
87 "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
88 "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
89 "\x{005F}" => "\x11", # LOW LINE
90 "\x{0061}" => "\x61", # LATIN SMALL LETTER A
91 "\x{0062}" => "\x62", # LATIN SMALL LETTER B
92 "\x{0063}" => "\x63", # LATIN SMALL LETTER C
93 "\x{0064}" => "\x64", # LATIN SMALL LETTER D
94 "\x{0065}" => "\x65", # LATIN SMALL LETTER E
95 "\x{0066}" => "\x66", # LATIN SMALL LETTER F
96 "\x{0067}" => "\x67", # LATIN SMALL LETTER G
97 "\x{0068}" => "\x68", # LATIN SMALL LETTER H
98 "\x{0069}" => "\x69", # LATIN SMALL LETTER I
99 "\x{006A}" => "\x6A", # LATIN SMALL LETTER J
100 "\x{006B}" => "\x6B", # LATIN SMALL LETTER K
101 "\x{006C}" => "\x6C", # LATIN SMALL LETTER L
102 "\x{006D}" => "\x6D", # LATIN SMALL LETTER M
103 "\x{006E}" => "\x6E", # LATIN SMALL LETTER N
104 "\x{006F}" => "\x6F", # LATIN SMALL LETTER O
105 "\x{0070}" => "\x70", # LATIN SMALL LETTER P
106 "\x{0071}" => "\x71", # LATIN SMALL LETTER Q
107 "\x{0072}" => "\x72", # LATIN SMALL LETTER R
108 "\x{0073}" => "\x73", # LATIN SMALL LETTER S
109 "\x{0074}" => "\x74", # LATIN SMALL LETTER T
110 "\x{0075}" => "\x75", # LATIN SMALL LETTER U
111 "\x{0076}" => "\x76", # LATIN SMALL LETTER V
112 "\x{0077}" => "\x77", # LATIN SMALL LETTER W
113 "\x{0078}" => "\x78", # LATIN SMALL LETTER X
114 "\x{0079}" => "\x79", # LATIN SMALL LETTER Y
115 "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z
116 "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET
117 "\x{007C}" => "\x1B\x40", # VERTICAL LINE
118 "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET
119 "\x{007E}" => "\x1B\x3D", # TILDE
120 "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK
121 "\x{00A3}" => "\x01", # POUND SIGN
122 "\x{00A4}" => "\x24", # CURRENCY SIGN
123 "\x{00A5}" => "\x03", # YEN SIGN
124 "\x{00A7}" => "\x5F", # SECTION SIGN
125 "\x{00BF}" => "\x60", # INVERTED QUESTION MARK
126 "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS
127 "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE
128 "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE
129 "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA
130 "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE
131 "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE
132 "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS
133 "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE
134 "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS
135 "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S
136 "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE
137 "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS
138 "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE
139 "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE
140 "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE
141 "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE
142 "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE
143 "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE
144 "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE
145 "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS
146 "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE
147 "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE
148 "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS
149 "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA
150 "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA
151 "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA
152 "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA
153 "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI
154 "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI
155 "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA
156 "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI
157 "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI
158 "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA
159 "\x{20AC}" => "\x1B\x65", # EURO SIGN
161 our %GSM2UNI = reverse %UNI2GSM;
165 my ( $obj, $bytes, $chk ) = @_;
166 return undef unless defined $bytes;
167 my $str = substr( $bytes, 0, 0 ); # to propagate taintedness;
168 while ( length $bytes ) {
172 $c = substr( $bytes, 0, 1, '' );
174 } while ( length $bytes and $c eq $ESC );
176 exists $GSM2UNI{$seq} ? $GSM2UNI{$seq}
177 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( unpack 'C*', $seq )
179 if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) {
180 if ( substr( $seq, 0, 1 ) eq $ESC
181 and ( $chk & Encode::STOP_AT_PARTIAL ) )
186 croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
187 . ' does not map to Unicode'
188 if $chk & Encode::DIE_ON_ERR;
189 carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
190 . ' does not map to Unicode'
191 if $chk & Encode::WARN_ON_ERR;
192 if ( $chk & Encode::RETURN_ON_ERR ) {
199 $_[1] = $bytes if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
204 my ( $obj, $str, $chk ) = @_;
205 return undef unless defined $str;
206 my $bytes = substr( $str, 0, 0 ); # to propagate taintedness
207 while ( length $str ) {
208 my $u = substr( $str, 0, 1, '' );
211 exists $UNI2GSM{$u} ? $UNI2GSM{$u}
212 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( ord($u) )
214 if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) {
215 croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
216 if $chk & Encode::DIE_ON_ERR;
217 carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
218 if $chk & Encode::WARN_ON_ERR;
219 if ( $chk & Encode::RETURN_ON_ERR ) {
226 $_[1] = $str if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
235 Encode::GSM0338 -- ETSI GSM 03.38 Encoding
239 use Encode qw/encode decode/;
240 $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly
241 $unicode = decode("gsm0338", $gsm0338); # ditto
245 GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
246 control character ranges and other parts are mapped very differently,
247 mainly to store Greek characters. There are also escape sequences
248 (starting with 0x1B) to cover e.g. the Euro sign.
250 This was once handled by L<Encode::Bytes> but because of all those
251 unusual specifications, Encode 2.20 has relocated the support to
254 This module implements only I<GSM 7 bit Default Alphabet> and
255 I<GSM 7 bit default alphabet extension table> according to standard
256 3GPP TS 23.038 version 16. Therefore I<National Language Single Shift>
257 and I<National Language Locking Shift> are not implemented nor supported.
261 This modules operates with octets (like any other Encode module) and not
262 with packed septets (unlike other GSM standards). Therefore for processing
263 binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do
264 conversion between octets and packed septets. For this purpose perl's C<pack>
265 and C<unpack> functions may be useful:
267 $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets);
268 $unicode = decode('GSM0338', $bytes);
270 $bytes = encode('GSM0338', $unicode);
271 $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes;
272 $num_of_septets = length $bytes;
274 Please note that for correct decoding of packed septets it is required to
275 know number of septets packed in binary buffer as binary buffer is always
276 padded with zero bits and 7 zero bits represents character C<@>. Number
277 of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040.
281 Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly
282 handled zero bytes (character C<@>). This was fixed in Encode::GSM0338
283 version 2.8 (part of Encode 3.07).
287 L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm>
289 L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf>