This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
new perldelta
[perl5.git] / cpan / Encode / lib / Encode / GSM0338.pm
CommitLineData
44b3b9c7 1#
7167e196 2# $Id: GSM0338.pm,v 2.10 2021/05/24 10:56:53 dankogai Exp $
44b3b9c7
SP
3#
4package Encode::GSM0338;
5
6use strict;
7use warnings;
8use Carp;
9
10use vars qw($VERSION);
7167e196 11$VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
44b3b9c7
SP
12
13use Encode qw(:fallbacks);
14
369b9ffe 15use parent qw(Encode::Encoding);
44b3b9c7
SP
16__PACKAGE__->Define('gsm0338');
17
44b3b9c7 18use utf8;
27ee53f9
SH
19
20# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07)
21# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22)
44b3b9c7 22our %UNI2GSM = (
44b3b9c7
SP
23 "\x{000A}" => "\x0A", # LINE FEED
24 "\x{000C}" => "\x1B\x0A", # FORM FEED
25 "\x{000D}" => "\x0D", # CARRIAGE RETURN
26 "\x{0020}" => "\x20", # SPACE
27 "\x{0021}" => "\x21", # EXCLAMATION MARK
28 "\x{0022}" => "\x22", # QUOTATION MARK
29 "\x{0023}" => "\x23", # NUMBER SIGN
30 "\x{0024}" => "\x02", # DOLLAR SIGN
31 "\x{0025}" => "\x25", # PERCENT SIGN
32 "\x{0026}" => "\x26", # AMPERSAND
33 "\x{0027}" => "\x27", # APOSTROPHE
34 "\x{0028}" => "\x28", # LEFT PARENTHESIS
35 "\x{0029}" => "\x29", # RIGHT PARENTHESIS
36 "\x{002A}" => "\x2A", # ASTERISK
37 "\x{002B}" => "\x2B", # PLUS SIGN
38 "\x{002C}" => "\x2C", # COMMA
39 "\x{002D}" => "\x2D", # HYPHEN-MINUS
40 "\x{002E}" => "\x2E", # FULL STOP
41 "\x{002F}" => "\x2F", # SOLIDUS
42 "\x{0030}" => "\x30", # DIGIT ZERO
43 "\x{0031}" => "\x31", # DIGIT ONE
44 "\x{0032}" => "\x32", # DIGIT TWO
45 "\x{0033}" => "\x33", # DIGIT THREE
46 "\x{0034}" => "\x34", # DIGIT FOUR
47 "\x{0035}" => "\x35", # DIGIT FIVE
48 "\x{0036}" => "\x36", # DIGIT SIX
49 "\x{0037}" => "\x37", # DIGIT SEVEN
50 "\x{0038}" => "\x38", # DIGIT EIGHT
51 "\x{0039}" => "\x39", # DIGIT NINE
52 "\x{003A}" => "\x3A", # COLON
53 "\x{003B}" => "\x3B", # SEMICOLON
54 "\x{003C}" => "\x3C", # LESS-THAN SIGN
55 "\x{003D}" => "\x3D", # EQUALS SIGN
56 "\x{003E}" => "\x3E", # GREATER-THAN SIGN
57 "\x{003F}" => "\x3F", # QUESTION MARK
27ee53f9 58 "\x{0040}" => "\x00", # COMMERCIAL AT
44b3b9c7
SP
59 "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A
60 "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B
61 "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C
62 "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D
63 "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E
64 "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F
65 "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G
66 "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H
67 "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I
68 "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J
69 "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K
70 "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L
71 "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M
72 "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N
73 "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O
74 "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P
75 "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q
76 "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R
77 "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S
78 "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T
79 "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U
80 "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V
81 "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W
82 "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X
83 "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y
84 "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z
27ee53f9
SH
85 "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
86 "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
87 "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
88 "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
44b3b9c7
SP
89 "\x{005F}" => "\x11", # LOW LINE
90 "\x{0061}" => "\x61", # LATIN SMALL LETTER A
91 "\x{0062}" => "\x62", # LATIN SMALL LETTER B
92 "\x{0063}" => "\x63", # LATIN SMALL LETTER C
93 "\x{0064}" => "\x64", # LATIN SMALL LETTER D
94 "\x{0065}" => "\x65", # LATIN SMALL LETTER E
95 "\x{0066}" => "\x66", # LATIN SMALL LETTER F
96 "\x{0067}" => "\x67", # LATIN SMALL LETTER G
97 "\x{0068}" => "\x68", # LATIN SMALL LETTER H
98 "\x{0069}" => "\x69", # LATIN SMALL LETTER I
99 "\x{006A}" => "\x6A", # LATIN SMALL LETTER J
100 "\x{006B}" => "\x6B", # LATIN SMALL LETTER K
101 "\x{006C}" => "\x6C", # LATIN SMALL LETTER L
102 "\x{006D}" => "\x6D", # LATIN SMALL LETTER M
103 "\x{006E}" => "\x6E", # LATIN SMALL LETTER N
104 "\x{006F}" => "\x6F", # LATIN SMALL LETTER O
105 "\x{0070}" => "\x70", # LATIN SMALL LETTER P
106 "\x{0071}" => "\x71", # LATIN SMALL LETTER Q
107 "\x{0072}" => "\x72", # LATIN SMALL LETTER R
108 "\x{0073}" => "\x73", # LATIN SMALL LETTER S
109 "\x{0074}" => "\x74", # LATIN SMALL LETTER T
110 "\x{0075}" => "\x75", # LATIN SMALL LETTER U
111 "\x{0076}" => "\x76", # LATIN SMALL LETTER V
112 "\x{0077}" => "\x77", # LATIN SMALL LETTER W
113 "\x{0078}" => "\x78", # LATIN SMALL LETTER X
114 "\x{0079}" => "\x79", # LATIN SMALL LETTER Y
115 "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z
44b3b9c7
SP
116 "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET
117 "\x{007C}" => "\x1B\x40", # VERTICAL LINE
118 "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET
119 "\x{007E}" => "\x1B\x3D", # TILDE
44b3b9c7
SP
120 "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK
121 "\x{00A3}" => "\x01", # POUND SIGN
122 "\x{00A4}" => "\x24", # CURRENCY SIGN
123 "\x{00A5}" => "\x03", # YEN SIGN
124 "\x{00A7}" => "\x5F", # SECTION SIGN
125 "\x{00BF}" => "\x60", # INVERTED QUESTION MARK
126 "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS
127 "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE
128 "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE
27ee53f9 129 "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA
44b3b9c7
SP
130 "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE
131 "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE
132 "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS
133 "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE
134 "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS
135 "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S
136 "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE
137 "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS
138 "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE
139 "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE
44b3b9c7
SP
140 "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE
141 "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE
142 "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE
143 "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE
144 "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE
145 "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS
146 "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE
147 "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE
148 "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS
149 "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA
150 "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA
151 "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA
152 "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA
153 "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI
154 "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI
155 "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA
156 "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI
157 "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI
158 "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA
159 "\x{20AC}" => "\x1B\x65", # EURO SIGN
160);
161our %GSM2UNI = reverse %UNI2GSM;
7167e196 162our $ESC = "\x1b";
44b3b9c7
SP
163
164sub decode ($$;$) {
165 my ( $obj, $bytes, $chk ) = @_;
3f60a930 166 return undef unless defined $bytes;
7167e196 167 my $str = substr( $bytes, 0, 0 ); # to propagate taintedness;
44b3b9c7 168 while ( length $bytes ) {
27ee53f9
SH
169 my $seq = '';
170 my $c;
171 do {
172 $c = substr( $bytes, 0, 1, '' );
173 $seq .= $c;
174 } while ( length $bytes and $c eq $ESC );
175 my $u =
7167e196
RS
176 exists $GSM2UNI{$seq} ? $GSM2UNI{$seq}
177 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( unpack 'C*', $seq )
178 : "\x{FFFD}";
27ee53f9 179 if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) {
7167e196
RS
180 if ( substr( $seq, 0, 1 ) eq $ESC
181 and ( $chk & Encode::STOP_AT_PARTIAL ) )
182 {
0aee9517
DK
183 $bytes .= $seq;
184 last;
185 }
7167e196
RS
186 croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
187 . ' does not map to Unicode'
188 if $chk & Encode::DIE_ON_ERR;
189 carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
190 . ' does not map to Unicode'
191 if $chk & Encode::WARN_ON_ERR;
192 if ( $chk & Encode::RETURN_ON_ERR ) {
27ee53f9
SH
193 $bytes .= $seq;
194 last;
195 }
44b3b9c7
SP
196 }
197 $str .= $u;
198 }
7167e196 199 $_[1] = $bytes if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
44b3b9c7
SP
200 return $str;
201}
202
44b3b9c7
SP
203sub encode($$;$) {
204 my ( $obj, $str, $chk ) = @_;
3f60a930 205 return undef unless defined $str;
7167e196 206 my $bytes = substr( $str, 0, 0 ); # to propagate taintedness
44b3b9c7
SP
207 while ( length $str ) {
208 my $u = substr( $str, 0, 1, '' );
209 my $c;
27ee53f9 210 my $seq =
7167e196
RS
211 exists $UNI2GSM{$u} ? $UNI2GSM{$u}
212 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( ord($u) )
213 : $UNI2GSM{'?'};
27ee53f9 214 if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) {
7167e196
RS
215 croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
216 if $chk & Encode::DIE_ON_ERR;
217 carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
218 if $chk & Encode::WARN_ON_ERR;
219 if ( $chk & Encode::RETURN_ON_ERR ) {
27ee53f9
SH
220 $str .= $u;
221 last;
222 }
223 }
224 $bytes .= $seq;
44b3b9c7 225 }
7167e196 226 $_[1] = $str if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
44b3b9c7
SP
227 return $bytes;
228}
229
2301;
231__END__
232
233=head1 NAME
234
27ee53f9 235Encode::GSM0338 -- ETSI GSM 03.38 Encoding
44b3b9c7
SP
236
237=head1 SYNOPSIS
238
27ee53f9
SH
239 use Encode qw/encode decode/;
240 $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly
241 $unicode = decode("gsm0338", $gsm0338); # ditto
44b3b9c7
SP
242
243=head1 DESCRIPTION
244
245GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
246control character ranges and other parts are mapped very differently,
247mainly to store Greek characters. There are also escape sequences
248(starting with 0x1B) to cover e.g. the Euro sign.
249
250This was once handled by L<Encode::Bytes> but because of all those
251unusual specifications, Encode 2.20 has relocated the support to
252this module.
253
27ee53f9
SH
254This module implements only I<GSM 7 bit Default Alphabet> and
255I<GSM 7 bit default alphabet extension table> according to standard
2563GPP TS 23.038 version 16. Therefore I<National Language Single Shift>
257and I<National Language Locking Shift> are not implemented nor supported.
44b3b9c7 258
27ee53f9 259=head2 Septets
44b3b9c7 260
27ee53f9
SH
261This modules operates with octets (like any other Encode module) and not
262with packed septets (unlike other GSM standards). Therefore for processing
263binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do
264conversion between octets and packed septets. For this purpose perl's C<pack>
265and C<unpack> functions may be useful:
44b3b9c7 266
27ee53f9
SH
267 $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets);
268 $unicode = decode('GSM0338', $bytes);
44b3b9c7 269
27ee53f9
SH
270 $bytes = encode('GSM0338', $unicode);
271 $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes;
272 $num_of_septets = length $bytes;
44b3b9c7 273
27ee53f9
SH
274Please note that for correct decoding of packed septets it is required to
275know number of septets packed in binary buffer as binary buffer is always
276padded with zero bits and 7 zero bits represents character C<@>. Number
277of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040.
44b3b9c7 278
27ee53f9 279=head1 BUGS
44b3b9c7 280
27ee53f9
SH
281Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly
282handled zero bytes (character C<@>). This was fixed in Encode::GSM0338
283version 2.8 (part of Encode 3.07).
44b3b9c7 284
27ee53f9 285=head1 SEE ALSO
44b3b9c7 286
27ee53f9 287L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm>
44b3b9c7 288
27ee53f9 289L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf>
44b3b9c7
SP
290
291L<Encode>
292
293=cut