This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate mainline.
[perl5.git] / t / op / utf8decode.t
CommitLineData
a9917092
JH
1#!./perl
2
# If a t/ directory exists in the current directory, move into it, then
# restrict the module search path to the single entry ../lib.
BEGIN {
    if (-d 't') {
        chdir 't';
    }
    @INC = ('../lib');
}
7
3b0e0cb6 8no utf8;
ffc61ed2 9
a9917092
JH
# Number of the next test to report; incremented after every "ok" line.
my $test = 1;

# TAP plan: the table below holds 78 testable entries.
print "1..78\n";
13
14# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
15# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
16# version dated 2000-09-02.
17
3b0e0cb6
JH
18# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
19# because e.g. many patch programs have issues with binary data.
a9917092
JH
20
21my @MK = split(/\n/, <<__EOMK__);
221 Correct UTF-8
3b0e0cb6 231.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
a9917092
JH
242 Boundary conditions
252.1 First possible sequence of certain length
3b0e0cb6
JH
262.1.1 y "\x00" 0 1 00 1
272.1.2 y "\xc2\x80" 80 2 c2:80 1
282.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
292.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
302.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
312.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
a9917092 322.2 Last possible sequence of certain length
3b0e0cb6
JH
332.2.1 y "\x7f" 7f 1 7f 1
342.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
a9917092 35# The ffff is illegal unless UTF8_ALLOW_FFFF
3b0e0cb6
JH
362.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
372.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
382.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
392.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
a9917092 402.3 Other boundary conditions
3b0e0cb6
JH
412.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
422.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
432.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
442.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
452.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
a9917092
JH
463 Malformed sequences
473.1 Unexpected continuation bytes
3b0e0cb6
JH
483.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
493.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
503.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
513.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
523.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
533.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
543.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
553.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
563.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
a9917092 573.2 Lonely start characters
3b0e0cb6
JH
583.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
593.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
603.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
613.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
623.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
a9917092 633.3 Sequences with last continuation byte missing
3b0e0cb6
JH
643.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
653.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
663.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
673.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
683.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
693.3.6 n "\xdf" - 1 df - 1 byte, need 2
703.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
713.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
723.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
733.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
a9917092 743.4 Concatenation of incomplete sequences
3b0e0cb6 753.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
a9917092 763.5 Impossible bytes
3b0e0cb6
JH
773.5.1 n "\xfe" - 1 fe - byte 0xfe
783.5.2 n "\xff" - 1 ff - byte 0xff
793.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
a9917092
JH
804 Overlong sequences
814.1 Examples of an overlong ASCII character
3b0e0cb6
JH
824.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
834.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
844.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
854.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
864.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
a9917092 874.2 Maximum overlong sequences
3b0e0cb6
JH
884.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
894.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
904.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
914.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
924.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
a9917092 934.3 Overlong representation of the NUL character
3b0e0cb6
JH
944.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
954.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
964.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
974.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
984.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
a9917092
JH
995 Illegal code positions
1005.1 Single UTF-16 surrogates
3b0e0cb6
JH
1015.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
1025.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
1035.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
1045.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
1055.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
1065.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
1075.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
a9917092 1085.2 Paired UTF-16 surrogates
3b0e0cb6
JH
1095.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
1105.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
1115.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
1125.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
1135.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
1145.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
1155.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
1165.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
a9917092 1175.3 Other illegal code positions
3b0e0cb6 1185.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
a9917092 119# The ffff is illegal unless UTF8_ALLOW_FFFF
3b0e0cb6 1205.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
a9917092
JH
121__EOMK__
122
# Tests 1..78: one test per x.y.z entry of the table above.
# Feed every x.y.z entry of @MK to unpack('U*') and print one TAP result
# per entry:
#   - 'y' entries must decode without raising any warning;
#   - 'n' entries must raise at least one warning and, when the entry
#     names an expected error text, the last warning must contain it.
{
    my $WARNCNT;    # warnings raised while decoding the current entry
    my $WARNMSG;    # text of the most recent warning
    my $id;         # id of the entry being tested, e.g. "3.1.2"

    # Echo each warning as a TAP comment and record it for the checks below.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $WARNCNT++;
        $WARNMSG = "@_";
    };

    # Print a diagnostic prefixed with the id of the current entry.
    sub moan {
        print "$id: @_";
    }

    # Clear the warning state, then decode $_[0] with unpack('U*').
    # The decoded values are never inspected; only the warnings raised
    # during decoding matter to the caller.
    sub test_unpack_U {
        $WARNCNT = 0;
        $WARNMSG = "";
        unpack('U*', $_[0]);
    }

    for (@MK) {
        if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
            # Section heading ("3 ...", "3.1 ...") or comment line:
            # nothing to test.
            # print "# $_\n";
        }
        elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
            # Captures used here: $1 id, $2 y/n verdict, $3 the byte
            # string, $5 its length in bytes, $6 the same bytes as
            # colon-separated hex, $8 optional expected warning text.
            # $4 and $7 are matched but not needed by the checks.
            $id = $1;
            my ($okay, $bytes, $byteslen, $hex, $error) = ($2, $3, $5, $6, $8);

            # Sanity-check the table entry itself: the hex dump and the
            # byte string must both agree with the stated byte length.
            my $nhex = () = split(/:/, $hex);
            unless ($nhex == $byteslen) {
                moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
            }
            {
                use bytes;    # make length() count bytes
                my $bytesbyteslen = length($bytes);
                unless ($bytesbyteslen == $byteslen) {
                    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
                }
            }

            if ($okay eq 'y') {
                test_unpack_U($bytes);
                if ($WARNCNT) {
                    moan "unpack('U*') false negative\n";
                    print "not ";
                }
            }
            elsif ($okay eq 'n') {
                test_unpack_U($bytes);
                # The expected text is matched literally (\Q..\E) so that
                # table entries can never be misread as regex syntax.
                my $wrong_message = defined $error
                                 && $error ne ''
                                 && $WARNMSG !~ /\Q$error\E/;
                if ($WARNCNT == 0 || $wrong_message) {
                    moan "unpack('U*') false positive\n";
                    print "not ";
                }
            }
            print "ok $test\n";
            $test++;
        }
        else {
            moan "unknown format\n";
        }
    }
}