This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
In utf8decode.t, test that the hex sequences and \x escapes are equivalent.
[perl5.git] / t / op / utf8decode.t
Commit  Line  Data
a9917092
JH
1#!./perl
2
# Compile-time setup: move into t/ when that directory exists, point @INC at
# ../lib only, and load ./test.pl (presumably the source of is(), like(),
# skip_all() etc. used further down -- confirm against test.pl).
BEGIN {
    -d 't' and chdir 't';
    @INC = ('../lib');
    require './test.pl';
}
8
# Report the first byte of the internal encoding of v256 and skip the whole
# file when that byte is 140, which this test takes to mean UTF-EBCDIC.
{
    my $vstr = v256;
    use bytes;    # from here on ord() looks at the first byte, not the character
    my $lead_byte = ord $vstr;
    printf "# under use bytes ord(v256) = 0x%02x\n", $lead_byte;
    if ($lead_byte == 140) {
        skip_all('UTF-EBCDIC (not UTF-8) used here');
    }

    # 196 (0xc4) is the lead byte of U+0100 in UTF-8; any other value is
    # only reported as a diagnostic, the tests still run.
    printf "# v256 starts with 0x%02x\n", $lead_byte
        unless $lead_byte == 196;
}
20
3b0e0cb6 21no utf8;
ffc61ed2 22
a9917092
JH
23# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
24# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
daf0f78e 25# version dated 2000-09-02.
a9917092 26
3b0e0cb6
JH
27# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
28# because e.g. many patch programs have issues with binary data.
a9917092
JH
29
30my @MK = split(/\n/, <<__EOMK__);
311 Correct UTF-8
3b0e0cb6 321.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
daf0f78e 332 Boundary conditions
a9917092 342.1 First possible sequence of certain length
3b0e0cb6
JH
352.1.1 y "\x00" 0 1 00 1
362.1.2 y "\xc2\x80" 80 2 c2:80 1
372.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
382.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
392.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
402.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
a9917092 412.2 Last possible sequence of certain length
3b0e0cb6
JH
422.2.1 y "\x7f" 7f 1 7f 1
432.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
8567041c
NC
44# The ffff is legal by default since 872c91ae155f6880
452.2.3 y "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
3b0e0cb6
JH
462.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
472.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
482.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
a9917092 492.3 Other boundary conditions
3b0e0cb6
JH
502.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
512.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
522.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
532.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
542.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
a9917092
JH
553 Malformed sequences
563.1 Unexpected continuation bytes
3b0e0cb6
JH
573.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
583.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
593.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
603.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
613.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
623.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
633.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
643.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
653.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
a9917092 663.2 Lonely start characters
8567041c
NC
673.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20, immediately after start byte 0xc0
683.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20, immediately after start byte 0xe0
693.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20, immediately after start byte 0xf0
703.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20, immediately after start byte 0xf8
713.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20, immediately after start byte 0xfc
a9917092 723.3 Sequences with last continuation byte missing
3b0e0cb6
JH
733.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
743.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
753.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
763.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
773.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
783.3.6 n "\xdf" - 1 df - 1 byte, need 2
793.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
803.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
813.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
823.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
a9917092 833.4 Concatenation of incomplete sequences
8567041c 843.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
a9917092 853.5 Impossible bytes
3b0e0cb6
JH
863.5.1 n "\xfe" - 1 fe - byte 0xfe
873.5.2 n "\xff" - 1 ff - byte 0xff
883.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
a9917092
JH
894 Overlong sequences
904.1 Examples of an overlong ASCII character
3b0e0cb6
JH
914.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
924.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
934.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
944.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
954.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
a9917092 964.2 Maximum overlong sequences
3b0e0cb6
JH
974.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
984.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
994.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
1004.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
1014.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
a9917092 1024.3 Overlong representation of the NUL character
3b0e0cb6
JH
1034.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
1044.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
1054.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
1064.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
1074.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
a9917092
JH
1085 Illegal code positions
1095.1 Single UTF-16 surrogates
8567041c
NC
1105.1.1 y "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
1115.1.2 y "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
1125.1.3 y "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
1135.1.4 y "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
1145.1.5 y "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
1155.1.6 y "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
1165.1.7 y "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
a9917092 1175.2 Paired UTF-16 surrogates
8567041c
NC
1185.2.1 y "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
1195.2.2 y "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
1205.2.3 y "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
1215.2.4 y "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
1225.2.5 y "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
1235.2.6 y "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
1245.2.7 y "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
1255.2.8 y "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
a9917092 1265.3 Other illegal code positions
8567041c
NC
1275.3.1 y "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
128# The ffff is legal by default since 872c91ae155f6880
1295.3.2 y "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
a9917092
JH
130__EOMK__
131
132# 104..181
# Run every test vector in @MK through unpack('C0U*', ...) and compare the
# warnings raised with what the vector's table row says to expect.
{
    my $id;    # id of the vector being processed; read by the warn handler

    # Echo each warning as a "# id: ..." diagnostic line and append its text
    # to $@, which warn_unpack_U() uses as its collection buffer.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $@ .= "@_";
    };

    # Clear the buffer, unpack the first argument with the 'C0U*' template
    # (the unpacked values are thrown away) and return the accumulated
    # warning text -- the empty string when nothing warned.
    sub warn_unpack_U {
        $@ = '';
        my @null = unpack('C0U*', $_[0]);
        return $@;
    }

    for (@MK) {
        # Section headings ("2", "2.1") and '#' comment rows carry no test.
        next if /^(?:\d+(?:\.\d+)?)\s/ || /^#/;

        # A vector row: id, y/n flag, quoted bytes, code point (or '-'),
        # byte count, colon-separated hex dump, then an optional count and
        # expected-warning text.  A successful match yields all 8 captures.
        my @field = /^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/;
        unless (@field) {
            fail("unknown format '$_'");
            next;
        }

        $id = $field[0];
        my ($okay, $bytes, $byteslen, $hex, $experr) = @field[1, 2, 4, 5, 7];

        # The hex dump must agree with the declared length and with the
        # bytes produced by the \x escapes in the table.
        my @hex = split(/:/, $hex);
        is(scalar @hex, $byteslen, 'Amount of hex tallies with byteslen');
        my $fromhex = join('', map { chr(hex($_)) } @hex);
        is($fromhex, $bytes, 'hex matches bytes');

        # Measure the string in bytes rather than characters.
        my $octet_count = do { use bytes; length($bytes) };
        is($octet_count, $byteslen,
           'bytes length() tallies with byteslen');

        my $warn = warn_unpack_U($bytes);
        if ($okay eq 'y') {
            is($warn, '', "No warnings expected for $id");
        }
        elsif ($okay eq 'n') {
            if ($experr) {
                # The table text is used as a regex against the warning.
                like($warn, qr/$experr/, "Expected warning for $id");
            }
            else {
                isnt($warn, '', "Expect a warning for $id");
            }
        }
        else {
            # Sanity check on the table itself: the flag must be y or n.
            is($okay, 'n', "Confused test description for $id");
        }
    }
}
680218c4
NC
179
180done_testing();