Commit | Line | Data |
---|---|---|
7d59b7e4 NIS |
1 | #!./perl |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
8c892c9b | 5 | require './test.pl'; |
624c42e2 | 6 | set_up_inc('../lib'); |
8c892c9b | 7 | require './charset_tools.pl'; |
7d59b7e4 | 8 | } |
624c42e2 | 9 | skip_all_without_perlio(); |
7d59b7e4 | 10 | |
169da838 | 11 | no utf8; # needed for use utf8 not griping about the raw octets |
3ba0e062 | 12 | |
bb4e15c8 | 13 | |
1ed4b776 | 14 | plan(tests => 62); |
bb4e15c8 | 15 | |
7d59b7e4 | 16 | $| = 1; |
7d59b7e4 | 17 | |
62a28c97 NC |
18 | my $a_file = tempfile(); |
19 | ||
20 | open(F,"+>:utf8",$a_file); | |
7d59b7e4 | 21 | print F chr(0x100).'£'; |
0fb301d7 | 22 | cmp_ok( tell(F), '==', 4, tell(F) ); |
7d59b7e4 | 23 | print F "\n"; |
0fb301d7 | 24 | cmp_ok( tell(F), '>=', 5, tell(F) ); |
7d59b7e4 | 25 | seek(F,0,0); |
0fb301d7 NC |
26 | is( getc(F), chr(0x100) ); |
27 | is( getc(F), "£" ); | |
28 | is( getc(F), "\n" ); | |
7d59b7e4 NIS |
29 | seek(F,0,0); |
30 | binmode(F,":bytes"); | |
aa700082 KW |
31 | |
32 | # Byte representation of these characters | |
33 | my $U_100 = byte_utf8a_to_utf8n("\xc4\x80"); | |
34 | my $POUND_SIGN = byte_utf8a_to_utf8n("\xc2\xa3"); | |
35 | ||
36 | my $chr = substr($U_100, 0, 1); | |
0fb301d7 | 37 | is( getc(F), $chr ); |
aa700082 | 38 | $chr = substr($U_100, 1, 1); |
0fb301d7 | 39 | is( getc(F), $chr ); |
aa700082 | 40 | $chr = substr($POUND_SIGN, 0, 1); |
0fb301d7 | 41 | is( getc(F), $chr ); |
aa700082 | 42 | $chr = substr($POUND_SIGN, 1, 1); |
0fb301d7 NC |
43 | is( getc(F), $chr ); |
44 | is( getc(F), "\n" ); | |
7d59b7e4 NIS |
45 | seek(F,0,0); |
46 | binmode(F,":utf8"); | |
0fb301d7 | 47 | is( scalar(<F>), "\x{100}£\n" ); |
eb5c063a NIS |
48 | seek(F,0,0); |
49 | $buf = chr(0x200); | |
50 | $count = read(F,$buf,2,1); | |
0fb301d7 NC |
51 | cmp_ok( $count, '==', 2 ); |
52 | is( $buf, "\x{200}\x{100}£" ); | |
7d59b7e4 NIS |
53 | close(F); |
54 | ||
360eb788 | 55 | { |
7b89fb7c JH |
56 | $a = chr(300); # This *is* UTF-encoded |
57 | $b = chr(130); # This is not. | |
6874a2de | 58 | |
62a28c97 | 59 | open F, ">:utf8", $a_file or die $!; |
7b89fb7c JH |
60 | print F $a,"\n"; |
61 | close F; | |
6874a2de | 62 | |
62a28c97 | 63 | open F, "<:utf8", $a_file or die $!; |
7b89fb7c JH |
64 | $x = <F>; |
65 | chomp($x); | |
0fb301d7 | 66 | is( $x, chr(300) ); |
6874a2de | 67 | |
62a28c97 | 68 | open F, $a_file or die $!; # Not UTF |
7b89fb7c JH |
69 | binmode(F, ":bytes"); |
70 | $x = <F>; | |
71 | chomp($x); | |
8e90a002 | 72 | $chr = byte_utf8a_to_utf8n(chr(196).chr(172)); |
0fb301d7 | 73 | is( $x, $chr ); |
7b89fb7c | 74 | close F; |
6874a2de | 75 | |
62a28c97 | 76 | open F, ">:utf8", $a_file or die $!; |
7b89fb7c | 77 | binmode(F); # we write a "\n" and then tell() - avoid CRLF issues. |
6874a2de | 78 | binmode(F,":utf8"); # turn UTF-8-ness back on |
7b89fb7c JH |
79 | print F $a; |
80 | my $y; | |
81 | { my $x = tell(F); | |
82 | { use bytes; $y = length($a);} | |
0fb301d7 | 83 | cmp_ok( $x, '==', $y ); |
7b89fb7c | 84 | } |
6874a2de | 85 | |
7b89fb7c JH |
86 | { # Check byte length of $b |
87 | use bytes; my $y = length($b); | |
0fb301d7 | 88 | cmp_ok( $y, '==', 1 ); |
7b89fb7c | 89 | } |
6874a2de | 90 | |
ab034624 | 91 | print F $b,"\n"; # Don't upgrade $b |
6874a2de | 92 | |
7b89fb7c JH |
93 | { # Check byte length of $b |
94 | use bytes; my $y = length($b); | |
0fb301d7 | 95 | cmp_ok( $y, '==', 1 ); |
7b89fb7c | 96 | } |
6874a2de | 97 | |
7b89fb7c JH |
98 | { |
99 | my $x = tell(F); | |
aa700082 | 100 | { use bytes; if ($::IS_EBCDIC){$y += 2;}else{$y += 3;}} # EBCDIC ASCII |
0fb301d7 | 101 | cmp_ok( $x, '==', $y ); |
7b89fb7c | 102 | } |
6874a2de | 103 | |
7b89fb7c | 104 | close F; |
6874a2de | 105 | |
62a28c97 | 106 | open F, $a_file or die $!; # Not UTF |
7b89fb7c JH |
107 | binmode(F, ":bytes"); |
108 | $x = <F>; | |
109 | chomp($x); | |
110 | $chr = v196.172.194.130; | |
aa700082 | 111 | if ($::IS_EBCDIC) { $chr = v141.83.130; } # EBCDIC |
0fb301d7 | 112 | is( $x, $chr, sprintf('(%vd)', $x) ); |
6874a2de | 113 | |
62a28c97 | 114 | open F, "<:utf8", $a_file or die $!; |
7b89fb7c JH |
115 | $x = <F>; |
116 | chomp($x); | |
117 | close F; | |
0fb301d7 | 118 | is( $x, chr(300).chr(130), sprintf('(%vd)', $x) ); |
6874a2de | 119 | |
62a28c97 | 120 | open F, ">", $a_file or die $!; |
11fa0b78 | 121 | binmode(F, ":bytes:"); |
7b89fb7c JH |
122 | |
123 | # Now let's make it suffer. | |
3eb9224a JH |
124 | my $w; |
125 | { | |
126 | use warnings 'utf8'; | |
127 | local $SIG{__WARN__} = sub { $w = $_[0] }; | |
128 | print F $a; | |
0fb301d7 NC |
129 | ok( (!$@)); |
130 | like($w, qr/Wide character in print/i ); | |
3eb9224a | 131 | } |
54d2e5f1 | 132 | } |
360eb788 NIS |
133 | |
134 | # Hm. Time to get more evil. | |
62a28c97 | 135 | open F, ">:utf8", $a_file or die $!; |
360eb788 NIS |
136 | print F $a; |
137 | binmode(F, ":bytes"); | |
138 | print F chr(130)."\n"; | |
139 | close F; | |
6874a2de | 140 | |
62a28c97 | 141 | open F, "<", $a_file or die $!; |
3eb9224a | 142 | binmode(F, ":bytes"); |
360eb788 | 143 | $x = <F>; chomp $x; |
d2f5bb60 | 144 | $chr = v196.172.130; |
aa700082 | 145 | if ($::IS_EBCDIC) { $chr = v141.83.130; } # EBCDIC |
0fb301d7 | 146 | is( $x, $chr ); |
360eb788 NIS |
147 | |
148 | # Right. | |
62a28c97 | 149 | open F, ">:utf8", $a_file or die $!; |
360eb788 NIS |
150 | print F $a; |
151 | close F; | |
62a28c97 | 152 | open F, ">>", $a_file or die $!; |
ceb1aeda | 153 | binmode(F, ":bytes"); |
360eb788 NIS |
154 | print F chr(130)."\n"; |
155 | close F; | |
156 | ||
62a28c97 | 157 | open F, "<", $a_file or die $!; |
ceb1aeda | 158 | binmode(F, ":bytes"); |
360eb788 | 159 | $x = <F>; chomp $x; |
ceb1aeda NC |
160 | SKIP: { |
161 | skip("Defaulting to UTF-8 output means that we can't generate a mangled file") | |
162 | if $UTF8_OUTPUT; | |
163 | is( $x, $chr ); | |
164 | } | |
360eb788 NIS |
165 | |
166 | # Now we have a deformed file. | |
d0965105 | 167 | |
75ccb5d3 | 168 | SKIP: { |
aa700082 | 169 | if ($::IS_EBCDIC) { |
8e90a002 | 170 | skip("EBCDIC The file isn't deformed in UTF-EBCDIC", 2); |
75ccb5d3 | 171 | } else { |
0fb301d7 | 172 | my @warnings; |
62a28c97 | 173 | open F, "<:utf8", $a_file or die $!; |
75ccb5d3 | 174 | $x = <F>; chomp $x; |
0fb301d7 | 175 | local $SIG{__WARN__} = sub { push @warnings, $_[0]; }; |
75ccb5d3 | 176 | eval { sprintf "%vd\n", $x }; |
0fb301d7 | 177 | is (scalar @warnings, 1); |
7cf8d05d | 178 | like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/); |
75ccb5d3 | 179 | } |
360eb788 NIS |
180 | } |
181 | ||
4f0c37ba | 182 | close F; |
62a28c97 | 183 | unlink($a_file); |
7d59b7e4 | 184 | |
62a28c97 | 185 | open F, ">:utf8", $a_file; |
d0965105 | 186 | @a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000 |
c36dfd09 | 187 | unshift @a, chr(0); # ... and a null byte in front just for fun |
d0965105 JH |
188 | print F @a; |
189 | close F; | |
c36dfd09 | 190 | |
c88828dc JH |
191 | my $c; |
192 | ||
193 | # read() should work on characters, not bytes | |
62a28c97 | 194 | open F, "<:utf8", $a_file; |
d0965105 | 195 | $a = 0; |
0fb301d7 | 196 | my $failed; |
d0965105 | 197 | for (@a) { |
c88828dc JH |
198 | unless (($c = read(F, $b, 1) == 1) && |
199 | length($b) == 1 && | |
200 | ord($b) == ord($_) && | |
201 | tell(F) == ($a += bytes::length($b))) { | |
34fce102 JH |
202 | print '# ord($_) == ', ord($_), "\n"; |
203 | print '# ord($b) == ', ord($b), "\n"; | |
204 | print '# length($b) == ', length($b), "\n"; | |
205 | print '# bytes::length($b) == ', bytes::length($b), "\n"; | |
206 | print '# tell(F) == ', tell(F), "\n"; | |
207 | print '# $a == ', $a, "\n"; | |
208 | print '# $c == ', $c, "\n"; | |
0fb301d7 | 209 | $failed++; |
d0965105 JH |
210 | last; |
211 | } | |
212 | } | |
c36dfd09 | 213 | close F; |
0fb301d7 | 214 | is($failed, undef); |
d0965105 | 215 | |
62961d2e JH |
216 | { |
217 | # Check that warnings are on on I/O, and that they can be muffled. | |
218 | ||
219 | local $SIG{__WARN__} = sub { $@ = shift }; | |
220 | ||
221 | undef $@; | |
62a28c97 | 222 | open F, ">$a_file"; |
3eb9224a | 223 | binmode(F, ":bytes"); |
62961d2e JH |
224 | print F chr(0x100); |
225 | close(F); | |
226 | ||
aaa63dae | 227 | like( $@, qr/Wide character in print/ ); |
62961d2e JH |
228 | |
229 | undef $@; | |
62a28c97 | 230 | open F, ">:utf8", $a_file; |
62961d2e JH |
231 | print F chr(0x100); |
232 | close(F); | |
233 | ||
768fd157 | 234 | isnt( defined $@, !0 ); |
62961d2e JH |
235 | |
236 | undef $@; | |
62a28c97 | 237 | open F, ">$a_file"; |
62961d2e JH |
238 | binmode(F, ":utf8"); |
239 | print F chr(0x100); | |
240 | close(F); | |
241 | ||
768fd157 | 242 | isnt( defined $@, !0 ); |
62961d2e JH |
243 | |
244 | no warnings 'utf8'; | |
245 | ||
246 | undef $@; | |
62a28c97 | 247 | open F, ">$a_file"; |
62961d2e JH |
248 | print F chr(0x100); |
249 | close(F); | |
250 | ||
768fd157 | 251 | isnt( defined $@, !0 ); |
62961d2e JH |
252 | |
253 | use warnings 'utf8'; | |
254 | ||
255 | undef $@; | |
62a28c97 | 256 | open F, ">$a_file"; |
3eb9224a | 257 | binmode(F, ":bytes"); |
62961d2e JH |
258 | print F chr(0x100); |
259 | close(F); | |
260 | ||
aaa63dae | 261 | like( $@, qr/Wide character in print/ ); |
62961d2e JH |
262 | } |
263 | ||
efd8b2ba | 264 | { |
62a28c97 | 265 | open F, ">:bytes",$a_file; print F "\xde"; close F; |
efd8b2ba | 266 | |
62a28c97 | 267 | open F, "<:bytes", $a_file; |
efd8b2ba AE |
268 | my $b = chr 0x100; |
269 | $b .= <F>; | |
0fb301d7 | 270 | is( $b, chr(0x100).chr(0xde), "21395 '.= <>' utf8 vs. bytes" ); |
efd8b2ba AE |
271 | close F; |
272 | } | |
273 | ||
274 | { | |
62a28c97 | 275 | open F, ">:utf8",$a_file; print F chr 0x100; close F; |
efd8b2ba | 276 | |
62a28c97 | 277 | open F, "<:utf8", $a_file; |
efd8b2ba AE |
278 | my $b = "\xde"; |
279 | $b .= <F>; | |
0fb301d7 | 280 | is( $b, chr(0xde).chr(0x100), "21395 '.= <>' bytes vs. utf8" ); |
efd8b2ba AE |
281 | close F; |
282 | } | |
283 | ||
b5d30a84 JH |
284 | { |
285 | my @a = ( [ 0x007F, "bytes" ], | |
286 | [ 0x0080, "bytes" ], | |
287 | [ 0x0080, "utf8" ], | |
288 | [ 0x0100, "utf8" ] ); | |
289 | my $t = 34; | |
290 | for my $u (@a) { | |
291 | for my $v (@a) { | |
292 | # print "# @$u - @$v\n"; | |
62a28c97 | 293 | open F, ">$a_file"; |
b5d30a84 JH |
294 | binmode(F, ":" . $u->[1]); |
295 | print F chr($u->[0]); | |
296 | close F; | |
297 | ||
62a28c97 | 298 | open F, "<$a_file"; |
b5d30a84 JH |
299 | binmode(F, ":" . $u->[1]); |
300 | ||
301 | my $s = chr($v->[0]); | |
302 | utf8::upgrade($s) if $v->[1] eq "utf8"; | |
303 | ||
304 | $s .= <F>; | |
0fb301d7 | 305 | is( $s, chr($v->[0]) . chr($u->[0]), 'rcatline utf8' ); |
b5d30a84 JH |
306 | close F; |
307 | $t++; | |
308 | } | |
309 | } | |
11c2f0cf | 310 | # last test here 49 |
b5d30a84 JH |
311 | } |
312 | ||
4de5f5ca JH |
313 | { |
314 | # [perl #23428] Somethings rotten in unicode semantics | |
62a28c97 | 315 | open F, ">$a_file"; |
1ed4b776 TC |
316 | binmode F; |
317 | $a = "A"; | |
318 | utf8::upgrade($a); | |
319 | syswrite(F, $a); | |
d3b4e16f | 320 | close F; |
1ed4b776 | 321 | ok(utf8::is_utf8($a), '23428 syswrite should not downgrade scalar' ); |
4de5f5ca JH |
322 | } |
323 | ||
d3b4e16f | 324 | { |
532ffc5d | 325 | # <FH> on a :utf8 stream should complain immediately with -w |
d3b4e16f | 326 | # if it finds bad UTF-8 (:encoding(utf8) works this way) |
532ffc5d | 327 | use warnings 'utf8'; |
2d79bf7f | 328 | undef $@; |
d3b4e16f | 329 | local $SIG{__WARN__} = sub { $@ = shift }; |
62a28c97 | 330 | open F, ">$a_file"; |
d3b4e16f | 331 | binmode F; |
cc8040a1 | 332 | my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6)); |
aa700082 | 333 | if ($::IS_EBCDIC) # EBCDIC |
cc8040a1 TD |
334 | { ($chrE4, $chrF6) = (chr(0x43), chr(0xEC)); } |
335 | print F "foo", $chrE4, "\n"; | |
336 | print F "foo", $chrF6, "\n"; | |
d3b4e16f | 337 | close F; |
62a28c97 | 338 | open F, "<:utf8", $a_file; |
2d79bf7f | 339 | undef $@; |
d3b4e16f | 340 | my $line = <F>; |
cc8040a1 | 341 | my ($chrE4, $chrF6) = ("E4", "F6"); |
aa700082 | 342 | if ($::IS_EBCDIC) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC |
cc8040a1 | 343 | like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ <F> line 1/, |
2d79bf7f JH |
344 | "<:utf8 readline must warn about bad utf8"); |
345 | undef $@; | |
346 | $line .= <F>; | |
cc8040a1 | 347 | like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ <F> line 2/, |
2d79bf7f | 348 | "<:utf8 rcatline must warn about bad utf8"); |
d3b4e16f JH |
349 | close F; |
350 | } | |
ca93b89e TC |
351 | |
352 | { | |
353 | # fixed record reads | |
354 | open F, ">:utf8", $a_file; | |
355 | print F "foo\xE4"; | |
356 | print F "bar\xFE"; | |
a1aea1fe TC |
357 | print F "\xC0\xC8\xCC\xD2"; |
358 | print F "a\xE4ab"; | |
90f6536b | 359 | print F "a\xE4a"; |
ca93b89e TC |
360 | close F; |
361 | open F, "<:utf8", $a_file; | |
362 | local $/ = \4; | |
363 | my $line = <F>; | |
364 | is($line, "foo\xE4", "readline with \$/ = \\4"); | |
365 | $line .= <F>; | |
366 | is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4"); | |
90f6536b | 367 | $line = <F>; |
a1aea1fe TC |
368 | is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters"); |
369 | $line = <F>; | |
370 | is($line, "a\xE4ab", "readline with another boundary condition"); | |
371 | $line = <F>; | |
90f6536b | 372 | is($line, "a\xE4a", "readline with boundary condition"); |
ca93b89e TC |
373 | close F; |
374 | ||
375 | # badly encoded at EOF | |
376 | open F, ">:raw", $a_file; | |
377 | print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl | |
378 | close F; | |
379 | ||
380 | use warnings 'utf8'; | |
381 | open F, "<:utf8", $a_file; | |
382 | undef $@; | |
383 | local $SIG{__WARN__} = sub { $@ = shift }; | |
384 | $line = <F>; | |
385 | ||
386 | like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/, | |
387 | "<:utf8 readline (fixed) must warn about bad utf8"); | |
59f4d939 | 388 | close F; |
ca93b89e | 389 | } |
88c34251 FC |
390 | |
391 | # getc should reset the utf8 flag and not be affected by previous | |
392 | # return values | |
393 | SKIP: { | |
394 | skip "no PerlIO::scalar on miniperl", 2, if is_miniperl(); | |
ef40a5dd | 395 | open my $fh, "<:raw", \($buf = chr 255); |
aa700082 | 396 | open my $uh, "<:utf8", \($uuf = $U_100); |
88c34251 FC |
397 | for([$uh,chr 256], [$fh,chr 255]) { |
398 | is getc $$_[0], $$_[1], | |
399 | 'getc returning non-utf8 after utf8'; | |
400 | } | |
401 | } |