Commit | Line | Data |
---|---|---|
7d59b7e4 NIS |
1 | #!./perl |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
5 | @INC = '../lib'; | |
aa700082 | 6 | require './test.pl'; require './charset_tools.pl'; |
e05e9c3d | 7 | skip_all_without_perlio(); |
7d59b7e4 NIS |
8 | } |
9 | ||
169da838 | 10 | no utf8; # needed for use utf8 not griping about the raw octets |
3ba0e062 | 11 | |
bb4e15c8 | 12 | |
88c34251 | 13 | plan(tests => 63); |
bb4e15c8 | 14 | |
7d59b7e4 | 15 | $| = 1; |
7d59b7e4 | 16 | |
62a28c97 NC |
17 | my $a_file = tempfile(); |
18 | ||
19 | open(F,"+>:utf8",$a_file); | |
7d59b7e4 | 20 | print F chr(0x100).'£'; |
0fb301d7 | 21 | cmp_ok( tell(F), '==', 4, tell(F) ); |
7d59b7e4 | 22 | print F "\n"; |
0fb301d7 | 23 | cmp_ok( tell(F), '>=', 5, tell(F) ); |
7d59b7e4 | 24 | seek(F,0,0); |
0fb301d7 NC |
25 | is( getc(F), chr(0x100) ); |
26 | is( getc(F), "£" ); | |
27 | is( getc(F), "\n" ); | |
7d59b7e4 NIS |
28 | seek(F,0,0); |
29 | binmode(F,":bytes"); | |
aa700082 KW |
30 | |
31 | # Byte representation of these characters | |
32 | my $U_100 = byte_utf8a_to_utf8n("\xc4\x80"); | |
33 | my $POUND_SIGN = byte_utf8a_to_utf8n("\xc2\xa3"); | |
34 | ||
35 | my $chr = substr($U_100, 0, 1); | |
0fb301d7 | 36 | is( getc(F), $chr ); |
aa700082 | 37 | $chr = substr($U_100, 1, 1); |
0fb301d7 | 38 | is( getc(F), $chr ); |
aa700082 | 39 | $chr = substr($POUND_SIGN, 0, 1); |
0fb301d7 | 40 | is( getc(F), $chr ); |
aa700082 | 41 | $chr = substr($POUND_SIGN, 1, 1); |
0fb301d7 NC |
42 | is( getc(F), $chr ); |
43 | is( getc(F), "\n" ); | |
7d59b7e4 NIS |
44 | seek(F,0,0); |
45 | binmode(F,":utf8"); | |
0fb301d7 | 46 | is( scalar(<F>), "\x{100}£\n" ); |
eb5c063a NIS |
47 | seek(F,0,0); |
48 | $buf = chr(0x200); | |
49 | $count = read(F,$buf,2,1); | |
0fb301d7 NC |
50 | cmp_ok( $count, '==', 2 ); |
51 | is( $buf, "\x{200}\x{100}£" ); | |
7d59b7e4 NIS |
52 | close(F); |
53 | ||
360eb788 | 54 | { |
7b89fb7c JH |
55 | $a = chr(300); # This *is* UTF-encoded |
56 | $b = chr(130); # This is not. | |
6874a2de | 57 | |
62a28c97 | 58 | open F, ">:utf8", $a_file or die $!; |
7b89fb7c JH |
59 | print F $a,"\n"; |
60 | close F; | |
6874a2de | 61 | |
62a28c97 | 62 | open F, "<:utf8", $a_file or die $!; |
7b89fb7c JH |
63 | $x = <F>; |
64 | chomp($x); | |
0fb301d7 | 65 | is( $x, chr(300) ); |
6874a2de | 66 | |
62a28c97 | 67 | open F, $a_file or die $!; # Not UTF |
7b89fb7c JH |
68 | binmode(F, ":bytes"); |
69 | $x = <F>; | |
70 | chomp($x); | |
8e90a002 | 71 | $chr = byte_utf8a_to_utf8n(chr(196).chr(172)); |
0fb301d7 | 72 | is( $x, $chr ); |
7b89fb7c | 73 | close F; |
6874a2de | 74 | |
62a28c97 | 75 | open F, ">:utf8", $a_file or die $!; |
7b89fb7c | 76 | binmode(F); # we write a "\n" and then tell() - avoid CRLF issues. |
6874a2de | 77 | binmode(F,":utf8"); # turn UTF-8-ness back on |
7b89fb7c JH |
78 | print F $a; |
79 | my $y; | |
80 | { my $x = tell(F); | |
81 | { use bytes; $y = length($a);} | |
0fb301d7 | 82 | cmp_ok( $x, '==', $y ); |
7b89fb7c | 83 | } |
6874a2de | 84 | |
7b89fb7c JH |
85 | { # Check byte length of $b |
86 | use bytes; my $y = length($b); | |
0fb301d7 | 87 | cmp_ok( $y, '==', 1 ); |
7b89fb7c | 88 | } |
6874a2de | 89 | |
ab034624 | 90 | print F $b,"\n"; # Don't upgrade $b |
6874a2de | 91 | |
7b89fb7c JH |
92 | { # Check byte length of $b |
93 | use bytes; my $y = length($b); | |
0fb301d7 | 94 | cmp_ok( $y, '==', 1 ); |
7b89fb7c | 95 | } |
6874a2de | 96 | |
7b89fb7c JH |
97 | { |
98 | my $x = tell(F); | |
aa700082 | 99 | { use bytes; if ($::IS_EBCDIC){$y += 2;}else{$y += 3;}} # EBCDIC ASCII |
0fb301d7 | 100 | cmp_ok( $x, '==', $y ); |
7b89fb7c | 101 | } |
6874a2de | 102 | |
7b89fb7c | 103 | close F; |
6874a2de | 104 | |
62a28c97 | 105 | open F, $a_file or die $!; # Not UTF |
7b89fb7c JH |
106 | binmode(F, ":bytes"); |
107 | $x = <F>; | |
108 | chomp($x); | |
109 | $chr = v196.172.194.130; | |
aa700082 | 110 | if ($::IS_EBCDIC) { $chr = v141.83.130; } # EBCDIC |
0fb301d7 | 111 | is( $x, $chr, sprintf('(%vd)', $x) ); |
6874a2de | 112 | |
62a28c97 | 113 | open F, "<:utf8", $a_file or die $!; |
7b89fb7c JH |
114 | $x = <F>; |
115 | chomp($x); | |
116 | close F; | |
0fb301d7 | 117 | is( $x, chr(300).chr(130), sprintf('(%vd)', $x) ); |
6874a2de | 118 | |
62a28c97 | 119 | open F, ">", $a_file or die $!; |
11fa0b78 | 120 | binmode(F, ":bytes:"); |
7b89fb7c JH |
121 | |
122 | # Now let's make it suffer. | |
3eb9224a JH |
123 | my $w; |
124 | { | |
125 | use warnings 'utf8'; | |
126 | local $SIG{__WARN__} = sub { $w = $_[0] }; | |
127 | print F $a; | |
0fb301d7 NC |
128 | ok( (!$@)); |
129 | like($w, qr/Wide character in print/i ); | |
3eb9224a | 130 | } |
54d2e5f1 | 131 | } |
360eb788 NIS |
132 | |
133 | # Hm. Time to get more evil. | |
62a28c97 | 134 | open F, ">:utf8", $a_file or die $!; |
360eb788 NIS |
135 | print F $a; |
136 | binmode(F, ":bytes"); | |
137 | print F chr(130)."\n"; | |
138 | close F; | |
6874a2de | 139 | |
62a28c97 | 140 | open F, "<", $a_file or die $!; |
3eb9224a | 141 | binmode(F, ":bytes"); |
360eb788 | 142 | $x = <F>; chomp $x; |
d2f5bb60 | 143 | $chr = v196.172.130; |
aa700082 | 144 | if ($::IS_EBCDIC) { $chr = v141.83.130; } # EBCDIC |
0fb301d7 | 145 | is( $x, $chr ); |
360eb788 NIS |
146 | |
147 | # Right. | |
62a28c97 | 148 | open F, ">:utf8", $a_file or die $!; |
360eb788 NIS |
149 | print F $a; |
150 | close F; | |
62a28c97 | 151 | open F, ">>", $a_file or die $!; |
ceb1aeda | 152 | binmode(F, ":bytes"); |
360eb788 NIS |
153 | print F chr(130)."\n"; |
154 | close F; | |
155 | ||
62a28c97 | 156 | open F, "<", $a_file or die $!; |
ceb1aeda | 157 | binmode(F, ":bytes"); |
360eb788 | 158 | $x = <F>; chomp $x; |
ceb1aeda NC |
159 | SKIP: { |
160 | skip("Defaulting to UTF-8 output means that we can't generate a mangled file") | |
161 | if $UTF8_OUTPUT; | |
162 | is( $x, $chr ); | |
163 | } | |
360eb788 NIS |
164 | |
165 | # Now we have a deformed file. | |
d0965105 | 166 | |
75ccb5d3 | 167 | SKIP: { |
aa700082 | 168 | if ($::IS_EBCDIC) { |
8e90a002 | 169 | skip("EBCDIC The file isn't deformed in UTF-EBCDIC", 2); |
75ccb5d3 | 170 | } else { |
0fb301d7 | 171 | my @warnings; |
62a28c97 | 172 | open F, "<:utf8", $a_file or die $!; |
75ccb5d3 | 173 | $x = <F>; chomp $x; |
0fb301d7 | 174 | local $SIG{__WARN__} = sub { push @warnings, $_[0]; }; |
75ccb5d3 | 175 | eval { sprintf "%vd\n", $x }; |
0fb301d7 NC |
176 | is (scalar @warnings, 1); |
177 | like ($warnings[0], qr/Malformed UTF-8 character \(unexpected continuation byte 0x82, with no preceding start byte/); | |
75ccb5d3 | 178 | } |
360eb788 NIS |
179 | } |
180 | ||
4f0c37ba | 181 | close F; |
62a28c97 | 182 | unlink($a_file); |
7d59b7e4 | 183 | |
62a28c97 | 184 | open F, ">:utf8", $a_file; |
d0965105 | 185 | @a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000 |
c36dfd09 | 186 | unshift @a, chr(0); # ... and a null byte in front just for fun |
d0965105 JH |
187 | print F @a; |
188 | close F; | |
c36dfd09 | 189 | |
c88828dc JH |
190 | my $c; |
191 | ||
192 | # read() should work on characters, not bytes | |
62a28c97 | 193 | open F, "<:utf8", $a_file; |
d0965105 | 194 | $a = 0; |
0fb301d7 | 195 | my $failed; |
d0965105 | 196 | for (@a) { |
c88828dc JH |
197 | unless (($c = read(F, $b, 1) == 1) && |
198 | length($b) == 1 && | |
199 | ord($b) == ord($_) && | |
200 | tell(F) == ($a += bytes::length($b))) { | |
34fce102 JH |
201 | print '# ord($_) == ', ord($_), "\n"; |
202 | print '# ord($b) == ', ord($b), "\n"; | |
203 | print '# length($b) == ', length($b), "\n"; | |
204 | print '# bytes::length($b) == ', bytes::length($b), "\n"; | |
205 | print '# tell(F) == ', tell(F), "\n"; | |
206 | print '# $a == ', $a, "\n"; | |
207 | print '# $c == ', $c, "\n"; | |
0fb301d7 | 208 | $failed++; |
d0965105 JH |
209 | last; |
210 | } | |
211 | } | |
c36dfd09 | 212 | close F; |
0fb301d7 | 213 | is($failed, undef); |
d0965105 | 214 | |
62961d2e JH |
215 | { |
216 | # Check that warnings are on on I/O, and that they can be muffled. | |
217 | ||
218 | local $SIG{__WARN__} = sub { $@ = shift }; | |
219 | ||
220 | undef $@; | |
62a28c97 | 221 | open F, ">$a_file"; |
3eb9224a | 222 | binmode(F, ":bytes"); |
62961d2e JH |
223 | print F chr(0x100); |
224 | close(F); | |
225 | ||
aaa63dae | 226 | like( $@, qr/Wide character in print/ ); |
62961d2e JH |
227 | |
228 | undef $@; | |
62a28c97 | 229 | open F, ">:utf8", $a_file; |
62961d2e JH |
230 | print F chr(0x100); |
231 | close(F); | |
232 | ||
768fd157 | 233 | isnt( defined $@, !0 ); |
62961d2e JH |
234 | |
235 | undef $@; | |
62a28c97 | 236 | open F, ">$a_file"; |
62961d2e JH |
237 | binmode(F, ":utf8"); |
238 | print F chr(0x100); | |
239 | close(F); | |
240 | ||
768fd157 | 241 | isnt( defined $@, !0 ); |
62961d2e JH |
242 | |
243 | no warnings 'utf8'; | |
244 | ||
245 | undef $@; | |
62a28c97 | 246 | open F, ">$a_file"; |
62961d2e JH |
247 | print F chr(0x100); |
248 | close(F); | |
249 | ||
768fd157 | 250 | isnt( defined $@, !0 ); |
62961d2e JH |
251 | |
252 | use warnings 'utf8'; | |
253 | ||
254 | undef $@; | |
62a28c97 | 255 | open F, ">$a_file"; |
3eb9224a | 256 | binmode(F, ":bytes"); |
62961d2e JH |
257 | print F chr(0x100); |
258 | close(F); | |
259 | ||
aaa63dae | 260 | like( $@, qr/Wide character in print/ ); |
62961d2e JH |
261 | } |
262 | ||
efd8b2ba | 263 | { |
62a28c97 | 264 | open F, ">:bytes",$a_file; print F "\xde"; close F; |
efd8b2ba | 265 | |
62a28c97 | 266 | open F, "<:bytes", $a_file; |
efd8b2ba AE |
267 | my $b = chr 0x100; |
268 | $b .= <F>; | |
0fb301d7 | 269 | is( $b, chr(0x100).chr(0xde), "21395 '.= <>' utf8 vs. bytes" ); |
efd8b2ba AE |
270 | close F; |
271 | } | |
272 | ||
273 | { | |
62a28c97 | 274 | open F, ">:utf8",$a_file; print F chr 0x100; close F; |
efd8b2ba | 275 | |
62a28c97 | 276 | open F, "<:utf8", $a_file; |
efd8b2ba AE |
277 | my $b = "\xde"; |
278 | $b .= <F>; | |
0fb301d7 | 279 | is( $b, chr(0xde).chr(0x100), "21395 '.= <>' bytes vs. utf8" ); |
efd8b2ba AE |
280 | close F; |
281 | } | |
282 | ||
b5d30a84 JH |
283 | { |
284 | my @a = ( [ 0x007F, "bytes" ], | |
285 | [ 0x0080, "bytes" ], | |
286 | [ 0x0080, "utf8" ], | |
287 | [ 0x0100, "utf8" ] ); | |
288 | my $t = 34; | |
289 | for my $u (@a) { | |
290 | for my $v (@a) { | |
291 | # print "# @$u - @$v\n"; | |
62a28c97 | 292 | open F, ">$a_file"; |
b5d30a84 JH |
293 | binmode(F, ":" . $u->[1]); |
294 | print F chr($u->[0]); | |
295 | close F; | |
296 | ||
62a28c97 | 297 | open F, "<$a_file"; |
b5d30a84 JH |
298 | binmode(F, ":" . $u->[1]); |
299 | ||
300 | my $s = chr($v->[0]); | |
301 | utf8::upgrade($s) if $v->[1] eq "utf8"; | |
302 | ||
303 | $s .= <F>; | |
0fb301d7 | 304 | is( $s, chr($v->[0]) . chr($u->[0]), 'rcatline utf8' ); |
b5d30a84 JH |
305 | close F; |
306 | $t++; | |
307 | } | |
308 | } | |
11c2f0cf | 309 | # last test here 49 |
b5d30a84 JH |
310 | } |
311 | ||
4de5f5ca JH |
312 | { |
313 | # [perl #23428] Somethings rotten in unicode semantics | |
62a28c97 | 314 | open F, ">$a_file"; |
4de5f5ca JH |
315 | binmode F, ":utf8"; |
316 | syswrite(F, $a = chr(0x100)); | |
d3b4e16f | 317 | close F; |
4de5f5ca JH |
318 | is( ord($a), 0x100, '23428 syswrite should not downgrade scalar' ); |
319 | like( $a, qr/^\w+/, '23428 syswrite should not downgrade scalar' ); | |
320 | } | |
321 | ||
554ad1fc | 322 | # sysread() and syswrite() tested in lib/open.t since Fcntl is used |
c88828dc | 323 | |
d3b4e16f | 324 | { |
532ffc5d | 325 | # <FH> on a :utf8 stream should complain immediately with -w |
d3b4e16f | 326 | # if it finds bad UTF-8 (:encoding(utf8) works this way) |
532ffc5d | 327 | use warnings 'utf8'; |
2d79bf7f | 328 | undef $@; |
d3b4e16f | 329 | local $SIG{__WARN__} = sub { $@ = shift }; |
62a28c97 | 330 | open F, ">$a_file"; |
d3b4e16f | 331 | binmode F; |
cc8040a1 | 332 | my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6)); |
aa700082 | 333 | if ($::IS_EBCDIC) # EBCDIC |
cc8040a1 TD |
334 | { ($chrE4, $chrF6) = (chr(0x43), chr(0xEC)); } |
335 | print F "foo", $chrE4, "\n"; | |
336 | print F "foo", $chrF6, "\n"; | |
d3b4e16f | 337 | close F; |
62a28c97 | 338 | open F, "<:utf8", $a_file; |
2d79bf7f | 339 | undef $@; |
d3b4e16f | 340 | my $line = <F>; |
cc8040a1 | 341 | my ($chrE4, $chrF6) = ("E4", "F6"); |
aa700082 | 342 | if ($::IS_EBCDIC) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC |
cc8040a1 | 343 | like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ <F> line 1/, |
2d79bf7f JH |
344 | "<:utf8 readline must warn about bad utf8"); |
345 | undef $@; | |
346 | $line .= <F>; | |
cc8040a1 | 347 | like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ <F> line 2/, |
2d79bf7f | 348 | "<:utf8 rcatline must warn about bad utf8"); |
d3b4e16f JH |
349 | close F; |
350 | } | |
ca93b89e TC |
351 | |
352 | { | |
353 | # fixed record reads | |
354 | open F, ">:utf8", $a_file; | |
355 | print F "foo\xE4"; | |
356 | print F "bar\xFE"; | |
a1aea1fe TC |
357 | print F "\xC0\xC8\xCC\xD2"; |
358 | print F "a\xE4ab"; | |
90f6536b | 359 | print F "a\xE4a"; |
ca93b89e TC |
360 | close F; |
361 | open F, "<:utf8", $a_file; | |
362 | local $/ = \4; | |
363 | my $line = <F>; | |
364 | is($line, "foo\xE4", "readline with \$/ = \\4"); | |
365 | $line .= <F>; | |
366 | is($line, "foo\xE4bar\xFE", "rcatline with \$/ = \\4"); | |
90f6536b | 367 | $line = <F>; |
a1aea1fe TC |
368 | is($line, "\xC0\xC8\xCC\xD2", "readline with several encoded characters"); |
369 | $line = <F>; | |
370 | is($line, "a\xE4ab", "readline with another boundary condition"); | |
371 | $line = <F>; | |
90f6536b | 372 | is($line, "a\xE4a", "readline with boundary condition"); |
ca93b89e TC |
373 | close F; |
374 | ||
375 | # badly encoded at EOF | |
376 | open F, ">:raw", $a_file; | |
377 | print F "foo\xEF\xAC"; # truncated \x{FB04} small ligature ffl | |
378 | close F; | |
379 | ||
380 | use warnings 'utf8'; | |
381 | open F, "<:utf8", $a_file; | |
382 | undef $@; | |
383 | local $SIG{__WARN__} = sub { $@ = shift }; | |
384 | $line = <F>; | |
385 | ||
386 | like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ <F> chunk 1/, | |
387 | "<:utf8 readline (fixed) must warn about bad utf8"); | |
59f4d939 | 388 | close F; |
ca93b89e | 389 | } |
88c34251 FC |
390 | |
391 | # getc should reset the utf8 flag and not be affected by previous | |
392 | # return values | |
393 | SKIP: { | |
394 | skip "no PerlIO::scalar on miniperl", 2, if is_miniperl(); | |
ef40a5dd | 395 | open my $fh, "<:raw", \($buf = chr 255); |
aa700082 | 396 | open my $uh, "<:utf8", \($uuf = $U_100); |
88c34251 FC |
397 | for([$uh,chr 256], [$fh,chr 255]) { |
398 | is getc $$_[0], $$_[1], | |
399 | 'getc returning non-utf8 after utf8'; | |
400 | } | |
401 | } |