[perl5.git] / lib / utf8.t

#!./perl 

BEGIN {
    # This script may be started from the directory above t/; when a t/
    # directory is visible, move into it so the relative paths below resolve.
    if (-d 't') {
        chdir 't';
    }

    # Search only the in-tree library directory for modules.
    @INC = ('../lib');

    # plan(), ok(), like(), runperl() etc. are not defined in this file;
    # presumably they are supplied by test.pl.
    require './test.pl';
}

# NOTE!
#
# Think carefully before adding tests here.  In general this should be
# used only for about three categories of tests:
#
# (1) tests that absolutely require 'use utf8', and since that in general
#     shouldn't be needed as the utf8 is being obsoleted, this should
#     have rather few tests.  If you want to test Unicode and regexes,
#     you probably want to go to op/regexp or op/pat; if you want to test
#     split, go to op/split; pack, op/pack; appending or joining,
#     op/append or op/join, and so forth
#
# (2) tests that have to do with Unicode tokenizing (though it's likely
#     that all the other Unicode tests sprinkled around the t/**/*.t are
#     going to catch that)
#
# (3) complicated tests that simultaneously stress so many Unicode features
#     that deciding into which other test script the tests should go to
#     is hard -- maybe consider breaking up the complicated test
#
#

# Keep in step with the tests below: 15 direct ok() calls, 16 programs run
# through runperl(), 15 of those repeated via eval (the "!" entry is
# skipped), and 12 hash-key tests for each of the 4 code points (48).
plan tests => 94;

{
    # bug id 20001009.001
    #
    # The two-byte string C3 A4 (assigned under 'use bytes') must not compare
    # equal to the one-character string "\xe4" (assigned under 'use utf8'),
    # whether or not 'use utf8' is in effect where the comparison is made.

    my ($a, $b);

    { use bytes; $a = "\xc3\xa4" }
    { use utf8;  $b = "\xe4"     }

    ok($a ne $b, 'bytes C3 A4 differ from chr 0xE4');

    { use utf8; ok($a ne $b, 'bytes C3 A4 differ from chr 0xE4 under use utf8') }
}


{
    # bug id 20000730.004
    #
    # However a string holding U+263A is produced, length(), a /(.)/g match
    # and split // must all report the same number of characters, while
    # length() under 'use bytes' reports three bytes for each character.

    my $smiley = "\x{263a}";

    # One character: the literal, the variable, and each of them with an
    # empty string concatenated on the left or on the right.
    for my $s ("\x{263a}",
	       $smiley,
		
	       "" . $smiley,
	       "" . "\x{263a}",

	       $smiley    . "",
	       "\x{263a}" . "",
	       ) {
	# Character count as seen by length().
	my $length_chars = length($s);
	# Byte count: length() evaluated under 'use bytes'.
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	# Character count as seen by the regex engine.
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	# Character count as seen by split.
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	# chars by length / by regex / by split / bytes
	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
	   "1/1/1/3");
    }

    # Two characters: built by concatenation, by a single literal or
    # interpolation, and by the repetition operator.
    for my $s ("\x{263a}" . "\x{263a}",
	       $smiley    . $smiley,

	       "\x{263a}\x{263a}",
	       "$smiley$smiley",
	       
	       "\x{263a}" x 2,
	       $smiley    x 2,
	       ) {
	# Same four measurements as in the loop above.
	my $length_chars = length($s);
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	# chars by length / by regex / by split / bytes
	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
	   "2/2/2/6");
    }
}


{
    # Compile, at run time, a double-quoted literal made of a backslash
    # followed by the character U+0100.  It must produce no warning and must
    # evaluate to just that character.
    my $warnings = 0;
    local $SIG{__WARN__} = sub {
        print "#($_[0])\n";    # show any warning as a diagnostic line
        $warnings++;
    };

    my $source = q/"\\/ . "\x{100}" . q/"/;
    my $value  = eval $source;

    ok($warnings == 0 && $value eq "\x{100}");
}

{
    use warnings;
    use strict;

    # Source of the helper sub show(), held as a string because it has two
    # uses: it is eval'd just below so that the in-process eval tests can
    # call show(), and it is printed at the start of every generated program
    # file so that the runperl tests can call it too.
    #
    # show(LIST) returns, for each argument in turn, '>' followed by the
    # comma-separated ord() of each of its characters followed by '<', all
    # concatenated; the first entry of @tests expects ">60,42<><" for the
    # two arguments chr(60) . "\x2A" and "".
    my $show = q(
                 sub show {
                   my $result;
                   $result .= '>' . join (',', map {ord} split //, $_) . '<'
                     foreach @_;
                   $result;
                 }
                 1;
                );
    eval $show or die $@; # We don't expect this sub definition to fail.
    # Scratch file that each generated program is written to; the process id
    # ($$) is appended, presumably so concurrent runs do not collide.
    my $progfile = 'utf' . $$;
    # Delete the scratch file when the script exits.  unlink_all is not
    # defined in this file -- presumably it comes from test.pl.
    END {unlink_all $progfile}

    # Code points to exercise.  60 is '<' in ASCII.  The original notes for
    # this list say 60 is ' ' in EBCDIC and that 173 is not punctuation in
    # either ASCII or EBCDIC; neither EBCDIC claim has been re-verified here.
    #
    # Each @char entry is:
    #   [0] the code point,
    #   [1] chr of it, encoded in place by utf8::encode,
    #   [2] the ord of every character of [1], comma separated,
    #   [3] Perl source rebuilding [1], of the form "chr (N) . chr (N)".
    my (@char);
    for my $codepoint (60, 173, 257, 65532) {
        my $encoded = chr $codepoint;
        utf8::encode($encoded);

        # Build [2] with s///e instead of map {ord}, so the expected value is
        # not computed the same way show() computes the actual one, and no
        # particular UTF encoding needs to be hardcoded.
        (my $ord_list = $encoded) =~ s/(.)/ord ($1) . ','/ge;
        chop $ord_list;    # drop the trailing comma

        # [3] is not something that gets tested against map {ord}.
        my @chr_calls = map {sprintf 'chr (%d)', ord $_} split //, $encoded;

        push @char,
            [$codepoint, $encoded, $ord_list, join(" . ", @chr_calls)];
    }
    # Now we've done all the UTF8 munching hopefully we're safe
    #
    # Each entry is [description, program source, regex the program's result
    # must match].  The programs call show() and leave the answer in $b: the
    # runperl loop appends '; print $b' to print it, and under eval the final
    # assignment is the value returned.  A description starting with "!"
    # marks a program that is only run in a separate interpreter (the eval
    # loop skips it).
    my @tests = (
             ['check our detection program works',
              'my @a = ("'.chr(60).'\x2A", ""); $b = show @a', qr/^>60,42<><$/],
             ['check literal 8 bit input',
              '$a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
             ['check no utf8; makes no change',
              'no utf8; $a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
             # Now we do the real byte sequences that are valid UTF8
             # For each @char entry: with no pragma and with 'no utf8' the
             # literal is expected to show as its individual bytes ($_->[2]);
             # only with 'use utf8' is it expected to show as the single
             # code point ($_->[0]).
             (map {
               ["the utf8 sequence for chr $_->[0]",
                qq(\$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
               ["no utf8; for the utf8 sequence for chr $_->[0]",
                qq(no utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
               ["use utf8; for the utf8 sequence for chr $_->[0]",
                qq(use utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[0]<$/],
              } @char),
             # Interpolation of hex characters needs to take place now, as we're
             # testing feeding malformed utf8 into perl. Bug now fixed was an
             # "out of memory" error. We really need the "" [rather than qq()
             # or q()] to get the best explosion.
             # The expected output is the "Malformed UTF-8 character" warning
             # (runperl is called with stderr => 1) followed by the program's
             # own start...end line.
             ["!Feed malformed utf8 into perl.", <<"BANG",
    use utf8; %a = ("\xE1\xA0"=>"sterling");
    print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
BANG
	      qr/^Malformed UTF-8 character \(2 bytes, need 3.+\).*start\d+,end$/s
	     ],
            );
    # Run every program in a separate interpreter: write the show() source,
    # the program and a final '; print $b' to the scratch file, execute it
    # with runperl() capturing stderr as well, and match the combined output
    # against the expected pattern.
    foreach (@tests) {
        my ($why, $prog, $expect) = @$_;
        open my $fh, '>', $progfile or die "Can't open '$progfile': $!";
        print {$fh} $show, $prog, '; print $b'
            or die "Print to '$progfile' failed: $!";
        # Close before running so the whole program is flushed to disk.
        close $fh or die "Can't close '$progfile': $!";
        # A leading "!" flags the malformed-UTF-8 program.  Strip it from the
        # (copied) description and do not echo that program's source.
        if ($why =~ s/^!//) {
            print "# Possible delay...\n";
        } else {
            print "# $prog\n";
        }
        my $result = runperl ( stderr => 1, progfile => $progfile );
        like ($result, $expect, $why);
    }
    print
        "# Again! Again! [but this time as eval, and not the explosive one]\n";
    # and now we've safely done them all as separate files, check that the
    # evals do the same thing. Hopefully doing it later successfully decouples
    # the previous tests from anything messy that may go wrong with the evals.
    #
    # Each program is compiled by string eval in this lexical scope, so it
    # runs under the enclosing 'use warnings' / 'use strict' and calls the
    # show() that was defined by the earlier 'eval $show'.
    foreach (@tests) {
        my ($why, $prog, $expect) = @$_;
        # The entries in @tests still carry their "!" prefix: the previous
        # loop only stripped it from its own copy of the description.
        next if $why =~ m/^!/; # Goes bang.
        # The value of the program's last statement (the assignment to $b)
        # is what gets matched.
        my $result = eval $prog;
        if ($@) {
            # Report the failing program and the error as diagnostics.  _qq
            # is not defined in this file -- presumably a quoting helper
            # from test.pl.
            print "# prog is $prog\n";
            print "# \$\@=", _qq($@), "\n";
        }
        like ($result, $expect, $why);
    }

    # See what the tokeniser does with hash keys.
    print "# What does the tokeniser do with utf8 hash keys?\n";
    # Twelve programs per @char entry: four ways of getting the key into %h,
    # each tried with no pragma, with 'no utf8' and with 'use utf8'.
    #
    # Every program stores the key, renders 'keys %h' with show(), and then
    # looks the key up again through an independently built expression,
    # appending 'F' when the lookup fails so that the result no longer
    # matches the expected pattern.  Without 'use utf8' the key is expected
    # to be the raw byte sequence (shown as $_->[2], looked up via the chr()
    # concatenation in $_->[3]); with 'use utf8' it is expected to be the
    # single character chr $_->[0].
    @tests = (map {
        # This is the control - I don't expect it to fail
        ["assign utf8 for chr $_->[0] to a hash",
         qq(my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["no utf8; assign utf8 for chr $_->[0] to a hash",
         qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["use utf8; assign utf8 for chr $_->[0] to a hash",
         qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
         qr/^>$_->[0]<$/],
        # Now check literal $h{"x"} constructions.
        ["\$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
         qq(my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["no utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
         qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["use utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
         qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
            my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
         qr/^>$_->[0]<$/],
        # Now check "x" => constructions.
        ["assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
         qq(my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["no utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
         qq(no utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["use utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
         qq(use utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
            my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
         qr/^>$_->[0]<$/],
        # Check copies of hashes made from literal utf8 keys
        ["assign utf8 for chr $_->[0] to a hash, then copy it",
         qq(my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["no utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
         qq(no utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1;; my %h = %i;
            my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
         qr/^>$_->[2]<$/],
        ["use utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
         qq(use utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
            my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
         qr/^>$_->[0]<$/],
     } @char);
    # These programs are only run by string eval in this scope (not through
    # runperl), and each one ends with \$b so that its value is the result.
    # Unlike the earlier eval loop, $@ is not reported here.
    foreach (@tests) {
        my ($why, $prog, $expect) = @$_;
        # print "# $prog\n";
        my $result = eval $prog;
        like ($result, $expect, $why);
    }
}
Commit	Line	Data
f96ec2a2 GS	1	#!./perl
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
20822f61	5	@INC = '../lib';
8ae6c9f9	6	require './test.pl';
f96ec2a2 GS	7	}
f96ec2a2 GS	8
4765795a JH	9	# NOTE!
	10	#
	11	# Think carefully before adding tests here. In general this should be
	12	# used only for about three categories of tests:
	13	#
	14	# (1) tests that absolutely require 'use utf8', and since that in general
	15	# shouldn't be needed as the utf8 is being obsoleted, this should
	16	# have rather few tests. If you want to test Unicode and regexes,
	17	# you probably want to go to op/regexp or op/pat; if you want to test
	18	# split, go to op/split; pack, op/pack; appending or joining,
	19	# op/append or op/join, and so forth
	20	#
	21	# (2) tests that have to do with Unicode tokenizing (though it's likely
	22	# that all the other Unicode tests sprinkled around the t/*/.t are
	23	# going to catch that)
	24	#
	25	# (3) complicated tests that simultaneously stress so many Unicode features
	26	# that deciding into which other test script the tests should go to
	27	# is hard -- maybe consider breaking up the complicated test
	28	#
	29	#
	30
4c26891c	31	plan tests => 94;
31067593	32
7bbb0251	33	{
da450f52 JH	34	# bug id 20001009.001
da450f52 JH	35
89491803 SC	36	my ($a, $b);
	37
	38	{ use bytes; $a = "\xc3\xa4" }
4765795a	39	{ use utf8; $b = "\xe4" }
89491803	40
4765795a	41	my $test = 68;
31067593	42
4765795a	43	ok($a ne $b);
da450f52	44
4765795a	45	{ use utf8; ok($a ne $b) }
da450f52 JH	46	}
da450f52 JH	47
60ff4832 JH	48
	49	{
	50	# bug id 20000730.004
	51
60ff4832 JH	52	my $smiley = "\x{263a}";
60ff4832 JH	53
4765795a JH	54	for my $s ("\x{263a}",
4765795a JH	55	$smiley,
60ff4832	56
4765795a JH	57	"" . $smiley,
4765795a JH	58	"" . "\x{263a}",
60ff4832	59
4765795a JH	60	$smiley . "",
4765795a JH	61	"\x{263a}" . "",
60ff4832 JH	62	) {
	63	my $length_chars = length($s);
	64	my $length_bytes;
	65	{ use bytes; $length_bytes = length($s) }
	66	my @regex_chars = $s =~ m/(.)/g;
	67	my $regex_chars = @regex_chars;
	68	my @split_chars = split //, $s;
	69	my $split_chars = @split_chars;
4765795a JH	70	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
4765795a JH	71	"1/1/1/3");
60ff4832 JH	72	}
60ff4832 JH	73
4765795a JH	74	for my $s ("\x{263a}" . "\x{263a}",
4765795a JH	75	$smiley . $smiley,
60ff4832	76
4765795a JH	77	"\x{263a}\x{263a}",
4765795a JH	78	"$smiley$smiley",
60ff4832	79
4765795a JH	80	"\x{263a}" x 2,
4765795a JH	81	$smiley x 2,
60ff4832 JH	82	) {
	83	my $length_chars = length($s);
	84	my $length_bytes;
	85	{ use bytes; $length_bytes = length($s) }
	86	my @regex_chars = $s =~ m/(.)/g;
	87	my $regex_chars = @regex_chars;
	88	my @split_chars = split //, $s;
	89	my $split_chars = @split_chars;
4765795a JH	90	ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
4765795a JH	91	"2/2/2/6");
60ff4832 JH	92	}
60ff4832 JH	93	}
ffc61ed2	94
ffc61ed2 JH	95
ffc61ed2 JH	96	{
f9a63242 JH	97	my $w = 0;
	98	local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
	99	my $x = eval q/"\\/ . "\x{100}" . q/"/;;
	100
4765795a	101	ok($w == 0 && $x eq "\x{100}");
f9a63242 JH	102	}
f9a63242 JH	103
8ae6c9f9	104	{
435e7af6	105	use warnings;
4c26891c NC	106	use strict;
	107
	108	my $show = q(
	109	sub show {
	110	my $result;
	111	$result .= '>' . join (',', map {ord} split //, $_) . '<'
	112	foreach @_;
	113	$result;
	114	}
	115	1;
	116	);
	117	eval $show or die $@; # We don't expect this sub definition to fail.
8ae6c9f9	118	my $progfile = 'utf' . $$;
435e7af6 NC	119	END {unlink_all $progfile}
	120
	121	# If I'm right 60 is '>' in ASCII, ' ' in EBCDIC
	122	# 173 is not punctuation in either ASCII or EBCDIC
	123	my (@char);
	124	foreach (60, 173, 257, 65532) {
	125	my $char = chr $_;
	126	utf8::encode($char);
	127	# I don't want to use map {ord} and I've no need to hardcode the UTF
	128	# version
	129	my $charsubst = $char;
	130	$charsubst =~ s/(.)/ord ($1) . ','/ge;
	131	chop $charsubst;
4c26891c NC	132	# Not testing this one against map {ord}
	133	my $char_as_ord
	134	= join " . ", map {sprintf 'chr (%d)', ord $_} split //, $char;
	135	push @char, [$_, $char, $charsubst, $char_as_ord];
435e7af6	136	}
4c26891c NC	137	# Now we've done all the UTF8 munching hopefully we're safe
4c26891c NC	138	my @tests = (
435e7af6	139	['check our detection program works',
4c26891c	140	'my @a = ("'.chr(60).'\x2A", ""); $b = show @a', qr/^>60,42<><$/],
435e7af6	141	['check literal 8 bit input',
4c26891c	142	'$a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
435e7af6	143	['check no utf8; makes no change',
4c26891c	144	'no utf8; $a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
435e7af6 NC	145	# Now we do the real byte sequences that are valid UTF8
	146	(map {
	147	["the utf8 sequence for chr $_->[0]",
4c26891c	148	qq(\$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
435e7af6	149	["no utf8; for the utf8 sequence for chr $_->[0]",
4c26891c	150	qq(no utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
435e7af6	151	["use utf8; for the utf8 sequence for chr $_->[0]",
4c26891c	152	qq(use utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[0]<$/],
435e7af6 NC	153	} @char),
	154	# Interpolation of hex characters needs to take place now, as we're
	155	# testing feeding malformed utf8 into perl. Bug now fixed was an
	156	# "out of memory" error. We really need the "" [rather than qq()
	157	# or q()] to get the best explosion.
	158	["!Feed malformed utf8 into perl.", <<"BANG",
8ae6c9f9	159	use utf8; %a = ("\xE1\xA0"=>"sterling");
435e7af6	160	print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
8ae6c9f9	161	BANG
097fb8e2	162	qr/^Malformed UTF-8 character \(2 bytes, need 3.+\).*start\d+,end$/s
435e7af6	163	],
4c26891c NC	164	);
4c26891c NC	165	foreach (@tests) {
435e7af6 NC	166	my ($why, $prog, $expect) = @$_;
435e7af6 NC	167	open P, ">$progfile" or die "Can't open '$progfile': $!";
4c26891c NC	168	print P $show, $prog, '; print $b'
4c26891c NC	169	or die "Print to 'progfile' failed: $!";
435e7af6 NC	170	close P or die "Can't close '$progfile': $!";
	171	if ($why =~ s/^!//) {
	172	print "# Possible delay...\n";
	173	} else {
	174	print "# $prog\n";
	175	}
	176	my $result = runperl ( stderr => 1, progfile => $progfile );
	177	like ($result, $expect, $why);
	178	}
4c26891c NC	179	print
	180	"# Again! Again! [but this time as eval, and not the explosive one]\n";
	181	# and now we've safely done them all as separate files, check that the
	182	# evals do the same thing. Hopefully doing it later sucessfully decouples
	183	# the previous tests from anything messy that may go wrong with the evals.
	184	foreach (@tests) {
	185	my ($why, $prog, $expect) = @$_;
	186	next if $why =~ m/^!/; # Goes bang.
	187	my $result = eval $prog;
	188	if ($@) {
	189	print "# prog is $prog\n";
	190	print "# \$\@=", _qq($@), "\n";
	191	}
	192	like ($result, $expect, $why);
	193	}
	194
	195	# See what the tokeniser does with hash keys.
	196	print "# What does the tokeniser do with utf8 hash keys?\n";
	197	@tests = (map {
	198	# This is the control - I don't expect it to fail
	199	["assign utf8 for chr $_->[0] to a hash",
	200	qq(my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
	201	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	202	qr/^>$_->[2]<$/],
	203	["no utf8; assign utf8 for chr $_->[0] to a hash",
	204	qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
	205	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	206	qr/^>$_->[2]<$/],
	207	["use utf8; assign utf8 for chr $_->[0] to a hash",
	208	qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
	209	my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
	210	qr/^>$_->[0]<$/],
	211	# Now check literal $h{"x"} constructions.
	212	["\$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
	213	qq(my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
	214	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	215	qr/^>$_->[2]<$/],
	216	["no utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
	217	qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
	218	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	219	qr/^>$_->[2]<$/],
	220	["use utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
	221	qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
	222	my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
	223	qr/^>$_->[0]<$/],
	224	# Now check "x" => constructions.
	225	["assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
	226	qq(my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
	227	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	228	qr/^>$_->[2]<$/],
	229	["no utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
	230	qq(no utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
	231	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	232	qr/^>$_->[2]<$/],
	233	["use utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
	234	qq(use utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
	235	my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
	236	qr/^>$_->[0]<$/],
	237	# Check copies of hashes made from literal utf8 keys
	238	["assign utf8 for chr $_->[0] to a hash, then copy it",
	239	qq(my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
	240	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
	241	qr/^>$_->[2]<$/],
	242	["no utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
243	qq(no utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1;; my %h = %i;
244	my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
245	qr/^>$_->[2]<$/],
246	["use utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
247	qq(use utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
248	my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
249	qr/^>$_->[0]<$/],
250	} @char);
251	foreach (@tests) {
252	my ($why, $prog, $expect) = @$_;
253	# print "# $prog\n";
254	my $result = eval $prog;
255	like ($result, $expect, $why);
256	}
8ae6c9f9	257	}