[perl5.git] / t / uni / parser.t

#!./perl

# Checks if the parser behaves correctly in edge cases
# (including weird syntax errors)

BEGIN {
    chdir 't' if -d 't';
    require './test.pl';
    skip_all_without_unicode_tables();
}

plan (tests => 52);

use utf8;
use open qw( :utf8 :std );

is *tèst, "*main::tèst", "sanity check.";
ok $::{"tèst"}, "gets the right glob in the stash.";

my $glob_by_sub = sub { *ｍａｉｎ::method }->();

is *ｍａｉｎ::method, "*ｍａｉｎ::method", "glob stringy works";
is "" . *ｍａｉｎ::method, "*ｍａｉｎ::method", "glob stringify-through-concat works";
is $glob_by_sub, "*ｍａｉｎ::method", "glob stringy works";
is "" . $glob_by_sub, "*ｍａｉｎ::method", "";

sub gimme_glob {
    no strict 'refs';
    is *{$_[0]}, "*main::$_[0]";
    *{$_[0]};
}

is "" . gimme_glob("下郎"), "*main::下郎";
$a = *下郎;
is "" . $a, "*main::下郎";

*{gimme_glob("下郎")} = sub {};

{
    ok defined *{"下郎"}{CODE};
    ok !defined *{"\344\270\213\351\203\216"}{CODE};
}

$Lèon = 1;
is ${*Lèon{SCALAR}}, 1, "scalar define in the right glob,";
ok !${*{"L\303\250on"}{SCALAR}}, "..and nothing in the wrong one.";

my $a = "foo" . chr(190);
my $b = $a    . chr(256);
chop $b; # $b is $a with utf8 on

is $a, $b, '$a equals $b';

*$b = sub { 5 };

is eval { main->$a }, 5, q!$a can call $b's sub!;
ok !$@, "..and there's no error.";

my $c = $b;
utf8::encode($c);
ok $b ne $c, '$b unequal $c';
eval { main->$c };
ok $@, q!$c can't call $b's sub.!;

# Now define another sub under the downgraded name:
*$a = sub { 6 };
# Call it:
is eval { main->$a }, 6, "Adding a new sub to *a and calling it works,";
ok !$@, "..without errors.";
eval { main->$c };
ok $@, "but it's still unreachable through *c";

*$b = \10;
is ${*$a{SCALAR}}, 10;
is ${*$b{SCALAR}}, 10;
is ${*$c{SCALAR}}, undef;

opendir FÒÒ, ".";
closedir FÒÒ;
::ok($::{"FÒÒ"}, "Bareword generates the right glob.");
::ok(!$::{"F\303\222\303\222"});

sub участники { 1 }

ok $::{"участники"}, "non-const sub declarations generate the right glob";
is $::{"участники"}->(), 1;

sub 原 () { 1 }

is grep({ $_ eq "\x{539f}"     } keys %::), 1, "Constant subs generate the right glob.";
is grep({ $_ eq "\345\216\237" } keys %::), 0;

#These should probably go elsewhere.
eval q{ sub wròng1 (_$); wròng1(1,2) };
like( $@, qr/Malformed prototype for main::wròng1/, 'Malformed prototype croak is clean.' );

eval q{ sub ча::ики ($__); ча::ики(1,2) };
like( $@, qr/Malformed prototype for ча::ики/ );

our $問 = 10;
is $問, 10, "our works";
is $main::問, 10, "...as does getting the same variable through the fully qualified name";
is ${"main::\345\225\217"}, undef, "..and using the encoded form doesn't";

{
    use charnames qw( :full );

    eval qq! my \$\x{30cb} \N{DROMEDARY CAMEL} !;
    $@ =~ s/eval \d+/eval 11/;
    is $@, 'Unrecognized character \x{1f42a}; marked by <-- HERE after  my $ニ <-- HERE near column 8 at (eval 11) line 1.
', "'Unrecognized character' croak is UTF-8 clean";

    eval "q\0foobar\0 \x{FFFF}+1";
    $@ =~ s/eval \d+/eval 11/;
    is(
        $@,
       "Unrecognized character \\x{ffff}; marked by <-- HERE after q\0foobar\0 <-- HERE near column 11 at (eval 11) line 1.\n",
       "...and nul-clean"
    );

    {
        use re 'eval';
        my $f = qq{(?{\$ネ+ 1; \x{1F42A} })};
        eval { "a" =~ /^a$f/ };
        my $e = $@;
        $e =~ s/eval \d+/eval 11/;
        is(
            $e,
            "Unrecognized character \\x{1f42a}; marked by <-- HERE after (?{\$ネ+ 1; <-- HERE near column 13 at (eval 11) line 1.\n",
            "Messages from a re-eval are UTF-8 clean"
        );

        $f = qq{(?{q\0foobar\0 \x{FFFF}+1 })};
        eval { "a" =~ /^a$f/ };
        my $e = $@;
        $e =~ s/eval \d+/eval 11/;
        is(
            $e,
            "Unrecognized character \\x{ffff}; marked by <-- HERE after q\x{0}foobar\x{0} <-- HERE near column 16 at (eval 11) line 1.\n",
           "...and nul-clean"
        );
    }
    
    {
        eval qq{\$ネ+ 1; \x{1F42A}};
        $@ =~ s/eval \d+/eval 11/;
        is(
            $@,
            "Unrecognized character \\x{1f42a}; marked by <-- HERE after \$ネ+ 1; <-- HERE near column 8 at (eval 11) line 1.\n",
            "Unrecognized character error doesn't cut off in the middle of characters"
        )
    }

}

{
    use feature 'state';
    for ( qw( my state our ) ) {
        local $@;
        eval "$_ Ｆｏｏ $x = 1;";
        like $@, qr/No such class Ｆｏｏ/u, "'No such class' warning for $_ is UTF-8 clean";
    }
}

{
    local $@;
    eval "our \$main::\x{30cb};";
    like $@, qr!No package name allowed for variable \$main::\x{30cb} in "our"!, "'No such package name allowed for variable' is UTF-8 clean";
}

{
    use feature 'state';
    local $@;
    for ( qw( my state ) ) {
        eval "$_ \$::\x{30cb};";
        like $@, qr!"$_" variable \$::\x{30cb} can't be in a package!, qq!'"$_" variable %s can't be in a package' is UTF-8 clean!;
    }
}

{
    local $@;
    eval qq!print \x{30cb}, "comma""!;
    like $@, qr/No comma allowed after filehandle/, "No comma allowed after filehandle triggers correctly for UTF-8 filehandles.";
}

# tests for "Bad name"
eval q{ Ｆｏｏ::$bar };
like( $@, qr/Bad name after Ｆｏｏ::/, 'Bad name after Ｆｏｏ::' );
eval q{ Ｆｏｏ''bar };
like( $@, qr/Bad name after Ｆｏｏ'/, 'Bad name after Ｆｏｏ\'' );

{
    no warnings 'utf8';
    my $malformed_to_be = ($::IS_EBCDIC)   # Overlong sequence
                           ? "\x{74}\x{41}"
                           : "\x{c0}\x{a0}";
    CORE::evalbytes "use charnames ':full'; use utf8; my \$x = \"\\N{abc$malformed_to_be}\"";
    like( $@, qr/Malformed UTF-8 character immediately after '\\N\{abc' at .* within string/, 'Malformed UTF-8 input to \N{}');
}

# RT# 124216: Perl_sv_clear: Assertion
# If a parsing error occurred during a forced token within an interpolated
# context, the stack unwinding failed to restore PL_lex_defer and so after
# error recovery the state restored after the forced token was processed
# was the wrong one, resulting in the lexer thinking we're still inside a
# quoted string and things getting freed multiple times.
#
# The \x{3030} char isn't a legal var name, and this triggers the error.
#
# NB: this only failed if the closing quote of the interpolated string is
# the last char of the file (i.e. no trailing \n).

{
    my $bad = "\x{3030}";
    # Write out the individual utf8 bytes making up \x{3030}. This
    # avoids 'Wide char in print' warnings from test.pl. (We may still
    # get that warning when compiling the prog itself, since the
    # error it prints to stderr contains a wide char.)
    utf8::encode($bad);

    fresh_perl_like(qq{use utf8; "\$$bad"},
        qr/
            \A
            ( \QWide character in print at - line 1.\E\n )?
            \Qsyntax error at - line 1, near \E"\$.*"\n
            \QExecution of - aborted due to compilation errors.\E\z
        /xm,

        {stderr => 1}, "RT# 124216");
}
Commit	Line	Data
1a96a5b7 BF	1	#!./perl
	2
	3	# Checks if the parser behaves correctly in edge cases
	4	# (including weird syntax errors)
	5
	6	BEGIN {
b5efbd1f	7	chdir 't' if -d 't';
1a96a5b7	8	require './test.pl';
41779348	9	skip_all_without_unicode_tables();
1a96a5b7 BF	10	}
1a96a5b7 BF	11
c7f317a9	12	plan (tests => 52);
1a96a5b7 BF	13
	14	use utf8;
	15	use open qw( :utf8 :std );
	16
0435b362	17	is tèst, "main::tèst", "sanity check.";
1a96a5b7 BF	18	ok $::{"tèst"}, "gets the right glob in the stash.";
	19
	20	my $glob_by_sub = sub { *ｍａｉｎ::method }->();
	21
	22	is ｍａｉｎ::method, "ｍａｉｎ::method", "glob stringy works";
	23	is "" . ｍａｉｎ::method, "ｍａｉｎ::method", "glob stringify-through-concat works";
	24	is $glob_by_sub, "*ｍａｉｎ::method", "glob stringy works";
	25	is "" . $glob_by_sub, "*ｍａｉｎ::method", "";
	26
	27	sub gimme_glob {
	28	no strict 'refs';
	29	is {$_[0]}, "main::$_[0]";
	30	*{$_[0]};
	31	}
	32
	33	is "" . gimme_glob("下郎"), "*main::下郎";
	34	$a = *下郎;
	35	is "" . $a, "*main::下郎";
	36
	37	*{gimme_glob("下郎")} = sub {};
	38
	39	{
	40	ok defined *{"下郎"}{CODE};
	41	ok !defined *{"\344\270\213\351\203\216"}{CODE};
	42	}
	43
	44	$Lèon = 1;
	45	is ${*Lèon{SCALAR}}, 1, "scalar define in the right glob,";
	46	ok !${*{"L\303\250on"}{SCALAR}}, "..and nothing in the wrong one.";
	47
	48	my $a = "foo" . chr(190);
	49	my $b = $a . chr(256);
	50	chop $b; # $b is $a with utf8 on
	51
	52	is $a, $b, '$a equals $b';
	53
	54	*$b = sub { 5 };
	55
	56	is eval { main->$a }, 5, q!$a can call $b's sub!;
	57	ok !$@, "..and there's no error.";
	58
	59	my $c = $b;
	60	utf8::encode($c);
	61	ok $b ne $c, '$b unequal $c';
	62	eval { main->$c };
	63	ok $@, q!$c can't call $b's sub.!;
	64
	65	# Now define another sub under the downgraded name:
	66	*$a = sub { 6 };
	67	# Call it:
	68	is eval { main->$a }, 6, "Adding a new sub to *a and calling it works,";
	69	ok !$@, "..without errors.";
	70	eval { main->$c };
	71	ok $@, "but it's still unreachable through *c";
	72
	73	*$b = \10;
	74	is ${*$a{SCALAR}}, 10;
	75	is ${*$b{SCALAR}}, 10;
	76	is ${*$c{SCALAR}}, undef;
	77
	78	opendir FÒÒ, ".";
	79	closedir FÒÒ;
	80	::ok($::{"FÒÒ"}, "Bareword generates the right glob.");
	81	::ok(!$::{"F\303\222\303\222"});
82
83	sub участники { 1 }
84
85	ok $::{"участники"}, "non-const sub declarations generate the right glob";
2eaf799e	86	is $::{"участники"}->(), 1;
1a96a5b7	87
3453414d BF	88	sub 原 () { 1 }
	89
	90	is grep({ $_ eq "\x{539f}" } keys %::), 1, "Constant subs generate the right glob.";
	91	is grep({ $_ eq "\345\216\237" } keys %::), 0;
	92
108f32a5 BF	93	#These should probably go elsewhere.
	94	eval q{ sub wròng1 (_$); wròng1(1,2) };
	95	like( $@, qr/Malformed prototype for main::wròng1/, 'Malformed prototype croak is clean.' );
	96
	97	eval q{ sub ча::ики ($__); ча::ики(1,2) };
	98	like( $@, qr/Malformed prototype for ча::ики/ );
	99
23b0eed2 BF	100	our $問 = 10;
	101	is $問, 10, "our works";
	102	is $main::問, 10, "...as does getting the same variable through the fully qualified name";
	103	is ${"main::\345\225\217"}, undef, "..and using the encoded form doesn't";
e2f06df0 BF	104
	105	{
	106	use charnames qw( :full );
	107
	108	eval qq! my \$\x{30cb} \N{DROMEDARY CAMEL} !;
055745ba	109	$@ =~ s/eval \d+/eval 11/;
08454bd8	110	is $@, 'Unrecognized character \x{1f42a}; marked by <-- HERE after my $ニ <-- HERE near column 8 at (eval 11) line 1.
e2f06df0	111	', "'Unrecognized character' croak is UTF-8 clean";
84051091 BF	112
	113	eval "q\0foobar\0 \x{FFFF}+1";
	114	$@ =~ s/eval \d+/eval 11/;
	115	is(
	116	$@,
	117	"Unrecognized character \\x{ffff}; marked by <-- HERE after q\0foobar\0 <-- HERE near column 11 at (eval 11) line 1.\n",
	118	"...and nul-clean"
	119	);
	120
ce4bd586	121	{
84051091 BF	122	use re 'eval';
	123	my $f = qq{(?{\$ネ+ 1; \x{1F42A} })};
	124	eval { "a" =~ /^a$f/ };
	125	my $e = $@;
	126	$e =~ s/eval \d+/eval 11/;
	127	is(
	128	$e,
	129	"Unrecognized character \\x{1f42a}; marked by <-- HERE after (?{\$ネ+ 1; <-- HERE near column 13 at (eval 11) line 1.\n",
	130	"Messages from a re-eval are UTF-8 clean"
	131	);
	132
	133	$f = qq{(?{q\0foobar\0 \x{FFFF}+1 })};
	134	eval { "a" =~ /^a$f/ };
	135	my $e = $@;
	136	$e =~ s/eval \d+/eval 11/;
	137	is(
	138	$e,
	139	"Unrecognized character \\x{ffff}; marked by <-- HERE after q\x{0}foobar\x{0} <-- HERE near column 16 at (eval 11) line 1.\n",
	140	"...and nul-clean"
	141	);
	142	}
	143
	144	{
	145	eval qq{\$ネ+ 1; \x{1F42A}};
	146	$@ =~ s/eval \d+/eval 11/;
	147	is(
	148	$@,
	149	"Unrecognized character \\x{1f42a}; marked by <-- HERE after \$ネ+ 1; <-- HERE near column 8 at (eval 11) line 1.\n",
	150	"Unrecognized character error doesn't cut off in the middle of characters"
	151	)
	152	}
	153
e2f06df0	154	}
3c54b17a BF	155
	156	{
	157	use feature 'state';
	158	for ( qw( my state our ) ) {
	159	local $@;
	160	eval "$_ Ｆｏｏ $x = 1;";
	161	like $@, qr/No such class Ｆｏｏ/u, "'No such class' warning for $_ is UTF-8 clean";
	162	}
4bca4ee0 BF	163	}
	164
	165	{
	166	local $@;
	167	eval "our \$main::\x{30cb};";
	168	like $@, qr!No package name allowed for variable \$main::\x{30cb} in "our"!, "'No such package name allowed for variable' is UTF-8 clean";
	169	}
58576270 BF	170
	171	{
	172	use feature 'state';
	173	local $@;
	174	for ( qw( my state ) ) {
	175	eval "$_ \$::\x{30cb};";
	176	like $@, qr!"$_" variable \$::\x{30cb} can't be in a package!, qq!'"$_" variable %s can't be in a package' is UTF-8 clean!;
	177	}
	178	}
d0fb66e4 BF	179
	180	{
	181	local $@;
	182	eval qq!print \x{30cb}, "comma""!;
	183	like $@, qr/No comma allowed after filehandle/, "No comma allowed after filehandle triggers correctly for UTF-8 filehandles.";
	184	}
86fe3f36 BF	185
	186	# tests for "Bad name"
	187	eval q{ Ｆｏｏ::$bar };
	188	like( $@, qr/Bad name after Ｆｏｏ::/, 'Bad name after Ｆｏｏ::' );
	189	eval q{ Ｆｏｏ''bar };
	190	like( $@, qr/Bad name after Ｆｏｏ'/, 'Bad name after Ｆｏｏ\'' );
107160e2 KW	191
	192	{
	193	no warnings 'utf8';
28642c96 KW	194	my $malformed_to_be = ($::IS_EBCDIC) # Overlong sequence
	195	? "\x{74}\x{41}"
	196	: "\x{c0}\x{a0}";
107160e2	197	CORE::evalbytes "use charnames ':full'; use utf8; my \$x = \"\\N{abc$malformed_to_be}\"";
be192702	198	like( $@, qr/Malformed UTF-8 character immediately after '\\N\{abc' at .* within string/, 'Malformed UTF-8 input to \N{}');
107160e2	199	}
c7f317a9 DM	200
	201	# RT# 124216: Perl_sv_clear: Assertion
	202	# If a parsing error occurred during a forced token within an interpolated
	203	# context, the stack unwinding failed to restore PL_lex_defer and so after
	204	# error recovery the state restored after the forced token was processed
	205	# was the wrong one, resulting in the lexer thinking we're still inside a
	206	# quoted string and things getting freed multiple times.
	207	#
c7f317a9 DM	208	# The \x{3030} char isn't a legal var name, and this triggers the error.
	209	#
	210	# NB: this only failed if the closing quote of the interpolated string is
	211	# the last char of the file (i.e. no trailing \n).
	212
	213	{
10574810 DM	214	my $bad = "\x{3030}";
	215	# Write out the individual utf8 bytes making up \x{3030}. This
	216	# avoids 'Wide char in print' warnings from test.pl. (We may still
	217	# get that warning when compiling the prog itself, since the
	218	# error it prints to stderr contains a wide char.)
	219	utf8::encode($bad);
	220
	221	fresh_perl_like(qq{use utf8; "\$$bad"},
e7385084 DM	222	qr/
	223	\A
	224	( \QWide character in print at - line 1.\E\n )?
	225	\Qsyntax error at - line 1, near \E"\$.*"\n
	226	\QExecution of - aborted due to compilation errors.\E\z
	227	/xm,
	228
	229	{stderr => 1}, "RT# 124216");
c7f317a9	230	}