[perl5.git] / t / op / split_unicode.t

#!./perl

BEGIN {
    require './test.pl';
    skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
    plan(tests => 145);
}

{
    # check the special casing of split /\s/ and unicode
    use charnames qw(:full);
    # below test data is extracted from
    # PropList-5.0.0.txt
    # Date: 2006-06-07, 23:22:52 GMT [MD]
    #
    # Unicode Character Database
    # Copyright (c) 1991-2006 Unicode, Inc.
    # For terms of use, see http://www.unicode.org/terms_of_use.html
    # For documentation, see UCD.html
    my @spaces=(
	ord("\t"),      # Cc       <control-0009>
	ord("\n"),      # Cc       <control-000A>
	# not PerlSpace # Cc       <control-000B>
	ord("\f"),      # Cc       <control-000C>
	ord("\r"),      # Cc       <control-000D>
	ord(" "),       # Zs       SPACE
	ord("\N{NEL}"), # Cc       <control-0085>
	ord("\N{NO-BREAK SPACE}"),
			# Zs       NO-BREAK SPACE
        0x1680,         # Zs       OGHAM SPACE MARK
        0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
        0x2028,         # Zl       LINE SEPARATOR
        0x2029,         # Zp       PARAGRAPH SEPARATOR
        0x202F,         # Zs       NARROW NO-BREAK SPACE
        0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
        0x3000          # Zs       IDEOGRAPHIC SPACE
    );
    #diag "Have @{[0+@spaces]} to test\n";
    foreach my $cp (@spaces) {
	my $msg = sprintf "Space: U+%04x", $cp;
        my $space = chr($cp);
        my $str="A:$space:B\x{FFFD}";
        chop $str;

        my @res=split(/\s+/,$str);
        my $cnt=split(/\s+/,$str);
        ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
	is($cnt, scalar(@res), "$msg - /\\s+/ (count)");

        my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
        chop $s2;

        my @r2 = split(' ',$s2);
	my $c2 = split(' ',$s2);
        ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
	is($c2, scalar(@r2), "$msg - ' ' (count)");

        my @r3 = split(/\s+/, $s2);
        my $c3 = split(/\s+/, $s2);
        ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
	is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
    }

    { # RT #114808
        warning_is(
            sub {
                $p=chr(0x100);
                for (".","ab\x{101}def") {
                    @q = split /$p/
                }
            },
            undef,
            'no warnings when part of split cant match non-utf8'
        );
    }

}
Commit	Line	Data
ab08a362 NC	1	#!./perl
	2
	3	BEGIN {
	4	require './test.pl';
	5	skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
a9c9e371	6	plan(tests => 145);
ab08a362 NC	7	}
	8
	9	{
	10	# check the special casing of split /\s/ and unicode
	11	use charnames qw(:full);
	12	# below test data is extracted from
	13	# PropList-5.0.0.txt
	14	# Date: 2006-06-07, 23:22:52 GMT [MD]
	15	#
	16	# Unicode Character Database
	17	# Copyright (c) 1991-2006 Unicode, Inc.
	18	# For terms of use, see http://www.unicode.org/terms_of_use.html
	19	# For documentation, see UCD.html
	20	my @spaces=(
	21	ord("\t"), # Cc <control-0009>
	22	ord("\n"), # Cc <control-000A>
	23	# not PerlSpace # Cc <control-000B>
	24	ord("\f"), # Cc <control-000C>
	25	ord("\r"), # Cc <control-000D>
	26	ord(" "), # Zs SPACE
	27	ord("\N{NEL}"), # Cc <control-0085>
	28	ord("\N{NO-BREAK SPACE}"),
	29	# Zs NO-BREAK SPACE
	30	0x1680, # Zs OGHAM SPACE MARK
ab08a362 NC	31	0x2000..0x200A, # Zs [11] EN QUAD..HAIR SPACE
	32	0x2028, # Zl LINE SEPARATOR
	33	0x2029, # Zp PARAGRAPH SEPARATOR
	34	0x202F, # Zs NARROW NO-BREAK SPACE
	35	0x205F, # Zs MEDIUM MATHEMATICAL SPACE
	36	0x3000 # Zs IDEOGRAPHIC SPACE
	37	);
	38	#diag "Have @{[0+@spaces]} to test\n";
	39	foreach my $cp (@spaces) {
	40	my $msg = sprintf "Space: U+%04x", $cp;
	41	my $space = chr($cp);
	42	my $str="A:$space:B\x{FFFD}";
	43	chop $str;
	44
	45	my @res=split(/\s+/,$str);
	46	my $cnt=split(/\s+/,$str);
	47	ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
	48	is($cnt, scalar(@res), "$msg - /\\s+/ (count)");
	49
	50	my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
	51	chop $s2;
	52
	53	my @r2 = split(' ',$s2);
	54	my $c2 = split(' ',$s2);
	55	ok(@r2 == 2 && join('-', @r2) eq ":A:-:B", "$msg - ' '");
	56	is($c2, scalar(@r2), "$msg - ' ' (count)");
	57
	58	my @r3 = split(/\s+/, $s2);
	59	my $c3 = split(/\s+/, $s2);
	60	ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
	61	is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
	62	}
7e0d5ad7 KW	63
	64	{ # RT #114808
	65	warning_is(
	66	sub {
	67	$p=chr(0x100);
	68	for (".","ab\x{101}def") {
	69	@q = split /$p/
	70	}
	71	},
	72	undef,
	73	'no warnings when part of split cant match non-utf8'
	74	);
	75	}
	76
ab08a362	77	}