[perl5.git] / t / re / reg_fold.t

#!perl

BEGIN {
    # When a t/ directory is visible from the current directory, run from
    # inside it so the relative paths used below resolve.
    if (-d 't') {
        chdir 't';
    }

    # Replace the module search path with the single entry '../lib'.
    @INC = ('../lib');

    # Pull in the test helper library (supplies skip_all_if_miniperl and
    # the test functions used by the rest of this file).
    require './test.pl';
    skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
}

use strict;
use warnings;
# Every generated test is stored here as a string of Perl source; the strings
# are joined and eval'ed in one go at the bottom of the file.
my @tests;

# Code points exempted from the "multi-char fold inside a character class"
# TODO marking applied further down (a few of those cases do pass).
my %todo_pass = map { $_ => 1 }
    qw(00DF 1E9E FB00 FB01 FB02 FB03 FB04 FB05 FB06);

# Walk CaseFolding.txt and, for each full ('F') fold entry, generate tests
# that match the code point against its fold, in both directions.
my $file = "../lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (my $entry = <$fh>) {
    chomp $entry;

    # Split off the trailing "# NAME" part, then break the data part into
    # code point, fold type and the folded-to code points.
    my ($line, $comment) = split /\s+#\s+/, $entry;
    my ($cp, $type, @folded) = split /[\s;]+/, $line || '';

    # Only 'F' entries are handled here; 'C' tests now done by fold_grind.t.
    # Blank and comment-only lines yield no usable $type and are skipped too.
    next unless defined $type and $type eq 'F';

    # Guard against an entry with no "# NAME" part, so the test description
    # below never interpolates undef.
    $comment = '' unless defined $comment;

    # Count of folded code points that do not fit in a single byte.
    my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;

    # Turn each hex code point into a \x{...} escape, kept as literal text so
    # that it is interpreted only when the generated test is eval'ed.
    $_ = "\\x{$_}" for @folded;
    my $cpv = hex("0x$cp");
    my $chr = "\\x{$cp}";

    foreach my $swap (0, 1) {   # swap lhs and rhs, or not.
        foreach my $charclass (0) {   # Put rhs in [...], or not
            my $lhs;
            my $rhs;
            if ($swap) {
                # The folded sequence is the string; the original code point
                # is the pattern.
                $lhs = join "", @folded;
                $rhs = $chr;
                $rhs = "[$rhs]" if $charclass;
            }
            else {
                # The original code point is the string; the folded sequence
                # is the pattern.
                $lhs = $chr;
                $rhs = "";
                foreach my $rhs_char (@folded) {
                    $rhs .= '[' if $charclass;
                    $rhs .= $rhs_char;
                    $rhs .= ']' if $charclass;
                }
            }
            $lhs = "\"$lhs\"";
            $rhs = "/^$rhs\$/i";

            # Try both Latin1 and Unicode for code points below 256
            foreach my $upgrade ("", 'utf8::upgrade($c); ') {
                if ($upgrade) {
                    # The upgraded variant is generated only when the string
                    # side consists solely of code points below 256.
                    next if $swap && $fold_above_latin1;
                    next if !$swap && $cpv > 255;
                }
                my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
                #print __LINE__, ": $eval\n";
                push @tests, qq[ok(eval '$eval', '$eval - $comment')];

                # Wrap the test just pushed in a TODO block for the
                # combinations recorded below as not working.
                if ($charclass && @folded > 1 && $swap && ! $upgrade && ! $fold_above_latin1) {
                    $tests[-1]="TODO: { local \$::TODO='Multi-char, non-utf8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
                }
                elsif (! $upgrade && $cpv >= 128 && $cpv <= 255 && $cpv != 0xb5) {
                    $tests[-1]="TODO: { local \$::TODO='Most non-utf8 latin1 doesnt work';\n$tests[-1] }"
                }
                elsif (! $swap && $charclass && @folded > 1
                    && ! $todo_pass{$cp})
                {
                    # There are a few of these that pass; most fail.
                    $tests[-1]="TODO: { local \$::TODO='Some multi-char, f8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
                }
            }
        }
    }
}
close $fh;

# Now verify the case folding tables.  The expected mappings are worked out
# here by hand, without calling the functions under test.

# Both tables start out as the identity mapping over 0 .. 255.
my @fold_ascii  = (0 .. 255);
my @fold_latin1 = @fold_ascii;

# Cased letters come in pairs exactly 32 apart: 'A' in ASCII is 0x41 and 'a'
# is 0x41+32, 'B' is 0x42, and so on, and most cased characters in the Latin1
# range follow the same layout.  Each row below gives the offset to the
# partner followed by the Latin1 ordinals it applies to: first the upper case
# characters (partner is +32), then the lower case ones (partner is -32).
# In the ASCII-only table anything outside ASCII keeps mapping to itself.
my @case_rows = (
    [ +32, 0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE ],
    [ -32, 0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE ],
);
for my $row (@case_rows) {
    my ($offset, @latin1_ords) = @$row;

    for my $latin1_ord (@latin1_ords) {
        my $from = ord_latin1_to_native($latin1_ord);
        my $to   = ord_latin1_to_native($latin1_ord + $offset);

        $fold_latin1[$from] = $to;

        # Only the ASCII letters take part in the ASCII-only table.
        $fold_ascii[$from] = $to unless $latin1_ord > 127;
    }
}

# For every character 0 .. 255, generate one test under /d checking it
# against its ASCII-only fold, then one under /u checking it against its
# Latin1 fold.
for my $ord (0 .. 255) {
    my $escape = sprintf "\\x%02X", $ord;

    for my $variant ([ 'd', \@fold_ascii ], [ 'u', \@fold_latin1 ]) {
        my ($modifier, $table) = @$variant;
        my $expected = sprintf "0x%02X", $table->[$ord];

        push @tests, qq[like chr($expected), qr/(?${modifier}:$escape)/i, 'chr($expected) =~ qr/(?${modifier}:$escape)/i'];
    }
}


# Individual regression tests.  The first two are built with qq[], so their
# backslash escapes are processed here, before the test source is eval'ed;
# the third uses q[], so its escapes reach the eval untouched.
push @tests,
    qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'],
    qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"],
    q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p'];

# The \N{...} names in the next two tests are resolved when the test source
# is eval'ed below.
use charnames ":full";
push @tests,
    q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/'],
    q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/'];

# Assemble one program: a plan line declaring how many tests were generated,
# then every test, then a final true value so that a successful run makes the
# eval return true.  Anything that makes the eval fail is rethrown.
my $planned_count = @tests;
my $test_script   = join ";\n", "plan tests=>$planned_count", @tests, "1";
eval $test_script
    or die $@;
__DATA__
Commit	Line	Data
24df86f6 RGS	1	#!perl
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
	5	@INC = '../lib';
0214bff6	6	require './test.pl';
164766b2	7	skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
24df86f6 RGS	8	}
24df86f6 RGS	9
a0a388a1 YO	10	use strict;
a0a388a1 YO	11	use warnings;
a0a388a1	12	my @tests;
a0a388a1	13
2f7760b5 DM	14	my %todo_pass = map { $_ => 1 }
	15	qw(00DF 1E9E FB00 FB01 FB02 FB03 FB04 FB05 FB06);
	16
a0a388a1	17	my $file="../lib/unicore/CaseFolding.txt";
24df86f6	18	open my $fh,"<",$file or die "Failed to read '$file': $!";
a0a388a1 YO	19	while (<$fh>) {
	20	chomp;
	21	my ($line,$comment)= split/\s+#\s+/, $_;
1443f10d	22	my ($cp,$type,@folded)=split/[\s;]+/,$line||'';
a0a388a1	23	next unless $type and ($type eq 'F' or $type eq 'C');
8bfc9fab	24	next if $type eq 'C'; # 'C' tests now done by fold_grind.t
1443f10d KW	25	my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
1443f10d KW	26	$_="\\x{$_}" for @folded;
a0a388a1	27	my $cpv=hex("0x$cp");
1443f10d	28	my $chr="\\x{$cp}";
a0a388a1	29	my @str;
1443f10d	30	foreach my $swap (0, 1) { # swap lhs and rhs, or not.
3366dfc6	31	foreach my $charclass (0) { # Put rhs in [...], or not
1443f10d KW	32	my $lhs;
	33	my $rhs;
	34	if ($swap) {
	35	$lhs = join "", @folded;
	36	$rhs = $chr;
	37	$rhs = "[$rhs]" if $charclass;
	38	} else {
	39	$lhs = $chr;
	40	$rhs = "";
	41	foreach my $rhs_char (@folded) {
	42	$rhs .= '[' if $charclass;
	43	$rhs .= $rhs_char;
	44	$rhs .= ']' if $charclass;
	45	}
	46	}
	47	$lhs = "\"$lhs\"";
	48	$rhs = "/^$rhs\$/i";
24df86f6	49
1443f10d KW	50	# Try both Latin1 and Unicode for code points below 256
	51	foreach my $upgrade ("", 'utf8::upgrade($c); ') {
	52	if ($upgrade) {
	53	next if $swap && $fold_above_latin1;
	54	next if !$swap && $cpv > 255;
	55	}
	56	my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
	57	#print __LINE__, ": $eval\n";
	58	push @tests, qq[ok(eval '$eval', '$eval - $comment')];
0f824d87	59	if ($charclass && @folded > 1 && $swap && ! $upgrade && ! $fold_above_latin1) {
1443f10d KW	60	$tests[-1]="TODO: { local \$::TODO='Multi-char, non-utf8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
	61	} elsif (! $upgrade && $cpv >= 128 && $cpv <= 255 && $cpv != 0xb5) {
	62	$tests[-1]="TODO: { local \$::TODO='Most non-utf8 latin1 doesnt work';\n$tests[-1] }"
2f7760b5 DM	63	} elsif (! $swap && $charclass && @folded > 1
	64	&& ! $todo_pass{$cp})
	65	{
1443f10d KW	66	# There are a few of these that pass; most fail.
	67	$tests[-1]="TODO: { local \$::TODO='Some multi-char, f8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
	68	}
1443f10d KW	69	}
1443f10d KW	70	}
a0a388a1	71	}
24df86f6	72	}
2726813d	73
fad448f4 KW	74	# Now verify the case folding tables. First compute the mappings without
	75	# resorting to the functions we're testing.
	76
	77	# Initialize the array so each $i maps to itself.
	78	my @fold_ascii;
	79	for my $i (0 .. 255) {
	80	$fold_ascii[$i] = $i;
	81	}
	82	my @fold_latin1 = @fold_ascii;
	83
	84	# Override the uppercase elements to fold to their lower case equivalents,
	85	# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
	86	# so on. The same paradigm applies for most of the Latin1 range cased
	87	# characters, but in posix anything outside ASCII maps to itself, as we've
	88	# already set up.
	89	for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
	90	my $upper_ord = ord_latin1_to_native($i);
	91	my $lower_ord = ord_latin1_to_native($i + 32);
	92
	93	$fold_latin1[$upper_ord] = $lower_ord;
	94
	95	next if $i > 127;
	96	$fold_ascii[$upper_ord] = $lower_ord;
	97	}
	98
	99	# Same for folding lower to the upper equivalents
	100	for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
	101	my $lower_ord = ord_latin1_to_native($i);
	102	my $upper_ord = ord_latin1_to_native($i - 32);
	103
	104	$fold_latin1[$lower_ord] = $upper_ord;
	105
	106	next if $i > 127;
	107	$fold_ascii[$lower_ord] = $upper_ord;
	108	}
	109
	110	# Test every latin1 character that the correct values in both /u and /d
	111	for my $i (0 .. 255) {
	112	my $chr = sprintf "\\x%02X", $i;
	113	my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
	114	my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
	115	push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
fad448f4	116	push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
fad448f4 KW	117	}
	118
	119
2726813d	120	push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
2726813d	121	push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
8951c461	122	push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p'];
2726813d	123
7b98bc43 KW	124	use charnames ":full";
	125	push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/'];
	126	push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/'];
	127
b2a1b324	128	eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
a0a388a1 YO	129	or die $@;
a0a388a1 YO	130	__DATA__