[perl5.git] / t / re / reg_fold.t

#!perl

BEGIN {
    chdir 't' if -d 't';
    require './test.pl';
    set_up_inc('../lib');
    skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
}

use strict;
use warnings;
my @tests;

my $file="../lib/unicore/CaseFolding.txt";
my @folds;
use Unicode::UCD;

# Use the Unicode data file if we are on an ASCII platform (which its data is
# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
# available.  This avoids being affected by potential bugs introduced by other
# layers of Perl
if (ord('A') == 65
    && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
    && open my $fh, "<", $file)
{
    @folds = <$fh>;
}
else {
    my ($invlist_ref, $invmap_ref, undef, $default)
                                    = Unicode::UCD::prop_invmap('Case_Folding');
    for my $i (0 .. @$invlist_ref - 1 - 1) {
        next if $invmap_ref->[$i] == $default;
        my $adjust = -1;
        for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
            $adjust++;

            # Single-code point maps go to a 'C' type
            if (! ref $invmap_ref->[$i]) {
                push @folds, sprintf("%04X; C; %04X\n",
                                     $j,
                                     $invmap_ref->[$i] + $adjust);
            }
            else {  # Multi-code point maps go to 'F'.  prop_invmap()
                    # guarantees that no adjustment is needed for these,
                    # as the range will contain just one element
                push @folds, sprintf("%04X; F; %s\n",
                                    $j,
                                    join " ", map { sprintf "%04X", $_ }
                                                    @{$invmap_ref->[$i]});
            }
        }
    }
}

for  (@folds) {
    chomp;
    my ($line,$comment)= split/\s+#\s+/, $_;
    $comment = "" unless defined $comment;
    my ($cp,$type,@folded)=split/[\s;]+/,$line||'';
    next unless $type and ($type eq 'F' or $type eq 'C');
    my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
    $_="\\x{$_}" for @folded;
    my $cpv=hex("0x$cp");
    my $chr="\\x{$cp}";
    my @str;
    foreach my $swap (0, 1) {   # swap lhs and rhs, or not.
        foreach my $charclass (0, 1) {   # Put rhs in [...], or not
            my $lhs;
            my $rhs;
            if ($swap) {
                $lhs = join "", @folded;
                $rhs = $chr;
                $rhs = "[$rhs]" if $charclass;
            } else {
                #next if $charclass && @folded > 1;
                $lhs = $chr;
                $rhs = "";
                foreach my $rhs_char (@folded) {

                    # The colon is an unrelated character to the rest of the
                    # class, and makes sure no optimization into an EXACTish
                    # node occurs.
                    $rhs .= '[:' if $charclass;
                    $rhs .=  $rhs_char;
                    $rhs .= ']' if $charclass;
                }
            }
            $lhs = "\"$lhs\"";
            $rhs = "/^$rhs\$/iu";

            # Try both Latin1 and Unicode for code points below 256
            foreach my $upgrade ("", 'utf8::upgrade($c); ') {
                if ($upgrade) { # No need to upgrade if already must be in
                                # utf8
                    next if $swap && $fold_above_latin1;
                    next if !$swap && $cpv > 255;
                }
                my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
                #print __LINE__, ": $eval\n";
                push @tests, qq[ok(eval '$eval', '$eval - $comment')];
                if (! $swap && $charclass && @folded > 1)
		{
                    $tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }"
                }
            }
        }
    }
}

# Now verify the case folding tables.  First compute the mappings without
# resorting to the functions we're testing.

# Initialize the array so each $i maps to itself.
my @fold_ascii;
for my $i (0 .. 255) {
    $fold_ascii[$i] = $i;
}
my @fold_latin1 = @fold_ascii;

# Override the uppercase elements to fold to their lower case equivalents,
# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
# so on.  The same paradigm applies for most of the Latin1 range cased
# characters, but in posix anything outside ASCII maps to itself, as we've
# already set up.
for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
    my $upper_ord = utf8::unicode_to_native($i);
    my $lower_ord = utf8::unicode_to_native($i + 32);

    $fold_latin1[$upper_ord] = $lower_ord;

    next if $i > 127;
    $fold_ascii[$upper_ord] = $lower_ord;
}

# Same for folding lower to the upper equivalents
for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
    my $lower_ord = utf8::unicode_to_native($i);
    my $upper_ord = utf8::unicode_to_native($i - 32);

    $fold_latin1[$lower_ord] = $upper_ord;

    next if $i > 127;
    $fold_ascii[$lower_ord] = $upper_ord;
}

# Test every latin1 character for the correct values in both /u and /d
for my $i (0 .. 255) {
    my $chr = sprintf "\\x%02X", $i;
    my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
    my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
    push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
    push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
}


push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, qr/$p/, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p'];

use charnames ":full";
my $e_grave = chr utf8::unicode_to_native(0xE8);
push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/'];
push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/'];

eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
    or die $@;
__DATA__
Commit	Line	Data
24df86f6 RGS	1	#!perl
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
7bc44f18	5	require './test.pl';
624c42e2	6	set_up_inc('../lib');
164766b2	7	skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
24df86f6 RGS	8	}
24df86f6 RGS	9
a0a388a1 YO	10	use strict;
a0a388a1 YO	11	use warnings;
a0a388a1	12	my @tests;
a0a388a1 YO	13
a0a388a1 YO	14	my $file="../lib/unicore/CaseFolding.txt";
10329c47 KW	15	my @folds;
	16	use Unicode::UCD;
	17
	18	# Use the Unicode data file if we are on an ASCII platform (which its data is
	19	# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
	20	# available. This avoids being affected by potential bugs introduced by other
	21	# layers of Perl
	22	if (ord('A') == 65
5fede848	23	&& pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
10329c47 KW	24	&& open my $fh, "<", $file)
	25	{
	26	@folds = <$fh>;
	27	}
	28	else {
	29	my ($invlist_ref, $invmap_ref, undef, $default)
	30	= Unicode::UCD::prop_invmap('Case_Folding');
	31	for my $i (0 .. @$invlist_ref - 1 - 1) {
	32	next if $invmap_ref->[$i] == $default;
	33	my $adjust = -1;
	34	for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
	35	$adjust++;
	36
	37	# Single-code point maps go to a 'C' type
	38	if (! ref $invmap_ref->[$i]) {
	39	push @folds, sprintf("%04X; C; %04X\n",
	40	$j,
	41	$invmap_ref->[$i] + $adjust);
	42	}
	43	else { # Multi-code point maps go to 'F'. prop_invmap()
	44	# guarantees that no adjustment is needed for these,
	45	# as the range will contain just one element
	46	push @folds, sprintf("%04X; F; %s\n",
	47	$j,
	48	join " ", map { sprintf "%04X", $_ }
	49	@{$invmap_ref->[$i]});
	50	}
	51	}
	52	}
	53	}
	54
	55	for (@folds) {
a0a388a1 YO	56	chomp;
a0a388a1 YO	57	my ($line,$comment)= split/\s+#\s+/, $_;
10329c47	58	$comment = "" unless defined $comment;
1443f10d	59	my ($cp,$type,@folded)=split/[\s;]+/,$line\|\|'';
a0a388a1	60	next unless $type and ($type eq 'F' or $type eq 'C');
1443f10d KW	61	my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
1443f10d KW	62	$_="\\x{$_}" for @folded;
a0a388a1	63	my $cpv=hex("0x$cp");
1443f10d	64	my $chr="\\x{$cp}";
a0a388a1	65	my @str;
1443f10d	66	foreach my $swap (0, 1) { # swap lhs and rhs, or not.
8edd0c7f	67	foreach my $charclass (0, 1) { # Put rhs in [...], or not
1443f10d KW	68	my $lhs;
	69	my $rhs;
	70	if ($swap) {
	71	$lhs = join "", @folded;
	72	$rhs = $chr;
	73	$rhs = "[$rhs]" if $charclass;
	74	} else {
8edd0c7f	75	#next if $charclass && @folded > 1;
1443f10d KW	76	$lhs = $chr;
	77	$rhs = "";
	78	foreach my $rhs_char (@folded) {
31ae3604 KW	79
	80	# The colon is an unrelated character to the rest of the
	81	# class, and makes sure no optimization into an EXACTish
	82	# node occurs.
	83	$rhs .= '[:' if $charclass;
1443f10d KW	84	$rhs .= $rhs_char;
	85	$rhs .= ']' if $charclass;
	86	}
	87	}
	88	$lhs = "\"$lhs\"";
6cdc48f8	89	$rhs = "/^$rhs\$/iu";
24df86f6	90
1443f10d KW	91	# Try both Latin1 and Unicode for code points below 256
1443f10d KW	92	foreach my $upgrade ("", 'utf8::upgrade($c); ') {
59aef945 KW	93	if ($upgrade) { # No need to upgrade if already must be in
59aef945 KW	94	# utf8
1443f10d KW	95	next if $swap && $fold_above_latin1;
	96	next if !$swap && $cpv > 255;
	97	}
	98	my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
	99	#print __LINE__, ": $eval\n";
	100	push @tests, qq[ok(eval '$eval', '$eval - $comment')];
67fcf67c	101	if (! $swap && $charclass && @folded > 1)
2f7760b5	102	{
67fcf67c	103	$tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }"
1443f10d	104	}
1443f10d KW	105	}
1443f10d KW	106	}
a0a388a1	107	}
24df86f6	108	}
2726813d	109
fad448f4 KW	110	# Now verify the case folding tables. First compute the mappings without
	111	# resorting to the functions we're testing.
	112
	113	# Initialize the array so each $i maps to itself.
	114	my @fold_ascii;
	115	for my $i (0 .. 255) {
	116	$fold_ascii[$i] = $i;
	117	}
	118	my @fold_latin1 = @fold_ascii;
	119
	120	# Override the uppercase elements to fold to their lower case equivalents,
	121	# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
	122	# so on. The same paradigm applies for most of the Latin1 range cased
	123	# characters, but in posix anything outside ASCII maps to itself, as we've
	124	# already set up.
	125	for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
6deb7a5e KW	126	my $upper_ord = utf8::unicode_to_native($i);
6deb7a5e KW	127	my $lower_ord = utf8::unicode_to_native($i + 32);
fad448f4 KW	128
	129	$fold_latin1[$upper_ord] = $lower_ord;
	130
	131	next if $i > 127;
	132	$fold_ascii[$upper_ord] = $lower_ord;
	133	}
	134
	135	# Same for folding lower to the upper equivalents
	136	for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
6deb7a5e KW	137	my $lower_ord = utf8::unicode_to_native($i);
6deb7a5e KW	138	my $upper_ord = utf8::unicode_to_native($i - 32);
fad448f4 KW	139
	140	$fold_latin1[$lower_ord] = $upper_ord;
	141
	142	next if $i > 127;
	143	$fold_ascii[$lower_ord] = $upper_ord;
	144	}
	145
59aef945	146	# Test every latin1 character for the correct values in both /u and /d
fad448f4 KW	147	for my $i (0 .. 255) {
	148	my $chr = sprintf "\\x%02X", $i;
	149	my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
	150	my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
	151	push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
fad448f4	152	push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
fad448f4 KW	153	}
	154
	155
2726813d	156	push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
2726813d	157	push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
aaa63dae	158	push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, qr/$p/, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p'];
2726813d	159
7b98bc43	160	use charnames ":full";
7bc44f18	161	my $e_grave = chr utf8::unicode_to_native(0xE8);
ce0a582e KW	162	push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/'];
ce0a582e KW	163	push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w\|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w\|$re/'];
7b98bc43	164
b2a1b324	165	eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
a0a388a1 YO	166	or die $@;
a0a388a1 YO	167	__DATA__