[perl5.git] / lib / feature / unicode_strings.t

use warnings;
use strict;

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require './test.pl';
}

plan(20736);    # Determined by experimentation

# In this section, test the upper/lower/title case mappings for all characters
# 0-255.

# First compute the case mappings without resorting to the functions we're
# testing.

# Initialize the arrays so each $i maps to itself.
my @posix_to_upper;
for my $i (0 .. 255) {
    $posix_to_upper[$i] = chr($i);
}
my @posix_to_lower
= my @posix_to_title
= my @latin1_to_upper
= my @latin1_to_lower
= my @latin1_to_title
= @posix_to_upper;

# Override the elements in the to_lower arrays that have different standard
# lower case mappings.  (standard meaning they are 32 numbers apart)
for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
    my $upper_ord = ord_latin1_to_native $i;
    my $lower_ord = ord_latin1_to_native($i + 32);

    $latin1_to_lower[$upper_ord] = chr($lower_ord);

    next if $i > 127;

    $posix_to_lower[$upper_ord] = chr($lower_ord);
}

# Same for upper and title
for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
    my $lower_ord = ord_latin1_to_native $i;
    my $upper_ord = ord_latin1_to_native($i - 32);

    $latin1_to_upper[$lower_ord] = chr($upper_ord);
    $latin1_to_title[$lower_ord] = chr($upper_ord);

    next if $i > 127;

    $posix_to_upper[$lower_ord] = chr($upper_ord);
    $posix_to_title[$lower_ord] = chr($upper_ord);
}

# Override the abnormal cases.
$latin1_to_upper[ord_latin1_to_native 0xB5] = chr(0x39C);
$latin1_to_title[ord_latin1_to_native 0xB5] = chr(0x39C);
$latin1_to_upper[ord_latin1_to_native 0xDF] = 'SS';
$latin1_to_title[ord_latin1_to_native 0xDF] = 'Ss';
$latin1_to_upper[ord_latin1_to_native 0xFF] = chr(0x178);
$latin1_to_title[ord_latin1_to_native 0xFF] = chr(0x178);

my $repeat = 25;    # Length to make strings.

# Create hashes of strings in several ranges, both for uc and lc.
my %posix;
$posix{'uc'} = 'A' x $repeat;
$posix{'lc'} = 'a' x $repeat ;

my %cyrillic;
$cyrillic{'uc'} = chr(0x42F) x $repeat;
$cyrillic{'lc'} = chr(0x44F) x $repeat;

my %latin1;
$latin1{'uc'} = chr(ord_latin1_to_native 0xD8) x $repeat;
$latin1{'lc'} = chr(ord_latin1_to_native 0xF8) x $repeat;

my %empty;
$empty{'lc'} = $empty{'uc'} = "";

# Loop so prefix each character being tested with nothing, and the various
# strings; then loop for suffixes of those strings as well.
for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
    for my  $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
        for my $i (0 .. 255) {  # For each possible posix or latin1 character
            my $cp = sprintf "U+%04X", $i;

            # First try using latin1 (Unicode) semantics.
            use feature "unicode_strings";

            my $phrase = 'in uni8bit';
            my $char = chr($i);
            my $pre_lc = $prefix->{'lc'};
            my $pre_uc = $prefix->{'uc'};
            my $post_lc = $suffix->{'lc'};
            my $post_uc = $suffix->{'uc'};
            my $to_upper = $pre_lc . $char . $post_lc;
            my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
            my $to_lower = $pre_uc . $char . $post_uc;
            my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;

            is (uc($to_upper), $expected_upper,
                display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
            is (lc($to_lower), $expected_lower,
                display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));

            if ($pre_uc eq "") {    # Title case if null prefix.
                my $expected_title = $latin1_to_title[$i] . $post_lc;
                is (ucfirst($to_upper), $expected_title,
                    display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
                my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
                is (lcfirst($to_lower), $expected_lcfirst,
                    display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
            }

            # Then try with posix semantics.
            no feature "unicode_strings";
            $phrase = 'no uni8bit';

            # These don't contribute anything in this case.
            next if $suffix == \%cyrillic;
            next if $suffix == \%latin1;
            next if $prefix == \%cyrillic;
            next if $prefix == \%latin1;

            $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
            $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;

            is (uc($to_upper), $expected_upper,
                display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
            is (lc($to_lower), $expected_lower,
                display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));

            if ($pre_uc eq "") {
                my $expected_title = $posix_to_title[$i] . $post_lc;
                is (ucfirst($to_upper), $expected_title,
                    display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
                my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
                is (lcfirst($to_lower), $expected_lcfirst,
                    display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
            }
        }
    }
}

# In this section test that \w, \s, and \b (and complements) work correctly.
# These are the only character classes affected by this pragma.  Above ASCII
# range Latin-1 characters are in \w and \s iff the pragma is on.

# Construct the expected full Latin1 values without using anything we're
# testing.  All these were determined manually by looking in the manual.
# Boolean: is w[$i] a \w character?
my @w = (0) x 256;
for my $i ( 0x30 .. 0x39,   # 0-9
            0x41 .. 0x5a,   # A-Z
            0x61 .. 0x7a,   # a-z
            0x5F,           # _
            0xAA,           # FEMININE ORDINAL INDICATOR
            0xB5,           # MICRO SIGN
            0xBA,           # MASCULINE ORDINAL INDICATOR
            0xC0 .. 0xD6,   # various
            0xD8 .. 0xF6,   # various
            0xF8 .. 0xFF,   # various
        )
{
    $w[ord_latin1_to_native $i] = 1;
}

# Boolean: is s[$i] a \s character?
my @s = (0) x 256;
$s[ord_latin1_to_native 0x09] = 1;   # Tab
$s[ord_latin1_to_native 0x0A] = 1;   # LF
$s[ord_latin1_to_native 0x0B] = 1;   # VT
$s[ord_latin1_to_native 0x0C] = 1;   # FF
$s[ord_latin1_to_native 0x0D] = 1;   # CR
$s[ord_latin1_to_native 0x20] = 1;   # SPACE
$s[ord_latin1_to_native 0x85] = 1;   # NEL
$s[ord_latin1_to_native 0xA0] = 1;   # NO BREAK SPACE

for my $i (0 .. 255) {
    my $char = chr($i);
    my $hex_i = sprintf "%02X", $i;
    foreach my $which (\@s, \@w) {
        my $basic_name;
        if ($which == \@s) {
            $basic_name = 's';
        } else {
            $basic_name = 'w'
        }

        # Test \w \W \s \S
        foreach my $complement (0, 1) {
            my $name = '\\' . (($complement) ? uc($basic_name) : $basic_name);

            # in and out of [...]
            foreach my $charclass (0, 1) {

                # And like [^...] or just plain [...]
                foreach my $complement_class (0, 1) {
                    next if ! $charclass && $complement_class;

                    # Start with the boolean as to if the character is in the
                    # class, and then complement as needed.
                    my $expect_success = $which->[$i];
                    $expect_success = ! $expect_success if $complement;
                    $expect_success = ! $expect_success if $complement_class;

                    my $test = $name;
                    $test = "^$test" if $complement_class;
                    $test = "[$test]" if $charclass;
                    $test = "^$test\$";

                    use feature 'unicode_strings';
                    my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
                    if ($expect_success) {
                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
                    } else {
                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
                    }

                    no feature 'unicode_strings';
                    $prefix = "no uni8bit; Verify chr(0x$hex_i)";

                    # With the legacy, nothing above 128 should be in the
                    # class
                    if ($i >= 128) {
                        $expect_success = 0;
                        $expect_success = ! $expect_success if $complement;
                        $expect_success = ! $expect_success if $complement_class;
                    }
                    if ($expect_success) {
                        like($char, qr/$test/, display("$prefix =~ qr/$test/"));
                    } else {
                        unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
                    }
                }
            }
        }
    }

    # Similarly for \b and \B.
    foreach my $complement (0, 1) {
        my $name = '\\' . (($complement) ? 'B' : 'b');
        my $expect_success = ! $w[$i];  # \b is complement of \w
        $expect_success = ! $expect_success if $complement;

        my $string = "a$char";
        my $test = "(^a$name\\x{$hex_i}\$)";

        use feature 'unicode_strings';
        my $prefix = "in uni8bit; Verify $string";
        if ($expect_success) {
            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
        } else {
            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
        }

        no feature 'unicode_strings';
        $prefix = "no uni8bit; Verify $string";
        if ($i >= 128) {
            $expect_success = 1;
            $expect_success = ! $expect_success if $complement;
        }
        if ($expect_success) {
            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
        } else {
            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
        }
    }
}
Commit	Line	Data
00f254e2 KW	1	use warnings;
	2	use strict;
	3
	4	BEGIN {
	5	chdir 't' if -d 't';
	6	@INC = '../lib';
	7	require './test.pl';
	8	}
	9
a12cf05f	10	plan(20736); # Determined by experimentation
00f254e2	11
a12cf05f KW	12	# In this section, test the upper/lower/title case mappings for all characters
a12cf05f KW	13	# 0-255.
00f254e2 KW	14
	15	# First compute the case mappings without resorting to the functions we're
	16	# testing.
	17
	18	# Initialize the arrays so each $i maps to itself.
	19	my @posix_to_upper;
	20	for my $i (0 .. 255) {
	21	$posix_to_upper[$i] = chr($i);
	22	}
	23	my @posix_to_lower
	24	= my @posix_to_title
	25	= my @latin1_to_upper
	26	= my @latin1_to_lower
	27	= my @latin1_to_title
	28	= @posix_to_upper;
	29
4e89eca0 KW	30	# Override the elements in the to_lower arrays that have different standard
	31	# lower case mappings. (standard meaning they are 32 numbers apart)
	32	for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
	33	my $upper_ord = ord_latin1_to_native $i;
	34	my $lower_ord = ord_latin1_to_native($i + 32);
	35
	36	$latin1_to_lower[$upper_ord] = chr($lower_ord);
	37
	38	next if $i > 127;
	39
	40	$posix_to_lower[$upper_ord] = chr($lower_ord);
00f254e2 KW	41	}
	42
	43	# Same for upper and title
4e89eca0 KW	44	for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
	45	my $lower_ord = ord_latin1_to_native $i;
	46	my $upper_ord = ord_latin1_to_native($i - 32);
00f254e2	47
4e89eca0 KW	48	$latin1_to_upper[$lower_ord] = chr($upper_ord);
	49	$latin1_to_title[$lower_ord] = chr($upper_ord);
	50
	51	next if $i > 127;
	52
	53	$posix_to_upper[$lower_ord] = chr($upper_ord);
	54	$posix_to_title[$lower_ord] = chr($upper_ord);
00f254e2 KW	55	}
	56
	57	# Override the abnormal cases.
4e89eca0 KW	58	$latin1_to_upper[ord_latin1_to_native 0xB5] = chr(0x39C);
	59	$latin1_to_title[ord_latin1_to_native 0xB5] = chr(0x39C);
	60	$latin1_to_upper[ord_latin1_to_native 0xDF] = 'SS';
	61	$latin1_to_title[ord_latin1_to_native 0xDF] = 'Ss';
	62	$latin1_to_upper[ord_latin1_to_native 0xFF] = chr(0x178);
	63	$latin1_to_title[ord_latin1_to_native 0xFF] = chr(0x178);
00f254e2 KW	64
	65	my $repeat = 25; # Length to make strings.
	66
	67	# Create hashes of strings in several ranges, both for uc and lc.
	68	my %posix;
	69	$posix{'uc'} = 'A' x $repeat;
	70	$posix{'lc'} = 'a' x $repeat ;
	71
	72	my %cyrillic;
	73	$cyrillic{'uc'} = chr(0x42F) x $repeat;
	74	$cyrillic{'lc'} = chr(0x44F) x $repeat;
	75
	76	my %latin1;
4e89eca0 KW	77	$latin1{'uc'} = chr(ord_latin1_to_native 0xD8) x $repeat;
4e89eca0 KW	78	$latin1{'lc'} = chr(ord_latin1_to_native 0xF8) x $repeat;
00f254e2 KW	79
	80	my %empty;
	81	$empty{'lc'} = $empty{'uc'} = "";
	82
	83	# Loop so prefix each character being tested with nothing, and the various
	84	# strings; then loop for suffixes of those strings as well.
	85	for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
	86	for my $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
	87	for my $i (0 .. 255) { # For each possible posix or latin1 character
61fc5122	88	my $cp = sprintf "U+%04X", $i;
00f254e2 KW	89
00f254e2 KW	90	# First try using latin1 (Unicode) semantics.
9fab35f2	91	use feature "unicode_strings";
00f254e2	92
1e9e76cc	93	my $phrase = 'in uni8bit';
00f254e2 KW	94	my $char = chr($i);
	95	my $pre_lc = $prefix->{'lc'};
	96	my $pre_uc = $prefix->{'uc'};
	97	my $post_lc = $suffix->{'lc'};
	98	my $post_uc = $suffix->{'uc'};
	99	my $to_upper = $pre_lc . $char . $post_lc;
	100	my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
	101	my $to_lower = $pre_uc . $char . $post_uc;
	102	my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
	103
	104	is (uc($to_upper), $expected_upper,
1e9e76cc	105	display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
00f254e2	106	is (lc($to_lower), $expected_lower,
1e9e76cc	107	display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));
00f254e2 KW	108
	109	if ($pre_uc eq "") { # Title case if null prefix.
	110	my $expected_title = $latin1_to_title[$i] . $post_lc;
	111	is (ucfirst($to_upper), $expected_title,
1e9e76cc	112	display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
00f254e2 KW	113	my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
00f254e2 KW	114	is (lcfirst($to_lower), $expected_lcfirst,
1e9e76cc	115	display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
00f254e2 KW	116	}
	117
	118	# Then try with posix semantics.
1863b879	119	no feature "unicode_strings";
61fc5122	120	$phrase = 'no uni8bit';
00f254e2 KW	121
	122	# These don't contribute anything in this case.
	123	next if $suffix == \%cyrillic;
	124	next if $suffix == \%latin1;
	125	next if $prefix == \%cyrillic;
	126	next if $prefix == \%latin1;
	127
	128	$expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
	129	$expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
	130
	131	is (uc($to_upper), $expected_upper,
1e9e76cc	132	display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
00f254e2	133	is (lc($to_lower), $expected_lower,
1e9e76cc	134	display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));
00f254e2 KW	135
	136	if ($pre_uc eq "") {
	137	my $expected_title = $posix_to_title[$i] . $post_lc;
	138	is (ucfirst($to_upper), $expected_title,
1e9e76cc	139	display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
00f254e2 KW	140	my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
00f254e2 KW	141	is (lcfirst($to_lower), $expected_lcfirst,
1e9e76cc	142	display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
00f254e2 KW	143	}
	144	}
	145	}
	146	}
a12cf05f	147
4e89eca0 KW	148	# In this section test that \w, \s, and \b (and complements) work correctly.
	149	# These are the only character classes affected by this pragma. Above ASCII
	150	# range Latin-1 characters are in \w and \s iff the pragma is on.
a12cf05f	151
4e89eca0 KW	152	# Construct the expected full Latin1 values without using anything we're
	153	# testing. All these were determined manually by looking in the manual.
	154	# Boolean: is w[$i] a \w character?
a12cf05f	155	my @w = (0) x 256;
4e89eca0 KW	156	for my $i ( 0x30 .. 0x39, # 0-9
	157	0x41 .. 0x5a, # A-Z
	158	0x61 .. 0x7a, # a-z
	159	0x5F, # _
	160	0xAA, # FEMININE ORDINAL INDICATOR
	161	0xB5, # MICRO SIGN
	162	0xBA, # MASCULINE ORDINAL INDICATOR
	163	0xC0 .. 0xD6, # various
	164	0xD8 .. 0xF6, # various
	165	0xF8 .. 0xFF, # various
	166	)
	167	{
	168	$w[ord_latin1_to_native $i] = 1;
	169	}
	170
	171	# Boolean: is s[$i] a \s character?
a12cf05f	172	my @s = (0) x 256;
4e89eca0 KW	173	$s[ord_latin1_to_native 0x09] = 1; # Tab
4e89eca0 KW	174	$s[ord_latin1_to_native 0x0A] = 1; # LF
075b9d7d	175	$s[ord_latin1_to_native 0x0B] = 1; # VT
4e89eca0 KW	176	$s[ord_latin1_to_native 0x0C] = 1; # FF
	177	$s[ord_latin1_to_native 0x0D] = 1; # CR
	178	$s[ord_latin1_to_native 0x20] = 1; # SPACE
	179	$s[ord_latin1_to_native 0x85] = 1; # NEL
	180	$s[ord_latin1_to_native 0xA0] = 1; # NO BREAK SPACE
a12cf05f KW	181
	182	for my $i (0 .. 255) {
	183	my $char = chr($i);
	184	my $hex_i = sprintf "%02X", $i;
	185	foreach my $which (\@s, \@w) {
	186	my $basic_name;
	187	if ($which == \@s) {
	188	$basic_name = 's';
	189	} else {
	190	$basic_name = 'w'
	191	}
	192
	193	# Test \w \W \s \S
	194	foreach my $complement (0, 1) {
	195	my $name = '\\' . (($complement) ? uc($basic_name) : $basic_name);
	196
	197	# in and out of [...]
	198	foreach my $charclass (0, 1) {
	199
	200	# And like [^...] or just plain [...]
	201	foreach my $complement_class (0, 1) {
	202	next if ! $charclass && $complement_class;
	203
	204	# Start with the boolean as to if the character is in the
	205	# class, and then complement as needed.
	206	my $expect_success = $which->[$i];
	207	$expect_success = ! $expect_success if $complement;
	208	$expect_success = ! $expect_success if $complement_class;
	209
	210	my $test = $name;
	211	$test = "^$test" if $complement_class;
	212	$test = "[$test]" if $charclass;
	213	$test = "^$test\$";
	214
	215	use feature 'unicode_strings';
	216	my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
	217	if ($expect_success) {
	218	like($char, qr/$test/, display("$prefix =~ qr/$test/"));
	219	} else {
	220	unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
	221	}
	222
	223	no feature 'unicode_strings';
	224	$prefix = "no uni8bit; Verify chr(0x$hex_i)";
	225
	226	# With the legacy, nothing above 128 should be in the
	227	# class
	228	if ($i >= 128) {
	229	$expect_success = 0;
	230	$expect_success = ! $expect_success if $complement;
	231	$expect_success = ! $expect_success if $complement_class;
	232	}
	233	if ($expect_success) {
	234	like($char, qr/$test/, display("$prefix =~ qr/$test/"));
	235	} else {
	236	unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
	237	}
	238	}
	239	}
	240	}
	241	}
	242
	243	# Similarly for \b and \B.
	244	foreach my $complement (0, 1) {
245	my $name = '\\' . (($complement) ? 'B' : 'b');
246	my $expect_success = ! $w[$i]; # \b is complement of \w
247	$expect_success = ! $expect_success if $complement;
248
249	my $string = "a$char";
250	my $test = "(^a$name\\x{$hex_i}\$)";
251
252	use feature 'unicode_strings';
253	my $prefix = "in uni8bit; Verify $string";
254	if ($expect_success) {
255	like($string, qr/$test/, display("$prefix =~ qr/$test/"));
256	} else {
257	unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
258	}
259
260	no feature 'unicode_strings';
261	$prefix = "no uni8bit; Verify $string";
262	if ($i >= 128) {
263	$expect_success = 1;
264	$expect_success = ! $expect_success if $complement;
265	}
266	if ($expect_success) {
267	like($string, qr/$test/, display("$prefix =~ qr/$test/"));
268	like($string, qr/$test/, display("$prefix =~ qr/$test/"));
269	} else {
270	unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
271	}
272	}
273	}