[perl5.git] / lib / encoding.t

BEGIN {
    # "A" has code point 193 on EBCDIC platforms.  The encoding pragma is
    # not supported there, so announce an empty test plan and leave before
    # the rest of the file is compiled.
    my $on_ebcdic = ord("A") == 193;
    if ($on_ebcdic) {
        print "1..0 # encoding pragma does not support EBCDIC platforms\n";
        exit(0);
    }
}

# Test plan: this file emits 29 numbered results.  Every test below uses
# the same idiom: print "not " when the check fails, then always print
# "ok N", so a failure shows up as "not ok N".
print "1..29\n";

# The pragma is loaded twice on purpose; the second declaration is the one
# the tests below rely on.
use encoding "latin1"; # ignored (overwritten by the next line)
use encoding "greek";  # iso 8859-7 (no "latin" alias, surprise...)

# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)

# $a is a single \xDF literal, $b a literal above 0xFF.
$a = "\xDF";
$b = "\x{100}";

# Test 1: the \xDF literal must read back as code point 0x3AF.
print "not " unless ord($a) == 0x3af;
print "ok 1\n";

# Test 2: the \x{100} literal must keep its code point.
print "not " unless ord($b) == 0x100;
print "ok 2\n";

# Concatenate the \xDF literal ($a) and the \x{100} literal ($b) and check
# that the result is two characters long and that each character keeps the
# code point expected of it.
my $c;

$c = $a . $b;

# Test 3: ord() of the result is 0x3AF.
print "not " unless ord($c) == 0x3af;
print "ok 3\n";

# Test 4: the result is exactly two characters long.
print "not " unless length($c) == 2;
print "ok 4\n";

# Test 5: the second character is still 0x100.
print "not " unless ord(substr($c, 1, 1)) == 0x100;
print "ok 5\n";

# Test 6: chr(0xdf) is expected to read back as 0x3AF, not 0xDF.
print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
print "ok 6\n";

# Test 7: the same expectation for a single byte built with pack("C").
print "not " unless ord(pack("C", 0xdf)) == 0x3af;
print "ok 7\n";

# we didn't break pack/unpack, I hope

# Test 8: a pack("C")/unpack("C") round trip returns the original value.
print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
print "ok 8\n";

# Test 9: unpack("C") of chr(0xdf) is expected to be 0xCE,
# the first octet of UTF-8 encoded 0x3af
print "not " unless unpack("C", chr(0xdf)) == 0xce;
print "ok 9\n";

# Test 10: a pack("U")/unpack("U") round trip returns the original value.
print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
print "ok 10\n";

# Test 11: unpack("U") of chr(0xdf) is expected to be 0x3AF.
print "not " unless unpack("U", chr(0xdf)) == 0x3af;
print "ok 11\n";

# charnames must still work
use charnames ':full';

# Test 12: a character written as \N{...} keeps its Unicode code point
# (0xDF); it is not expected to be remapped the way "\xDF" is.
print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
print "ok 12\n";

# combine

# One string built from all three notations: a "\xDF" literal, a \N{...}
# named character, and chr(0xdf).
$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);

# Test 13: the "\xDF" literal part reads back as 0x3AF.
print "not " unless ord($c) == 0x3af;
print "ok 13\n";

# Test 14: the named character in the middle is still 0xDF.
print "not " unless ord(substr($c, 1, 1)) == 0xdf;
print "ok 14\n";

# Test 15: the chr(0xdf) part reads back as 0x3AF.
print "not " unless ord(substr($c, 2, 1)) == 0x3af;
print "ok 15\n";

# regex literals
#
# "\xDF" and "\x{3AF}" are expected to be interchangeable both as the
# string being matched and inside the pattern; all four combinations are
# checked.

# Test 16: \xDF string against a \x{3AF} pattern.
print "not " unless "\xDF"    =~ /\x{3AF}/;
print "ok 16\n";

# Test 17: \x{3AF} string against a \xDF pattern.
print "not " unless "\x{3AF}" =~ /\xDF/;
print "ok 17\n";

# Test 18: \xDF on both sides.
print "not " unless "\xDF"    =~ /\xDF/;
print "ok 18\n";

# Test 19: \x{3AF} on both sides.
print "not " unless "\x{3AF}" =~ /\x{3AF}/;
print "ok 19\n";

# eq, cmp

# Fixture strings for the comparison tests.  The first two are built from
# raw byte values with pack("C*"), the others from code points with
# pack("U*").  Each trailing comment states the relation the tests that
# use the string expect to hold.
my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = ( 
    pack("C*", 0xDF ),       # byte
    pack("C*", 0xDF, 0x20),  # ($bytes cmp $U) > 0
    pack("U*", 0x3AF),       # $U eq $byte
    pack("U*", 0xDF ),       # $Ub would eq $byte w/o use encoding
    pack("U*", 0x3B1),       # ($g1 cmp $byte) > 0; === chr(0xe1)
    pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
    pack("U*", 0x3AB),       # ($l  cmp $byte) < 0; === chr(0xdb)
);

# All the tests in this section that compare a byte-encoded string
# to a UTF-8 encoded one are run in all possible variants:
# every one of the eq, ne and cmp operations is tested, and for each
# operator z both "$v z $u" and "$u z $v" are checked.

# alleq(LEFT, RIGHT)
#
# True only when every comparison operator agrees that the two strings are
# equal: eq, ne and cmp are each tried with the operands in both orders,
# and the first disagreement ends the check with a false result.
sub alleq($$){
    my ($left, $right) = @_;

    # eq has to succeed in both directions ...
    return !1 unless $left  eq $right;
    return !1 unless $right eq $left;

    # ... ne has to fail in both directions ...
    return !1 if $left  ne $right;
    return !1 if $right ne $left;

    # ... and cmp has to report 0 in both directions.
    return !1 unless ($left cmp $right) == 0;
    return ($right cmp $left) == 0;
}
   
# anyeq(LEFT, RIGHT)
#
# True as soon as any single comparison claims the two strings are equal:
# eq, ne and cmp are each tried with the operands in both orders.  The
# result is false only when all six comparisons report a difference.
sub anyeq($$){
    my ($left, $right) = @_;

    # eq succeeding in either direction counts as equal ...
    return !0 if $left  eq $right;
    return !0 if $right eq $left;

    # ... as does ne failing in either direction ...
    return !0 unless $left  ne $right;
    return !0 unless $right ne $left;

    # ... or cmp reporting 0 in either direction.
    return !0 if ($left cmp $right) == 0;
    return ($right cmp $left) == 0;
}

# allgt(LEFT, RIGHT)
#
# True only when cmp orders LEFT strictly after RIGHT from both sides:
# (LEFT cmp RIGHT) must be 1 and (RIGHT cmp LEFT) must be -1.
sub allgt($$){
    my ($left, $right) = @_;

    return !1 unless ($left cmp $right) == 1;
    return ($right cmp $left) == -1;
}
# Test 20: match the correct UTF-8 string.  The raw byte 0xDF and the
# character 0x3AF must be equal under eq, ne and cmp in both operand orders.
print "not " unless  alleq($byte, $U);
print "ok 20\n";

# Test 21: do not match a wrong UTF-8 string.  The raw byte 0xDF must not
# compare equal to the character 0xDF under any operator or operand order.
print "not " if anyeq($byte, $Ub);
print "ok 21\n";

# Test 22: string ordering.  cmp must order each pair consistently from
# both sides: $g1 and $g2 sort after $byte, $byte sorts after $l, and the
# two-byte $bytes sorts after the single character $U.
print "not " unless allgt ( $g1,    $byte  )  &&
                    allgt ( $g2,    $byte  )  &&
                    allgt ( $byte,  $l     )  &&
                    allgt ( $bytes, $U     );
print "ok 22\n";

# upgrade, downgrade

# Start from three copies of the single byte 0xDF.  $u is left untouched,
# $v is upgraded explicitly, and $v2 is upgraded as a side effect of being
# concatenated with a wide character that is then cut off again.
my ($u,$v,$v2);
$u = $v = $v2 = pack("C*", 0xDF);
utf8::upgrade($v);                   # explicit upgrade
$v2 = substr( $v2."\x{410}", 0, -1); # implicit upgrade

# Test 23: implicit upgrade === explicit upgrade.  The two strings must
# compare equal both under "use bytes" and under normal string comparison.
print "not "  if do{{use bytes; $v ne $v2}} || $v ne $v2;
print "ok 23\n";

# Test 24: utf8::upgrade is transparent and does not break equality
# between the untouched and the upgraded copy.
print "not " unless alleq( $u, $v );
print "ok 24\n";

# Test 25: upgrade $v again from a fresh byte string, then attempt to
# downgrade it inside an eval.
# NOTE(review): the original comment described this as a round trip that
# should get back to where it started, but the check below requires the
# opposite: utf8::downgrade must die with a message beginning "Wide ",
# $u and $v must differ when compared under "use bytes", and they must
# still be equal under normal string comparison.  Confirm which is intended.
$u = $v = pack("C*", 0xDF);
utf8::upgrade($v);
eval {utf8::downgrade( $v )};
print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v;
print "ok 25\n";

# some more eq, cmp

# $byte is already declared as a file-scoped lexical earlier in this file.
# Assign to it here instead of declaring it again: a second "my" in the
# same scope masks the earlier declaration (and warns under -w).
$byte = pack("C*", 0xDF);

# Test 26: the character 0x3AF must be eq to the raw byte 0xDF.
print "not " unless pack("U*", 0x3AF) eq $byte;
print "ok 26\n";

# Test 27: cmp between chr(0xDF) and the raw byte 0xDF must return 0.
print "not " if chr(0xDF) cmp $byte;
print "ok 27\n";

# Test 28: ordering around 0x3AF.  The next code point up sorts after the
# byte, the next one down sorts before it, a longer string with the same
# first character sorts after it, and the single character sorts before
# the two-byte string 0xDF 0x20.
print "not " unless ((pack("U*", 0x3B0)       cmp $byte) ==  1) &&
                    ((pack("U*", 0x3AE)       cmp $byte) == -1) &&
                    ((pack("U*", 0x3AF, 0x20) cmp $byte) ==  1) &&
                    ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
print "ok 28\n";

# Test 29: ord() of undef used to core dump in 5.7.3; it must simply
# evaluate to 0.
my $ord_of_undef = ord(undef);
print $ord_of_undef == 0 ? "ok 29\n" : "not ok 29\n";
Commit	Line	Data
0effba8c JH	1	BEGIN {
	2	if (ord("A") == 193) {
	3	print "1..0 # encoding pragma does not support EBCDIC platforms\n";
0f963d18	4	exit(0);
0effba8c JH	5	}
	6	}
	7
553e1bcc	8	print "1..29\n";
799ef3cb	9
0a378802	10	use encoding "latin1"; # ignored (overwritten by the next line)
f14ed3c6	11	use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...)
0a378802	12
0a378802	13	# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
f14ed3c6	14	# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
0a378802 JH	15	# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
0a378802 JH	16
9f4817db JH	17	$a = "\xDF";
	18	$b = "\x{100}";
	19
	20	print "not " unless ord($a) == 0x3af;
0a378802 JH	21	print "ok 1\n";
0a378802 JH	22
9f4817db	23	print "not " unless ord($b) == 0x100;
0a378802 JH	24	print "ok 2\n";
0a378802 JH	25
9f4817db JH	26	my $c;
	27
	28	$c = $a . $b;
	29
	30	print "not " unless ord($c) == 0x3af;
0a378802 JH	31	print "ok 3\n";
0a378802 JH	32
9f4817db JH	33	print "not " unless length($c) == 2;
	34	print "ok 4\n";
	35
	36	print "not " unless ord(substr($c, 1, 1)) == 0x100;
	37	print "ok 5\n";
0a378802	38
121910a4 JH	39	print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
	40	print "ok 6\n";
	41
	42	print "not " unless ord(pack("C", 0xdf)) == 0x3af;
	43	print "ok 7\n";
	44
	45	# we didn't break pack/unpack, I hope
	46
	47	print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
	48	print "ok 8\n";
	49
	50	# the first octet of UTF-8 encoded 0x3af
	51	print "not " unless unpack("C", chr(0xdf)) == 0xce;
	52	print "ok 9\n";
bfa383d6	53
3de8ed06 JH	54	print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
	55	print "ok 10\n";
	56
	57	print "not " unless unpack("U", chr(0xdf)) == 0x3af;
	58	print "ok 11\n";
	59
bfa383d6 JH	60	# charnames must still work
	61	use charnames ':full';
	62	print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
3de8ed06 JH	63	print "ok 12\n";
	64
	65	# combine
	66
	67	$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);
	68
	69	print "not " unless ord($c) == 0x3af;
	70	print "ok 13\n";
	71
	72	print "not " unless ord(substr($c, 1, 1)) == 0xdf;
	73	print "ok 14\n";
	74
	75	print "not " unless ord(substr($c, 2, 1)) == 0x3af;
	76	print "ok 15\n";
bfa383d6	77
a72c7584 JH	78	# regex literals
	79
	80	print "not " unless "\xDF" =~ /\x{3AF}/;
	81	print "ok 16\n";
	82
	83	print "not " unless "\x{3AF}" =~ /\xDF/;
	84	print "ok 17\n";
	85
	86	print "not " unless "\xDF" =~ /\xDF/;
	87	print "ok 18\n";
	88
	89	print "not " unless "\x{3AF}" =~ /\x{3AF}/;
	90	print "ok 19\n";
	91
799ef3cb JH	92	# eq, cmp
799ef3cb JH	93
553e1bcc AT	94	my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = (
	95	pack("C*", 0xDF ), # byte
	96	pack("C*", 0xDF, 0x20), # ($bytes2 cmp $U) > 0
	97	pack("U*", 0x3AF), # $U eq $byte
	98	pack("U*", 0xDF ), # $Ub would eq $bytev w/o use encoding
	99	pack("U*", 0x3B1), # ($g1 cmp $byte) > 0; === chr(0xe1)
	100	pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
	101	pack("U*", 0x3AB), # ($l cmp $byte) < 0; === chr(0xdb)
	102	);
	103
	104	# all the tests in this section that compare a byte encoded string
	105	# ato UTF-8 encoded are run in all possible vairants
	106	# all of the eq, ne, cmp operations tested,
	107	# $v z $u tested as well as $u z $v
	108
	109	sub alleq($$){
	110	my ($a,$b) = (shift, shift);
	111	$a eq $b && $b eq $a &&
	112	!( $a ne $b ) && !( $b ne $a ) &&
	113	( $a cmp $b ) == 0 && ( $b cmp $a ) == 0;
	114	}
	115
	116	sub anyeq($$){
	117	my ($a,$b) = (shift, shift);
	118	$a eq $b \|\| $b eq $a \|\|
	119	!( $a ne $b ) \|\| !( $b ne $a ) \|\|
	120	( $a cmp $b ) == 0 \|\| ( $b cmp $a ) == 0;
	121	}
	122
	123	sub allgt($$){
	124	my ($a,$b) = (shift, shift);
	125	( $a cmp $b ) == 1 && ( $b cmp $a ) == -1;
	126	}
	127	#match the correct UTF-8 string
	128	print "not " unless alleq($byte, $U);
	129	print "ok 20\n";
	130
	131	#do not match a wrong UTF-8 string
	132	print "not " if anyeq($byte, $Ub);
	133	print "ok 21\n";
	134
	135	#string ordering
	136	print "not " unless allgt ( $g1, $byte ) &&
	137	allgt ( $g2, $byte ) &&
	138	allgt ( $byte, $l ) &&
	139	allgt ( $bytes, $U );
	140	print "ok 22\n";
	141
	142	# upgrade, downgrade
	143
	144	my ($u,$v,$v2);
	145	$u = $v = $v2 = pack("C*", 0xDF);
	146	utf8::upgrade($v); #explicit upgrade
	147	$v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade
	148
	149	# implicit upgrade === explicit upgrade
	150	print "not " if do{{use bytes; $v ne $v2}} \|\| $v ne $v2;
	151	print "ok 23\n";
	152
	153	# utf8::upgrade is transparent and does not break equality
	154	print "not " unless alleq( $u, $v );
	155	print "ok 24\n";
	156
	157	$u = $v = pack("C*", 0xDF);
158	utf8::upgrade($v);
159	#test for a roundtrip, we should get back from where we left
160	eval {utf8::downgrade( $v )};
161	print "not " if $@ !~ /^Wide / \|\| do{{use bytes; $u eq $v}} \|\| $u ne $v;
162	print "ok 25\n";
163
164	# some more eq, cmp
165
799ef3cb JH	166	my $byte=pack("C*", 0xDF);
	167
	168	print "not " unless pack("U*", 0x3AF) eq $byte;
553e1bcc	169	print "ok 26\n";
799ef3cb JH	170
799ef3cb JH	171	print "not " if chr(0xDF) cmp $byte;
553e1bcc	172	print "ok 27\n";
799ef3cb JH	173
	174	print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) &&
	175	((pack("U*", 0x3AE) cmp $byte) == -1) &&
	176	((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) &&
	177	((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
553e1bcc	178	print "ok 28\n";
799ef3cb JH	179
799ef3cb JH	180	# Used to core dump in 5.7.3
553e1bcc	181	print ord undef == 0 ? "ok 29\n" : "not ok 29\n";