[perl5.git] / ext / XS-APItest / t / utf8_setup.pl

# Common subroutines and constants, called by .t files in this directory that
# deal with UTF-8

# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
# because that uses the same functions we are testing here.  So UTF-EBCDIC
# strings are hard-coded as I8 strings in this file, and we use the
# translation functions to/from I8 from that file instead.

# Returns true when the native character set places "A" at code point 65
# (i.e. an ASCII platform), and false otherwise.
sub isASCII {
    return ord("A") == 65;
}

# Returns a displayable rendering of the argument, examined byte by byte:
# each byte that matches [[:print:]] is kept as-is, and every other byte is
# shown as a \xNN escape with two lower-case hex digits.  No surrounding
# quotes are added.
sub display_bytes_no_quotes {
    use bytes;      # Look at the individual bytes, not characters

    my ($input) = @_;

    my @rendered;
    for my $octet (split "", $input) {
        if ($octet =~ /[[:print:]]/) {
            push @rendered, $octet;
        }
        else {
            push @rendered, sprintf("\\x%02x", ord $octet);
        }
    }

    return join("", @rendered);
}

# Same rendering as display_bytes_no_quotes(), but with the result wrapped
# in a pair of double quotes.
sub display_bytes {
    my $escaped = display_bytes_no_quotes(shift);
    return qq{"$escaped"};
}

# Emits, via diag(), the warnings passed in as arguments, one per line with
# any trailing newline removed from each.  If no arguments are given, a note
# saying that no warnings were raised is emitted instead.  The caller's
# values are not modified; a copy is chomped.
sub output_warnings(@) {
    my @warnings = @_;

    return diag("No warnings were raised") unless @warnings;

    chomp @warnings;
    return diag("The warnings were:\n" . join("\n", @warnings));
}

sub start_byte_to_cont($) {

    # Takes a UTF-8 start byte, and returns a continuation byte that carries
    # the same code point information as that start byte did.  This is used
    # in constructing an overlong malformation from valid input.
    #
    # Dies if test_UTF8_SKIP() reports a length below 2 for the input, as
    # such a byte does not begin a multi-byte sequence.

    my ($start) = @_;

    my $seq_len = test_UTF8_SKIP($start);
    die "start_byte_to_cont() is expecting a UTF-8 variant" if $seq_len < 2;

    # All the bit manipulation is done on the I8 form of the byte
    my $i8_ord = ord native_to_I8($start);

    # Copied from utf8.h.  ANDing with this gets rid of the leading 1 bits,
    # leaving just the code point information.
    my $info_mask = ($seq_len >= 7) ? 0x00 : (0x1F >> ($seq_len - 2));

    # The bits that mark a byte as being a continuation
    my $cont_marker = isASCII() ? 0x80 : 0xA0;

    return I8_to_native(chr(($i8_ord & $info_mask) | $cont_marker));
}

# True if the all-ones integer needs more than 8 hex digits to print, i.e.
# more than 32 bits.
$::is64bit = length(sprintf("%x", ~0)) > 8;

# The remaining values differ between ASCII and non-ASCII platforms:
#   $::lowest_continuation  ordinal of the lowest continuation byte
#   $::I8c                  a continuation byte, as a string
#   $::max_bytes            max number of bytes in a UTF-8 sequence
#                           representing a single code point
if (isASCII()) {
    $::lowest_continuation = 0x80;
    $::I8c                 = "\x80";
    $::max_bytes           = 13;
}
else {
    $::lowest_continuation = 0xA0;
    $::I8c                 = "\xa0";
    $::max_bytes           = 14;
}

# Copied from utf8.h
#
# Each UTF8_GOT_foo value is the same bit as the correspondingly named
# UTF8_ALLOW_foo or UTF8_DISALLOW_foo flag.
#
# Every variable is written with an explicit '$::' on both sides of the
# assignments, so that the values are read from the same main:: globals
# that were just set, regardless of the package in effect or of whether
# 'strict vars' is enabled.
$::UTF8_ALLOW_EMPTY            = 0x0001;
$::UTF8_GOT_EMPTY              = $::UTF8_ALLOW_EMPTY;
$::UTF8_ALLOW_CONTINUATION     = 0x0002;
$::UTF8_GOT_CONTINUATION       = $::UTF8_ALLOW_CONTINUATION;
$::UTF8_ALLOW_NON_CONTINUATION = 0x0004;
$::UTF8_GOT_NON_CONTINUATION   = $::UTF8_ALLOW_NON_CONTINUATION;
$::UTF8_ALLOW_SHORT            = 0x0008;
$::UTF8_GOT_SHORT              = $::UTF8_ALLOW_SHORT;
$::UTF8_ALLOW_LONG             = 0x0010;

# This includes the UTF8_ALLOW_LONG bit plus one more
$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $::UTF8_ALLOW_LONG|0x0020;
$::UTF8_GOT_LONG               = $::UTF8_ALLOW_LONG;
$::UTF8_ALLOW_OVERFLOW         = 0x0080;
$::UTF8_GOT_OVERFLOW           = $::UTF8_ALLOW_OVERFLOW;
$::UTF8_DISALLOW_SURROGATE     = 0x0100;
$::UTF8_GOT_SURROGATE          = $::UTF8_DISALLOW_SURROGATE;
$::UTF8_WARN_SURROGATE         = 0x0200;
$::UTF8_DISALLOW_NONCHAR       = 0x0400;
$::UTF8_GOT_NONCHAR            = $::UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_NONCHAR           = 0x0800;
$::UTF8_DISALLOW_SUPER         = 0x1000;
$::UTF8_GOT_SUPER              = $::UTF8_DISALLOW_SUPER;
$::UTF8_WARN_SUPER             = 0x2000;
$::UTF8_DISALLOW_PERL_EXTENDED = 0x4000;
$::UTF8_GOT_PERL_EXTENDED      = $::UTF8_DISALLOW_PERL_EXTENDED;
$::UTF8_WARN_PERL_EXTENDED     = 0x8000;
$::UTF8_CHECK_ONLY             = 0x10000;
$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000;

# Combinations of the above.  The C9 ones cover above-Unicode code points
# and surrogates; the non-C9 ones add the non-characters to those.
$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
                         = $::UTF8_DISALLOW_SUPER|$::UTF8_DISALLOW_SURROGATE;
$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE
          = $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$::UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE
                         = $::UTF8_WARN_SUPER|$::UTF8_WARN_SURROGATE;
$::UTF8_WARN_ILLEGAL_INTERCHANGE
          = $::UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$::UTF8_WARN_NONCHAR;

# Test uvchr_to_utf8().  Each flag is a single bit: the four WARN flags
# occupy the low four bits, and the four DISALLOW flags the next four, in
# the same order.
$::UNICODE_WARN_SURROGATE         = 1 << 0;
$::UNICODE_WARN_NONCHAR           = 1 << 1;
$::UNICODE_WARN_SUPER             = 1 << 2;
$::UNICODE_WARN_PERL_EXTENDED     = 1 << 3;

$::UNICODE_DISALLOW_SURROGATE     = 1 << 4;
$::UNICODE_DISALLOW_NONCHAR       = 1 << 5;
$::UNICODE_DISALLOW_SUPER         = 1 << 6;
$::UNICODE_DISALLOW_PERL_EXTENDED = 1 << 7;
Commit	Line	Data
6aa905cf KW	1	# Common subroutines and constants, called by .t files in this directory that
	2	# deal with UTF-8
	3
	4	# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
	5	# because that uses the same functions we are testing here. So UTF-EBCDIC
	6	# strings are hard-coded as I8 strings in this file instead, and we use the
	7	# translation functions to/from I8 from that file instead.
	8
	9	sub isASCII { ord "A" == 65 }
	10
e86447a3	11	sub display_bytes_no_quotes {
6aa905cf KW	12	use bytes;
6aa905cf KW	13	my $string = shift;
8132136a KW	14	return join("", map {
	15	($_ =~ /[[:print:]]/)
	16	? $_
	17	: sprintf("\\x%02x", ord $_)
	18	} split "", $string)
e86447a3 KW	19	}
	20
	21	sub display_bytes {
	22	return '"' . display_bytes_no_quotes(shift) . '"';
6aa905cf KW	23	}
	24
	25	sub output_warnings(@) {
1aff4001 KW	26	my @list = @_;
	27	if (@list) {
	28	diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list;
	29	}
	30	else {
	31	diag "No warnings were raised";
	32	}
6aa905cf KW	33	}
	34
	35	sub start_byte_to_cont($) {
	36
	37	# Extract the code point information from the input UTF-8 start byte, and
	38	# return a continuation byte containing the same information. This is
	39	# used in constructing an overlong malformation from valid input.
	40
	41	my $byte = shift;
	42	my $len = test_UTF8_SKIP($byte);
	43	if ($len < 2) {
	44	die "start_byte_to_cont() is expecting a UTF-8 variant";
	45	}
	46
	47	$byte = ord native_to_I8($byte);
	48
	49	# Copied from utf8.h. This gets rid of the leading 1 bits.
	50	$byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
	51
	52	$byte \|= (isASCII) ? 0x80 : 0xA0;
	53	return I8_to_native(chr $byte);
	54	}
	55
	56	$::is64bit = length sprintf("%x", ~0) > 8;
	57
dbb8d798	58	$::lowest_continuation = (isASCII) ? 0x80 : 0xA0;
6aa905cf KW	59
	60	$::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
	61
	62
	63	$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence
	64	# representing a single code point
	65
	66	# Copied from utf8.h
	67	$::UTF8_ALLOW_EMPTY = 0x0001;
	68	$::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY;
	69	$::UTF8_ALLOW_CONTINUATION = 0x0002;
	70	$::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION;
	71	$::UTF8_ALLOW_NON_CONTINUATION = 0x0004;
	72	$::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION;
	73	$::UTF8_ALLOW_SHORT = 0x0008;
	74	$::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT;
	75	$::UTF8_ALLOW_LONG = 0x0010;
	76	$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG\|0x0020;
	77	$::UTF8_GOT_LONG = $UTF8_ALLOW_LONG;
	78	$::UTF8_ALLOW_OVERFLOW = 0x0080;
	79	$::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW;
	80	$::UTF8_DISALLOW_SURROGATE = 0x0100;
	81	$::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE;
	82	$::UTF8_WARN_SURROGATE = 0x0200;
	83	$::UTF8_DISALLOW_NONCHAR = 0x0400;
	84	$::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR;
	85	$::UTF8_WARN_NONCHAR = 0x0800;
	86	$::UTF8_DISALLOW_SUPER = 0x1000;
	87	$::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER;
	88	$::UTF8_WARN_SUPER = 0x2000;
d044b7a7 KW	89	$::UTF8_DISALLOW_PERL_EXTENDED = 0x4000;
	90	$::UTF8_GOT_PERL_EXTENDED = $UTF8_DISALLOW_PERL_EXTENDED;
	91	$::UTF8_WARN_PERL_EXTENDED = 0x8000;
6aa905cf	92	$::UTF8_CHECK_ONLY = 0x10000;
a228a8af KW	93	$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000;
a228a8af KW	94
6aa905cf KW	95	$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
	96	= $UTF8_DISALLOW_SUPER\|$UTF8_DISALLOW_SURROGATE;
	97	$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	98	= $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE\|$UTF8_DISALLOW_NONCHAR;
	99	$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE
	100	= $UTF8_WARN_SUPER\|$UTF8_WARN_SURROGATE;
	101	$::UTF8_WARN_ILLEGAL_INTERCHANGE
	102	= $UTF8_WARN_ILLEGAL_C9_INTERCHANGE\|$UTF8_WARN_NONCHAR;
	103
	104	# Test uvchr_to_utf8().
	105	$::UNICODE_WARN_SURROGATE = 0x0001;
	106	$::UNICODE_WARN_NONCHAR = 0x0002;
	107	$::UNICODE_WARN_SUPER = 0x0004;
d044b7a7	108	$::UNICODE_WARN_PERL_EXTENDED = 0x0008;
6aa905cf KW	109	$::UNICODE_DISALLOW_SURROGATE = 0x0010;
	110	$::UNICODE_DISALLOW_NONCHAR = 0x0020;
	111	$::UNICODE_DISALLOW_SUPER = 0x0040;
d044b7a7	112	$::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080;