Commit | Line | Data |
---|---|---|
6aa905cf KW |
# Common subroutines and constants, used by the .t files in this directory
# that deal with UTF-8

# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
# because that uses the same functions we are testing here.  So UTF-EBCDIC
# strings are hard-coded in this file as I8 strings, and we use the
# translation functions to/from I8 from that file to convert them.
8 | ||
# Returns true when the platform's character set places "A" at ordinal 65
# (i.e., it is ASCII-based), and false otherwise.
sub isASCII { return ord("A") == 65; }
10 | ||
sub display_bytes_no_quotes {

    # Returns a displayable rendition of the input string, without any
    # enclosing quotes.  The string is examined under 'use bytes'; each
    # element that matches [[:print:]] is passed through unchanged, and every
    # other one is shown as a backslash, 'x', and its ordinal as at least two
    # lowercase hex digits.

    use bytes;
    my $string = shift;

    my $displayable = "";
    foreach my $element (split "", $string) {
        if ($element =~ /[[:print:]]/) {
            $displayable .= $element;
        }
        else {
            $displayable .= sprintf("\\x%02x", ord $element);
        }
    }

    return $displayable;
}
20 | ||
sub display_bytes {

    # Returns what display_bytes_no_quotes() yields for the first argument,
    # enclosed in a pair of double quotes.

    my $displayable = display_bytes_no_quotes(shift);
    return qq{"$displayable"};
}
24 | ||
sub output_warnings(@) {

    # Passes to diag() a report of the warnings given as arguments: a header
    # line followed by the warnings, one per line, each with its trailing
    # input record separator removed.  When called with no arguments, the
    # report instead says that no warnings were raised.  The caller's
    # variables are not modified, as only a copy is chomped.

    my @warnings = @_;
    chomp @warnings;

    my $report = (@warnings)
                 ? "The warnings were:\n" . join("\n", @warnings)
                 : "No warnings were raised";
    diag($report);
}
34 | ||
sub start_byte_to_cont($) {

    # Given a UTF-8 start byte, returns a continuation byte that carries the
    # same code point information as that start byte does.  This is used in
    # constructing an overlong malformation from valid input.
    #
    # Dies if test_UTF8_SKIP() returns less than 2 for the input byte.

    my $start_byte = shift;

    my $seq_len = test_UTF8_SKIP($start_byte);
    die "start_byte_to_cont() is expecting a UTF-8 variant" if $seq_len < 2;

    # The bit manipulations are done on the I8 form of the byte
    my $i8_ord = ord native_to_I8($start_byte);

    # Copied from utf8.h.  ANDing with this mask gets rid of the leading 1
    # bits of the start byte.
    my $info_mask = ($seq_len >= 7) ? 0x00 : (0x1F >> ($seq_len - 2));

    # These are the bits that mark a byte as being a continuation
    my $cont_marker = isASCII() ? 0x80 : 0xA0;

    return I8_to_native(chr(($i8_ord & $info_mask) | $cont_marker));
}
55 | ||
# True if ~0 takes more than 8 hex digits to print, that is, if this perl's
# integers are wider than 32 bits.
$::is64bit = length(sprintf("%x", ~0)) > 8;

# Values that depend on whether or not the platform is ASCII
if (isASCII()) {
    $::lowest_continuation = 0x80;
    $::I8c = "\x80";    # A continuation byte
    $::max_bytes = 13;  # Max number of bytes in a UTF-8 sequence
                        # representing a single code point
}
else {
    $::lowest_continuation = 0xA0;
    $::I8c = "\xa0";    # A continuation byte
    $::max_bytes = 14;  # Max number of bytes in a UTF-8 sequence
                        # representing a single code point
}
65 | ||
# Flag bits copied from utf8.h.
#
# Each flag is defined in package 'main' (the leading '$::'), and every later
# reference to one is spelled with that same explicit qualifier.  That way
# the derived values are correct no matter what package happens to be in
# effect when this code is compiled, and the code is valid under
# 'use strict' as well.
$::UTF8_ALLOW_EMPTY              = 0x0001;
$::UTF8_GOT_EMPTY                = $::UTF8_ALLOW_EMPTY;
$::UTF8_ALLOW_CONTINUATION       = 0x0002;
$::UTF8_GOT_CONTINUATION         = $::UTF8_ALLOW_CONTINUATION;
$::UTF8_ALLOW_NON_CONTINUATION   = 0x0004;
$::UTF8_GOT_NON_CONTINUATION     = $::UTF8_ALLOW_NON_CONTINUATION;
$::UTF8_ALLOW_SHORT              = 0x0008;
$::UTF8_GOT_SHORT                = $::UTF8_ALLOW_SHORT;
$::UTF8_ALLOW_LONG               = 0x0010;
$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $::UTF8_ALLOW_LONG|0x0020;
$::UTF8_GOT_LONG                 = $::UTF8_ALLOW_LONG;
$::UTF8_ALLOW_OVERFLOW           = 0x0080;
$::UTF8_GOT_OVERFLOW             = $::UTF8_ALLOW_OVERFLOW;
$::UTF8_DISALLOW_SURROGATE       = 0x0100;
$::UTF8_GOT_SURROGATE            = $::UTF8_DISALLOW_SURROGATE;
$::UTF8_WARN_SURROGATE           = 0x0200;
$::UTF8_DISALLOW_NONCHAR         = 0x0400;
$::UTF8_GOT_NONCHAR              = $::UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_NONCHAR             = 0x0800;
$::UTF8_DISALLOW_SUPER           = 0x1000;
$::UTF8_GOT_SUPER                = $::UTF8_DISALLOW_SUPER;
$::UTF8_WARN_SUPER               = 0x2000;
$::UTF8_DISALLOW_PERL_EXTENDED   = 0x4000;
$::UTF8_GOT_PERL_EXTENDED        = $::UTF8_DISALLOW_PERL_EXTENDED;
$::UTF8_WARN_PERL_EXTENDED       = 0x8000;
$::UTF8_CHECK_ONLY               = 0x10000;
$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000;

# Combinations of the above flags.  Each ILLEGAL_INTERCHANGE value is the
# corresponding ILLEGAL_C9_INTERCHANGE one plus the NONCHAR bit.
$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
                    = $::UTF8_DISALLOW_SUPER|$::UTF8_DISALLOW_SURROGATE;
$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE
          = $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$::UTF8_DISALLOW_NONCHAR;
$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE
                    = $::UTF8_WARN_SUPER|$::UTF8_WARN_SURROGATE;
$::UTF8_WARN_ILLEGAL_INTERCHANGE
          = $::UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$::UTF8_WARN_NONCHAR;
103 | ||
# Flag bits for testing uvchr_to_utf8().  Each class of code point has a
# WARN bit in the low nibble and a DISALLOW bit in the next nibble up.
$::UNICODE_WARN_SURROGATE            = 0x0001;
$::UNICODE_DISALLOW_SURROGATE        = 0x0010;

$::UNICODE_WARN_NONCHAR              = 0x0002;
$::UNICODE_DISALLOW_NONCHAR          = 0x0020;

$::UNICODE_WARN_SUPER                = 0x0004;
$::UNICODE_DISALLOW_SUPER            = 0x0040;

$::UNICODE_WARN_PERL_EXTENDED        = 0x0008;
$::UNICODE_DISALLOW_PERL_EXTENDED    = 0x0080;