This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move #define to different header
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d 6
760c7c2f
KW
7no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
8 # machines, and that is tested elsewhere
9
fed3ba5d
NC
10use XS::APItest;
11
4deba822
KW
12my $pound_sign = chr utf8::unicode_to_native(163);
13
7dfd8446
KW
14sub isASCII { ord "A" == 65 }
15
16sub display_bytes {
9d2d0ecd 17 use bytes;
7dfd8446
KW
18 my $string = shift;
19 return '"'
20 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
21 . '"';
22}
23
24# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
25# because that uses the same functions we are testing here. So UTF-EBCDIC
26# strings are hard-coded as I8 strings in this file instead, and we use array
27# lookup to translate into the appropriate code page.
28
29my @i8_to_native = ( # Only code page 1047 so far.
30# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
310x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
320x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
330x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
340xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
350x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
360xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
370x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
380x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
390x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
400x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
410x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
420x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
430x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
440x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
450xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
460xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
47);
48
49*I8_to_native = (isASCII)
50 ? sub { return shift }
51 : sub { return join "", map { chr $i8_to_native[ord $_] }
52 split "", shift };
53
54my $is64bit = length sprintf("%x", ~0) > 8;
55
56
57# Test utf8n_to_uvchr(). These provide essentially complete code coverage.
58# Copied from utf8.h
59my $UTF8_ALLOW_EMPTY = 0x0001;
60my $UTF8_ALLOW_CONTINUATION = 0x0002;
61my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
62my $UTF8_ALLOW_SHORT = 0x0008;
63my $UTF8_ALLOW_LONG = 0x0010;
64my $UTF8_DISALLOW_SURROGATE = 0x0020;
65my $UTF8_WARN_SURROGATE = 0x0040;
66my $UTF8_DISALLOW_NONCHAR = 0x0080;
67my $UTF8_WARN_NONCHAR = 0x0100;
68my $UTF8_DISALLOW_SUPER = 0x0200;
69my $UTF8_WARN_SUPER = 0x0400;
1d1c12d9
KW
70my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800;
71my $UTF8_WARN_ABOVE_31_BIT = 0x1000;
7dfd8446 72my $UTF8_CHECK_ONLY = 0x2000;
25e3a4e0
KW
73my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
74 = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
75my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
76 = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
7dfd8446 77
046d01eb
KW
78# Test uvchr_to_utf8().
79my $UNICODE_WARN_SURROGATE = 0x0001;
80my $UNICODE_WARN_NONCHAR = 0x0002;
81my $UNICODE_WARN_SUPER = 0x0004;
82my $UNICODE_WARN_ABOVE_31_BIT = 0x0008;
83my $UNICODE_DISALLOW_SURROGATE = 0x0010;
84my $UNICODE_DISALLOW_NONCHAR = 0x0020;
85my $UNICODE_DISALLOW_SUPER = 0x0040;
86my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
87
88my $look_for_everything_utf8n_to
89 = $UTF8_DISALLOW_SURROGATE
7dfd8446
KW
90 | $UTF8_WARN_SURROGATE
91 | $UTF8_DISALLOW_NONCHAR
92 | $UTF8_WARN_NONCHAR
93 | $UTF8_DISALLOW_SUPER
94 | $UTF8_WARN_SUPER
1d1c12d9
KW
95 | $UTF8_DISALLOW_ABOVE_31_BIT
96 | $UTF8_WARN_ABOVE_31_BIT;
046d01eb
KW
97my $look_for_everything_uvchr_to
98 = $UNICODE_DISALLOW_SURROGATE
99 | $UNICODE_WARN_SURROGATE
100 | $UNICODE_DISALLOW_NONCHAR
101 | $UNICODE_WARN_NONCHAR
102 | $UNICODE_DISALLOW_SUPER
103 | $UNICODE_WARN_SUPER
104 | $UNICODE_DISALLOW_ABOVE_31_BIT
105 | $UNICODE_WARN_ABOVE_31_BIT;
7dfd8446 106
fed3ba5d
NC
107foreach ([0, '', '', 'empty'],
108 [0, 'N', 'N', '1 char'],
109 [1, 'NN', 'N', '1 char substring'],
110 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
111 [0, $pound_sign, $pound_sign, 'pound sign'],
112 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
113 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
114 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
115 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
116 ) {
117 my ($expect, $left, $right, $desc) = @$_;
118 my $copy = $right;
119 utf8::encode($copy);
120 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
121 next if $right =~ tr/\0-\377//c;
122 utf8::encode($left);
123 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
124}
125
7dfd8446
KW
126# The keys to this hash are Unicode code points, their values are the native
127# UTF-8 representations of them. The code points are chosen because they are
128# "interesting" on either or both ASCII and EBCDIC platforms. First we add
129# boundaries where the number of bytes required to represent them increase, or
130# are adjacent to problematic code points, so we want to make sure they aren't
131# considered problematic.
132my %code_points = (
133 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
134 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
135 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
136 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
137 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
138 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
139 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
140 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
141
142 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
143 # as of this writing, considers potentially problematic on EBCDIC
144 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
145
146 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
147
148 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
149 # as of this writing, considers potentially problematic on ASCII
150 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
151
152 # Bracket the surrogates
153 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
154 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
155
156 # Bracket the 32 contiguous non characters
157 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
158 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
159
160 # Mostly bracket non-characters, but some are transitions to longer
161 # strings
162 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
163 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
164 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
165 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
166 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
167 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
168 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
169 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
170 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
171 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
172 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
173 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
174 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
175 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
176 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
177 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
178 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
179 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
180 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
181 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
182 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
183 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
184 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
185 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
186 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
187 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
188 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
189 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
190 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
191 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
192 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
193 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
194 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
195 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
196 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
197 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
198
199 # Things that would be noncharacters if they were in Unicode, and might be
200 # mistaken, if the C code is bad, to be nonchars
201 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
202 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
203 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
204 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
205
206 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
207 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
208 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
209 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
210 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
211 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
212 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
213 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
214 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
215 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
216 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
c0236afe
KW
217 0x80000000 => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
218 0xFFFFFFFF => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
7dfd8446 219);
4deba822 220
7dfd8446
KW
221if ($is64bit) {
222 no warnings qw(overflow portable);
9d2d0ecd
KW
223 $code_points{0x100000000} = (isASCII)
224 ? "\xfe\x84\x80\x80\x80\x80\x80"
225 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
226 $code_points{0x1000000000 - 1} = (isASCII)
227 ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf"
228 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
229 $code_points{0x1000000000} = (isASCII)
230 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
231 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
232 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII)
233 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
234 : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
d566bd20
KW
235 if (isASCII) { # These could falsely show as overlongs in a naive implementation
236 $code_points{0x40000000000} = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80";
237 $code_points{0x1000000000000} = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80";
238 $code_points{0x40000000000000} = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80";
239 $code_points{0x1000000000000000} = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
240 # overflows
241 #$code_points{0xfoo} = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
242 }
243}
244elsif (! isASCII) { # 32-bit EBCDIC. 64-bit is clearer to handle, so doesn't need this test case
245 no warnings qw(overflow portable);
246 $code_points{0x40000000} = I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0");
7dfd8446 247}
eb83ed87 248
7dfd8446
KW
249# Now add in entries for each of code points 0-255, which require special
250# handling on EBCDIC. Remember the keys are Unicode values, and the values
251# are the native UTF-8. For invariants, the bytes are just the native chr.
252
253my $cp = 0;
254while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
255 # invariant
256 $code_points{$cp} = chr utf8::unicode_to_native($cp);
257 $cp++;
258}
259
260# Done with the invariants. Now do the variants. All in this range are 2
261# byte. Again, we can't use the internal functions to generate UTF-8, as
262# those are what we are trying to test. In the loop, we know what range the
263# continuation bytes can be in, and what the lowest start byte can be. So we
264# cycle through them.
265
266my $first_continuation = (isASCII) ? 0x80 : 0xA0;
267my $final_continuation = 0xBF;
268my $start = (isASCII) ? 0xC2 : 0xC5;
269
270my $continuation = $first_continuation - 1;
271
272while ($cp < 255) {
273 if (++$continuation > $final_continuation) {
274
275 # Wrap to the next start byte when we reach the final continuation
276 # byte possible
277 $continuation = $first_continuation;
278 $start++;
279 }
280 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
281
282 $cp++;
283}
eb83ed87
KW
284
285my @warnings;
286
287use warnings 'utf8';
288local $SIG{__WARN__} = sub { push @warnings, @_ };
289
7dfd8446
KW
290# This set of tests looks for basic sanity, and lastly tests the bottom level
291# decode routine for the given code point. If the earlier tests for that code
292# point fail, that one probably will too. Malformations are tested in later
293# segments of code.
294for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
295 keys %code_points)
296{
297 my $hex_u = sprintf("0x%02X", $u);
298 my $n = utf8::unicode_to_native($u);
299 my $hex_n = sprintf("0x%02X", $n);
300 my $bytes = $code_points{$u};
301
302 my $offskip_should_be;
303 {
304 no warnings qw(overflow portable);
305 $offskip_should_be = (isASCII)
306 ? ( $u < 0x80 ? 1 :
307 $u < 0x800 ? 2 :
308 $u < 0x10000 ? 3 :
309 $u < 0x200000 ? 4 :
310 $u < 0x4000000 ? 5 :
311 $u < 0x80000000 ? 6 : (($is64bit)
312 ? ($u < 0x1000000000 ? 7 : 13)
313 : 7)
314 )
315 : ($u < 0xA0 ? 1 :
316 $u < 0x400 ? 2 :
317 $u < 0x4000 ? 3 :
318 $u < 0x40000 ? 4 :
319 $u < 0x400000 ? 5 :
c0236afe
KW
320 $u < 0x4000000 ? 6 :
321 $u < 0x40000000 ? 7 : 14 );
7dfd8446
KW
322 }
323
324 # If this test fails, subsequent ones are meaningless.
325 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
326 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
327 my $invariant = $offskip_should_be == 1;
328 my $display_invariant = $invariant || 0;
329 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
330 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
331
332 my $uvchr_skip_should_be = $offskip_should_be;
333 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
334 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
335 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
336 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
337
338 my $n_chr = chr $n;
339 utf8::upgrade $n_chr;
340
341 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
342 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
343
344 use bytes;
3d56ecbe
KW
345 my $byte_length = length $n_chr;
346 for (my $j = 0; $j < $byte_length; $j++) {
347 undef @warnings;
348
349 if ($j == $byte_length - 1) {
350 my $ret = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0);
351 is($ret, 0, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($n_chr) . ") returns 0 for full character");
352 }
353 else {
354 my $bytes_so_far = substr($n_chr, 0, $j + 1);
355 my $ret = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0);
356 is($ret, 1, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($bytes_so_far) . ") returns 1");
357 }
358
359 unless (is(scalar @warnings, 0,
360 " Verify is_utf8_valid_partial_char_flags generated no warnings"))
361 {
362 diag "The warnings were: " . join(", ", @warnings);
363 }
364
7dfd8446
KW
365 my $b = substr($n_chr, $j, 1);
366 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
367
368 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
369 my $display_byte_invariant = $byte_invariant || 0;
370 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
371 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
372 . "is $display_byte_invariant");
373
374 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
375 my $display_is_start = $is_start || 0;
376 next unless is(test_UTF8_IS_START($b), $is_start,
377 " Verify UTF8_IS_START($hex_b) is $display_is_start");
378
379 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
380 my $display_is_continuation = $is_continuation || 0;
381 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
382 " Verify UTF8_IS_CONTINUATION($hex_b) is "
383 . "$display_is_continuation");
384
385 my $is_continued = $uvchr_skip_should_be > 1;
386 my $display_is_continued = $is_continued || 0;
387 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
388 " Verify UTF8_IS_CONTINUED($hex_b) is "
389 . "$display_is_continued");
390
391 my $is_downgradeable_start = $n < 256
392 && $uvchr_skip_should_be > 1
393 && $j == 0;
394 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
395 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
396 $is_downgradeable_start,
397 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
398 . "$display_is_downgradeable_start");
399
400 my $is_above_latin1 = $n > 255 && $j == 0;
401 my $display_is_above_latin1 = $is_above_latin1 || 0;
402 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
403 $is_above_latin1,
404 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
405 . "$display_is_above_latin1");
406
407 my $is_possibly_problematic = $j == 0
408 && $n >= ((isASCII)
409 ? 0xD000
410 : 0x8000);
411 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
412 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
413 $is_possibly_problematic,
414 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
415 . "$display_is_above_latin1");
416 }
417
418 # We are not trying to look for warnings, etc, so if they should occur, it
419 # is an error. But some of the code points here do cause warnings, so we
420 # check here and turn off the ones that apply to such code points. A
421 # later section of the code tests for these kinds of things.
046d01eb 422 my $this_utf8_flags = $look_for_everything_utf8n_to;
7dfd8446
KW
423 my $len = length $bytes;
424 if ($n > 2 ** 31 - 1) {
1d1c12d9
KW
425 $this_utf8_flags &=
426 ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
7dfd8446 427 }
e23e8bc1
KW
428
429 my $valid_under_strict = 1;
a82be82b 430 my $valid_under_c9strict = 1;
7dfd8446
KW
431 if ($n > 0x10FFFF) {
432 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
e23e8bc1 433 $valid_under_strict = 0;
a82be82b 434 $valid_under_c9strict = 0;
7dfd8446
KW
435 }
436 elsif (($n & 0xFFFE) == 0xFFFE) {
437 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
e23e8bc1 438 $valid_under_strict = 0;
7dfd8446
KW
439 }
440
441 undef @warnings;
442
443 my $display_flags = sprintf "0x%x", $this_utf8_flags;
7dfd8446 444 my $display_bytes = display_bytes($bytes);
d7874298 445 my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
7dfd8446 446 is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
9d2d0ecd 447 is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len");
7dfd8446
KW
448
449 unless (is(scalar @warnings, 0,
450 "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
451 {
452 diag "The warnings were: " . join(", ", @warnings);
453 }
046d01eb 454
75ffa578
KW
455 undef @warnings;
456
d7874298 457 my $ret = test_isUTF8_CHAR($bytes, $len);
9d2d0ecd 458 is($ret, $len, "Verify isUTF8_CHAR($display_bytes) returns expected length: $len");
d7874298
KW
459
460 unless (is(scalar @warnings, 0,
461 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
462 {
463 diag "The warnings were: " . join(", ", @warnings);
464 }
465
466 undef @warnings;
467
468 $ret = test_isUTF8_CHAR($bytes, $len - 1);
469 is($ret, 0, "Verify isUTF8_CHAR() with too short length parameter returns 0");
470
471 unless (is(scalar @warnings, 0,
472 "Verify isUTF8_CHAR() generated no warnings"))
473 {
474 diag "The warnings were: " . join(", ", @warnings);
475 }
476
477 undef @warnings;
478
25e3a4e0
KW
479 $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
480 is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
481
482 unless (is(scalar @warnings, 0,
483 "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
484 {
485 diag "The warnings were: " . join(", ", @warnings);
486 }
487
488 undef @warnings;
489
490 $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
491 is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
492
493 unless (is(scalar @warnings, 0,
494 "Verify isUTF8_CHAR_flags() generated no warnings"))
495 {
496 diag "The warnings were: " . join(", ", @warnings);
497 }
498
499 undef @warnings;
500
e23e8bc1
KW
501 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
502 my $expected_len = ($valid_under_strict) ? $len : 0;
503 is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
504
505 unless (is(scalar @warnings, 0,
506 "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
507 {
508 diag "The warnings were: " . join(", ", @warnings);
509 }
510
511 undef @warnings;
512
513 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
514 is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
515
516 unless (is(scalar @warnings, 0,
517 "Verify isSTRICT_UTF8_CHAR() generated no warnings"))
518 {
519 diag "The warnings were: " . join(", ", @warnings);
520 }
521
25e3a4e0
KW
522 undef @warnings;
523
524 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
525 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
526
527 unless (is(scalar @warnings, 0,
528 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
529 {
530 diag "The warnings were: " . join(", ", @warnings);
531 }
532
533 undef @warnings;
534
a82be82b
KW
535 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
536 $expected_len = ($valid_under_c9strict) ? $len : 0;
537 is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
538
539 unless (is(scalar @warnings, 0,
540 "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
541 {
542 diag "The warnings were: " . join(", ", @warnings);
543 }
544
545 undef @warnings;
546
547 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
548 is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
549
550 unless (is(scalar @warnings, 0,
551 "Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
552 {
553 diag "The warnings were: " . join(", ", @warnings);
554 }
555
e23e8bc1
KW
556 undef @warnings;
557
25e3a4e0
KW
558 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
559 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
560
561 unless (is(scalar @warnings, 0,
562 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
563 {
564 diag "The warnings were: " . join(", ", @warnings);
565 }
566
567 undef @warnings;
568
75ffa578
KW
569 $ret_ref = test_valid_utf8_to_uvchr($bytes);
570 is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
9d2d0ecd 571 is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
75ffa578
KW
572
573 unless (is(scalar @warnings, 0,
574 "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings"))
575 {
576 diag "The warnings were: " . join(", ", @warnings);
577 }
578
046d01eb
KW
579 # Similarly for uvchr_to_utf8
580 my $this_uvchr_flags = $look_for_everything_uvchr_to;
581 if ($n > 2 ** 31 - 1) {
582 $this_uvchr_flags &=
583 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
584 }
585 if ($n > 0x10FFFF) {
586 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
587 }
588 elsif (($n & 0xFFFE) == 0xFFFE) {
589 $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
590 }
591 $display_flags = sprintf "0x%x", $this_uvchr_flags;
592
593 undef @warnings;
594
d7874298 595 $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
046d01eb
KW
596 ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
597 is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
598
599 unless (is(scalar @warnings, 0,
600 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
601 {
602 diag "The warnings were: " . join(", ", @warnings);
603 }
7dfd8446
KW
604}
605
606my $REPLACEMENT = 0xFFFD;
607
608# Now test the malformations. All these raise category utf8 warnings.
609my $c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
610my @malformations = (
eb83ed87
KW
611 [ "zero length string malformation", "", 0,
612 $UTF8_ALLOW_EMPTY, 0, 0,
613 qr/empty string/
614 ],
7dfd8446
KW
615 [ "orphan continuation byte malformation", I8_to_native("${c}a"),
616 2,
eb83ed87
KW
617 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
618 qr/unexpected continuation byte/
619 ],
7dfd8446 620 [ "premature next character malformation (immediate)",
9d2d0ecd
KW
621 (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
622 3,
eb83ed87
KW
623 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
624 qr/unexpected non-continuation byte.*immediately after start byte/
625 ],
7dfd8446
KW
626 [ "premature next character malformation (non-immediate)",
627 I8_to_native("\xf0${c}a"),
628 3,
eb83ed87
KW
629 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
630 qr/unexpected non-continuation byte .* 2 bytes after start byte/
631 ],
7dfd8446 632 [ "too short malformation", I8_to_native("\xf0${c}a"), 2,
eb83ed87
KW
633 # Having the 'a' after this, but saying there are only 2 bytes also
634 # tests that we pay attention to the passed in length
635 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
636 qr/2 bytes, need 4/
637 ],
d566bd20
KW
638 [ "overlong malformation, lowest 2-byte",
639 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
640 2,
7dfd8446
KW
641 $UTF8_ALLOW_LONG,
642 0, # NUL
643 2,
eb83ed87 644 qr/2 bytes, need 1/
c0236afe 645 ],
d566bd20
KW
646 [ "overlong malformation, highest 2-byte",
647 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
648 2,
649 $UTF8_ALLOW_LONG,
650 (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
651 2,
652 qr/2 bytes, need 1/
653 ],
654 [ "overlong malformation, lowest 3-byte",
655 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
656 3,
657 $UTF8_ALLOW_LONG,
658 0, # NUL
659 3,
660 qr/3 bytes, need 1/
661 ],
662 [ "overlong malformation, highest 3-byte",
663 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
664 3,
665 $UTF8_ALLOW_LONG,
666 (isASCII) ? 0x7FF : 0x3FF,
667 3,
668 qr/3 bytes, need 2/
669 ],
670 [ "overlong malformation, lowest 4-byte",
671 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
672 4,
673 $UTF8_ALLOW_LONG,
674 0, # NUL
675 4,
676 qr/4 bytes, need 1/
677 ],
678 [ "overlong malformation, highest 4-byte",
679 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
680 4,
681 $UTF8_ALLOW_LONG,
682 (isASCII) ? 0xFFFF : 0x3FFF,
683 4,
684 qr/4 bytes, need 3/
685 ],
686 [ "overlong malformation, lowest 5-byte",
687 (isASCII)
688 ? "\xf8\x80\x80\x80\x80"
689 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
690 5,
691 $UTF8_ALLOW_LONG,
692 0, # NUL
693 5,
694 qr/5 bytes, need 1/
695 ],
696 [ "overlong malformation, highest 5-byte",
697 (isASCII)
698 ? "\xf8\x87\xbf\xbf\xbf"
699 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
700 5,
701 $UTF8_ALLOW_LONG,
702 (isASCII) ? 0x1FFFFF : 0x3FFFF,
703 5,
704 qr/5 bytes, need 4/
705 ],
706 [ "overlong malformation, lowest 6-byte",
707 (isASCII)
708 ? "\xfc\x80\x80\x80\x80\x80"
709 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
710 6,
711 $UTF8_ALLOW_LONG,
712 0, # NUL
713 6,
714 qr/6 bytes, need 1/
715 ],
716 [ "overlong malformation, highest 6-byte",
717 (isASCII)
718 ? "\xfc\x83\xbf\xbf\xbf\xbf"
719 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
720 6,
721 $UTF8_ALLOW_LONG,
722 (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
723 6,
724 qr/6 bytes, need 5/
725 ],
726 [ "overlong malformation, lowest 7-byte",
727 (isASCII)
728 ? "\xfe\x80\x80\x80\x80\x80\x80"
729 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
730 7,
731 $UTF8_ALLOW_LONG,
732 0, # NUL
733 7,
734 qr/7 bytes, need 1/
735 ],
736 [ "overlong malformation, highest 7-byte",
737 (isASCII)
738 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
739 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
740 7,
741 $UTF8_ALLOW_LONG,
742 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
743 7,
744 qr/7 bytes, need 6/
c0236afe
KW
745 ],
746);
7dfd8446 747
d566bd20
KW
748if (isASCII && ! $is64bit) { # 32-bit ASCII platform
749 no warnings 'portable';
750 push @malformations,
751 [ "overflow malformation",
752 "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32
753 7,
754 0, # There is no way to allow this malformation
755 $REPLACEMENT,
756 7,
757 qr/overflow/
758 ],
759 [ "overflow malformation, can tell on first byte",
760 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
761 13,
762 0, # There is no way to allow this malformation
763 $REPLACEMENT,
764 13,
765 qr/overflow/
766 ];
767}
768else {
769 # On EBCDIC platforms, another overlong test is needed even on 32-bit
770 # systems, whereas it doesn't happen on ASCII except on 64-bit ones.
771
772 no warnings 'portable';
773 no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles
774 push @malformations,
775 [ "overlong malformation, lowest max-byte",
776 (isASCII)
777 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
778 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
779 (isASCII) ? 13 : 14,
780 $UTF8_ALLOW_LONG,
781 0, # NUL
782 (isASCII) ? 13 : 14,
783 qr/1[34] bytes, need 1/, # 1[34] to work on either ASCII or EBCDIC
784 ],
785 [ "overlong malformation, highest max-byte",
786 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
787 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
788 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
789 (isASCII) ? 13 : 14,
790 $UTF8_ALLOW_LONG,
791 (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
792 (isASCII) ? 13 : 14,
793 qr/1[34] bytes, need 7/,
794 ];
795
796 if (! $is64bit) { # 32-bit EBCDIC
797 push @malformations,
798 [ "overflow malformation",
799 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
800 14,
801 0, # There is no way to allow this malformation
802 $REPLACEMENT,
803 14,
804 qr/overflow/
805 ];
806 }
807 else { # 64-bit
808 push @malformations,
809 [ "overflow malformation",
810 (isASCII)
811 ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
812 : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
813 (isASCII) ? 13 : 14,
814 0, # There is no way to allow this malformation
815 $REPLACEMENT,
816 (isASCII) ? 13 : 14,
817 qr/overflow/
818 ];
819 }
820}
821
7dfd8446 822foreach my $test (@malformations) {
eb83ed87
KW
823 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
824
825 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
826
d7874298
KW
827 undef @warnings;
828
829 my $ret = test_isUTF8_CHAR($bytes, $length);
830 is($ret, 0, "$testname: isUTF8_CHAR returns 0");
831 unless (is(scalar @warnings, 0,
832 "$testname: isUTF8_CHAR() generated no warnings"))
833 {
834 diag "The warnings were: " . join(", ", @warnings);
835 }
836
25e3a4e0
KW
837 undef @warnings;
838
839 $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
840 is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
841 unless (is(scalar @warnings, 0,
842 "$testname: isUTF8_CHAR() generated no warnings"))
843 {
844 diag "The warnings were: " . join(", ", @warnings);
845 }
846
e23e8bc1
KW
847 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
848 is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
849 unless (is(scalar @warnings, 0,
850 "$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
851 {
852 diag "The warnings were: " . join(", ", @warnings);
853 }
854
a82be82b
KW
855 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
856 is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
857 unless (is(scalar @warnings, 0,
858 "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
859 {
860 diag "The warnings were: " . join(", ", @warnings);
861 }
862
3d56ecbe
KW
863 for my $j (1 .. $length - 1) {
864 my $partial = substr($bytes, 0, $j);
865
866 undef @warnings;
867
868 $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0);
869 my $ret_should_be = 0;
870 my $comment = "";
871 if ($testname =~ /premature|short/ && $j < 2) {
872 $ret_should_be = 1;
873 $comment = ", but need 2 bytes to discern:";
874 }
875 elsif ($testname =~ /overlong/ && $length > 2) {
876 if ($length <= 7 && $j < 2) {
877 $ret_should_be = 1;
878 $comment = ", but need 2 bytes to discern:";
879 }
880 elsif ($length > 7 && $j < 7) {
881 $ret_should_be = 1;
882 $comment = ", but need 7 bytes to discern:";
883 }
884 }
885 elsif ($testname =~ /overflow/ && $testname !~ /first byte/) {
886 if (isASCII) {
887 if ($j < (($is64bit) ? 3 : 2)) {
888 $comment = ", but need $j bytes to discern:";
889 $ret_should_be = 1;
890 }
891 }
892 else {
893 if ($j < (($is64bit) ? 2 : 8)) {
894 $comment = ", but need $j bytes to discern:";
895 $ret_should_be = 1;
896 }
897 }
898 }
899 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
900 . display_bytes($partial)
901 . ")$comment returns $ret_should_be");
902 unless (is(scalar @warnings, 0,
903 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
904 {
905 diag "The warnings were: " . join(", ", @warnings);
906 }
907 }
908
d7874298 909
eb83ed87
KW
910 # Test what happens when this malformation is not allowed
911 undef @warnings;
bd70aaaf 912 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87 913 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
9d2d0ecd 914 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: Returns expected length: $expected_len");
eb83ed87
KW
915 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
916 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
917 }
918 else {
919 if (scalar @warnings) {
7dfd8446 920 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
921 }
922 }
923
924 { # Next test when disallowed, and warnings are off.
925 undef @warnings;
926 no warnings 'utf8';
bd70aaaf 927 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
9d2d0ecd
KW
928 is($ret_ref->[0], 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns 0");
929 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns expected length: $expected_len");
930 if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': no warnings generated")) {
7dfd8446 931 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
932 }
933 }
934
935 # Test with CHECK_ONLY
936 undef @warnings;
bd70aaaf 937 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87 938 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
9d2d0ecd 939 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
eb83ed87 940 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
7dfd8446 941 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
942 }
943
944 next if $allow_flags == 0; # Skip if can't allow this malformation
945
946 # Test when the malformation is allowed
947 undef @warnings;
bd70aaaf 948 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
9d2d0ecd
KW
949 is($ret_ref->[0], $allowed_uv, "$testname: utf8n_to_uvchr(), allowed: Returns expected uv: " . sprintf("0x%04X", $allowed_uv));
950 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), allowed: Returns expected length: $expected_len");
951 if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), allowed: no warnings generated"))
eb83ed87 952 {
7dfd8446 953 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
954 }
955}
956
eb83ed87
KW
957# Now test the cases where a legal code point is generated, but may or may not
958# be allowed/warned on.
2f8f112e 959my @tests = (
7dfd8446
KW
960 [ "lowest surrogate",
961 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
962 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
963 'surrogate', 0xD800,
964 (isASCII) ? 3 : 4,
965 qr/surrogate/
966 ],
967 [ "a middle surrogate",
968 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
969 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
970 'surrogate', 0xD90D,
971 (isASCII) ? 3 : 4,
eb83ed87
KW
972 qr/surrogate/
973 ],
7dfd8446
KW
974 [ "highest surrogate",
975 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
976 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
977 'surrogate', 0xDFFF,
978 (isASCII) ? 3 : 4,
979 qr/surrogate/
980 ],
981 [ "first non_unicode",
982 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
983 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
984 'non_unicode', 0x110000,
985 (isASCII) ? 4 : 5,
986 qr/not Unicode.* may not be portable/
987 ],
d566bd20
KW
988 [ "non_unicode whose first byte tells that",
989 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
990 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
991 'non_unicode',
992 (isASCII) ? 0x140000 : 0x200000,
993 (isASCII) ? 4 : 5,
994 qr/not Unicode.* may not be portable/
995 ],
7dfd8446
KW
996 [ "first of 32 consecutive non-character code points",
997 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
998 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
999 'nonchar', 0xFDD0,
1000 (isASCII) ? 3 : 4,
1001 qr/Unicode non-character.*is not recommended for open interchange/
1002 ],
1003 [ "a mid non-character code point of the 32 consecutive ones",
1004 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
1005 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1006 'nonchar', 0xFDE0,
1007 (isASCII) ? 3 : 4,
1008 qr/Unicode non-character.*is not recommended for open interchange/
1009 ],
1010 [ "final of 32 consecutive non-character code points",
1011 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
1012 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1013 'nonchar', 0xFDEF,
1014 (isASCII) ? 3 : 4,
1015 qr/Unicode non-character.*is not recommended for open interchange/
1016 ],
1017 [ "non-character code point U+FFFE",
1018 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
1019 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1020 'nonchar', 0xFFFE,
1021 (isASCII) ? 3 : 4,
1022 qr/Unicode non-character.*is not recommended for open interchange/
1023 ],
1024 [ "non-character code point U+FFFF",
1025 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
1026 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1027 'nonchar', 0xFFFF,
1028 (isASCII) ? 3 : 4,
1029 qr/Unicode non-character.*is not recommended for open interchange/
1030 ],
1031 [ "non-character code point U+1FFFE",
1032 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
1033 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1034 'nonchar', 0x1FFFE, 4,
1035 qr/Unicode non-character.*is not recommended for open interchange/
1036 ],
1037 [ "non-character code point U+1FFFF",
1038 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
1039 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1040 'nonchar', 0x1FFFF, 4,
1041 qr/Unicode non-character.*is not recommended for open interchange/
1042 ],
1043 [ "non-character code point U+2FFFE",
1044 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
1045 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1046 'nonchar', 0x2FFFE, 4,
1047 qr/Unicode non-character.*is not recommended for open interchange/
1048 ],
1049 [ "non-character code point U+2FFFF",
1050 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
1051 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1052 'nonchar', 0x2FFFF, 4,
1053 qr/Unicode non-character.*is not recommended for open interchange/
1054 ],
1055 [ "non-character code point U+3FFFE",
1056 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
1057 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1058 'nonchar', 0x3FFFE, 4,
1059 qr/Unicode non-character.*is not recommended for open interchange/
1060 ],
1061 [ "non-character code point U+3FFFF",
1062 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
1063 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1064 'nonchar', 0x3FFFF, 4,
1065 qr/Unicode non-character.*is not recommended for open interchange/
1066 ],
1067 [ "non-character code point U+4FFFE",
1068 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
1069 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1070 'nonchar', 0x4FFFE,
1071 (isASCII) ? 4 : 5,
1072 qr/Unicode non-character.*is not recommended for open interchange/
1073 ],
1074 [ "non-character code point U+4FFFF",
1075 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
1076 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1077 'nonchar', 0x4FFFF,
1078 (isASCII) ? 4 : 5,
1079 qr/Unicode non-character.*is not recommended for open interchange/
1080 ],
1081 [ "non-character code point U+5FFFE",
1082 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
1083 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1084 'nonchar', 0x5FFFE,
1085 (isASCII) ? 4 : 5,
1086 qr/Unicode non-character.*is not recommended for open interchange/
1087 ],
1088 [ "non-character code point U+5FFFF",
1089 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
1090 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1091 'nonchar', 0x5FFFF,
1092 (isASCII) ? 4 : 5,
1093 qr/Unicode non-character.*is not recommended for open interchange/
1094 ],
1095 [ "non-character code point U+6FFFE",
1096 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
1097 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1098 'nonchar', 0x6FFFE,
1099 (isASCII) ? 4 : 5,
1100 qr/Unicode non-character.*is not recommended for open interchange/
1101 ],
1102 [ "non-character code point U+6FFFF",
1103 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
1104 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1105 'nonchar', 0x6FFFF,
1106 (isASCII) ? 4 : 5,
1107 qr/Unicode non-character.*is not recommended for open interchange/
1108 ],
1109 [ "non-character code point U+7FFFE",
1110 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
1111 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1112 'nonchar', 0x7FFFE,
1113 (isASCII) ? 4 : 5,
1114 qr/Unicode non-character.*is not recommended for open interchange/
1115 ],
1116 [ "non-character code point U+7FFFF",
1117 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
1118 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1119 'nonchar', 0x7FFFF,
1120 (isASCII) ? 4 : 5,
1121 qr/Unicode non-character.*is not recommended for open interchange/
1122 ],
1123 [ "non-character code point U+8FFFE",
1124 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
1125 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1126 'nonchar', 0x8FFFE,
1127 (isASCII) ? 4 : 5,
1128 qr/Unicode non-character.*is not recommended for open interchange/
1129 ],
1130 [ "non-character code point U+8FFFF",
1131 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
1132 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1133 'nonchar', 0x8FFFF,
1134 (isASCII) ? 4 : 5,
1135 qr/Unicode non-character.*is not recommended for open interchange/
1136 ],
1137 [ "non-character code point U+9FFFE",
1138 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
1139 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1140 'nonchar', 0x9FFFE,
1141 (isASCII) ? 4 : 5,
1142 qr/Unicode non-character.*is not recommended for open interchange/
1143 ],
1144 [ "non-character code point U+9FFFF",
1145 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
1146 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1147 'nonchar', 0x9FFFF,
1148 (isASCII) ? 4 : 5,
1149 qr/Unicode non-character.*is not recommended for open interchange/
1150 ],
1151 [ "non-character code point U+AFFFE",
1152 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
1153 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1154 'nonchar', 0xAFFFE,
1155 (isASCII) ? 4 : 5,
1156 qr/Unicode non-character.*is not recommended for open interchange/
1157 ],
1158 [ "non-character code point U+AFFFF",
1159 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
1160 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1161 'nonchar', 0xAFFFF,
1162 (isASCII) ? 4 : 5,
1163 qr/Unicode non-character.*is not recommended for open interchange/
1164 ],
1165 [ "non-character code point U+BFFFE",
1166 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
1167 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1168 'nonchar', 0xBFFFE,
1169 (isASCII) ? 4 : 5,
1170 qr/Unicode non-character.*is not recommended for open interchange/
1171 ],
1172 [ "non-character code point U+BFFFF",
1173 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
1174 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1175 'nonchar', 0xBFFFF,
1176 (isASCII) ? 4 : 5,
1177 qr/Unicode non-character.*is not recommended for open interchange/
1178 ],
1179 [ "non-character code point U+CFFFE",
1180 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
1181 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1182 'nonchar', 0xCFFFE,
1183 (isASCII) ? 4 : 5,
1184 qr/Unicode non-character.*is not recommended for open interchange/
1185 ],
1186 [ "non-character code point U+CFFFF",
1187 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
1188 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1189 'nonchar', 0xCFFFF,
1190 (isASCII) ? 4 : 5,
1191 qr/Unicode non-character.*is not recommended for open interchange/
1192 ],
1193 [ "non-character code point U+DFFFE",
1194 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
1195 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1196 'nonchar', 0xDFFFE,
1197 (isASCII) ? 4 : 5,
1198 qr/Unicode non-character.*is not recommended for open interchange/
1199 ],
1200 [ "non-character code point U+DFFFF",
1201 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
1202 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1203 'nonchar', 0xDFFFF,
1204 (isASCII) ? 4 : 5,
1205 qr/Unicode non-character.*is not recommended for open interchange/
1206 ],
1207 [ "non-character code point U+EFFFE",
1208 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
1209 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1210 'nonchar', 0xEFFFE,
1211 (isASCII) ? 4 : 5,
1212 qr/Unicode non-character.*is not recommended for open interchange/
1213 ],
1214 [ "non-character code point U+EFFFF",
1215 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
1216 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1217 'nonchar', 0xEFFFF,
1218 (isASCII) ? 4 : 5,
1219 qr/Unicode non-character.*is not recommended for open interchange/
1220 ],
1221 [ "non-character code point U+FFFFE",
1222 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
1223 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1224 'nonchar', 0xFFFFE,
1225 (isASCII) ? 4 : 5,
1226 qr/Unicode non-character.*is not recommended for open interchange/
1227 ],
1228 [ "non-character code point U+FFFFF",
1229 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
1230 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1231 'nonchar', 0xFFFFF,
1232 (isASCII) ? 4 : 5,
1233 qr/Unicode non-character.*is not recommended for open interchange/
1234 ],
1235 [ "non-character code point U+10FFFE",
1236 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
1237 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1238 'nonchar', 0x10FFFE,
1239 (isASCII) ? 4 : 5,
1240 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1241 ],
7dfd8446
KW
1242 [ "non-character code point U+10FFFF",
1243 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
1244 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1245 'nonchar', 0x10FFFF,
1246 (isASCII) ? 4 : 5,
ba707cdc 1247 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1248 ],
1d1c12d9 1249 [ "requires at least 32 bits",
c0236afe
KW
1250 (isASCII)
1251 ? "\xfe\x82\x80\x80\x80\x80\x80"
1252 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
eb83ed87 1253 # This code point is chosen so that it is representable in a UV on
2f8f112e 1254 # 32-bit machines
1d1c12d9 1255 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 1256 'utf8', 0x80000000, (isASCII) ? 7 :14,
7dfd8446
KW
1257 qr/Code point 0x80000000 is not Unicode, and not portable/
1258 ],
1d1c12d9 1259 [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
c0236afe
KW
1260 (isASCII)
1261 ? "\xfe\x82\x80\x80\x80\x80\x80"
1262 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
7dfd8446 1263 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
c0236afe 1264 'utf8', 0x80000000, (isASCII) ? 7 :14,
ea5ced44 1265 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 1266 ],
1d1c12d9
KW
1267 [ "overflow with warnings/disallow for more than 31 bits",
1268 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
1269 # with overflow. The overflow malformation is never allowed, so
1270 # preventing it takes precedence if the ABOVE_31_BIT options would
c0236afe 1271 # otherwise allow in an overflowing value. The ASCII code points (1
1d1c12d9
KW
1272 # for 32-bits; 1 for 64) were chosen because the old overflow
1273 # detection algorithm did not catch them; this means this test also
c0236afe
KW
1274 # checks for that fix. The EBCDIC are arbitrary overflowing ones
1275 # since we have no reports of failures with it.
1276 (($is64bit)
1277 ? ((isASCII)
9d2d0ecd 1278 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
c0236afe
KW
1279 : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
1280 : ((isASCII)
9d2d0ecd 1281 ? "\xfe\x86\x80\x80\x80\x80\x80"
c0236afe 1282 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
ea5ced44 1283
1d1c12d9
KW
1284 # We include both warning categories to make sure the ABOVE_31_BIT one
1285 # has precedence
1286 "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
1287 "$UTF8_DISALLOW_ABOVE_31_BIT",
7dfd8446 1288 'utf8', 0,
c0236afe 1289 (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
ea5ced44 1290 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 1291 ],
c0236afe 1292);
2f8f112e 1293
c0236afe
KW
1294if ($is64bit) {
1295 no warnings qw{portable overflow};
2f8f112e 1296 push @tests,
c0236afe
KW
1297 [ "More than 32 bits",
1298 (isASCII)
9d2d0ecd 1299 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
c0236afe 1300 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1d1c12d9 1301 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 1302 'utf8', 0x1000000000, (isASCII) ? 13 : 14,
ea5ced44 1303 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e 1304 ];
83dc0f42
KW
1305 if (! isASCII) {
1306 push @tests, # These could falsely show wrongly in a naive implementation
1307 [ "requires at least 32 bits",
1308 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1309 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1310 'utf8', 0x800000000, 14,
1311 qr/Code point 0x800000000 is not Unicode, and not portable/
1312 ],
1313 [ "requires at least 32 bits",
1314 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1315 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1316 'utf8', 0x10000000000, 14,
1317 qr/Code point 0x10000000000 is not Unicode, and not portable/
1318 ],
1319 [ "requires at least 32 bits",
1320 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1321 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1322 'utf8', 0x200000000000, 14,
1323 qr/Code point 0x200000000000 is not Unicode, and not portable/
1324 ],
1325 [ "requires at least 32 bits",
1326 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1327 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1328 'utf8', 0x4000000000000, 14,
1329 qr/Code point 0x4000000000000 is not Unicode, and not portable/
1330 ],
1331 [ "requires at least 32 bits",
1332 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1333 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1334 'utf8', 0x80000000000000, 14,
1335 qr/Code point 0x80000000000000 is not Unicode, and not portable/
1336 ],
1337 [ "requires at least 32 bits",
1338 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1339 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1340 'utf8', 0x1000000000000000, 14,
1341 qr/Code point 0x1000000000000000 is not Unicode, and not portable/
1342 ];
1343 }
2f8f112e
KW
1344}
1345
1346foreach my $test (@tests) {
eb83ed87
KW
1347 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
1348
1349 my $length = length $bytes;
2f8f112e 1350 my $will_overflow = $testname =~ /overflow/;
eb83ed87 1351
d7874298
KW
1352 {
1353 use warnings;
1354 undef @warnings;
1355 my $ret = test_isUTF8_CHAR($bytes, $length);
25e3a4e0 1356 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
d7874298
KW
1357 if ($will_overflow) {
1358 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
25e3a4e0 1359 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
d7874298
KW
1360 }
1361 else {
1362 is($ret, $length,
1363 "isUTF8_CHAR() $testname: returns expected length: $length");
25e3a4e0
KW
1364 is($ret_flags, $length,
1365 "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
d7874298
KW
1366 }
1367 unless (is(scalar @warnings, 0,
25e3a4e0 1368 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
d7874298
KW
1369 {
1370 diag "The warnings were: " . join(", ", @warnings);
1371 }
3d56ecbe 1372
e23e8bc1
KW
1373 undef @warnings;
1374 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1375 if ($will_overflow) {
1376 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
1377 }
1378 else {
1379 my $expected_ret = ( $testname =~ /surrogate|non-character/
1380 || $allowed_uv > 0x10FFFF)
1381 ? 0
1382 : $length;
1383 is($ret, $expected_ret,
1384 "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1385 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1386 $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1387 is($ret, $expected_ret,
1388 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
e23e8bc1
KW
1389 }
1390 unless (is(scalar @warnings, 0,
25e3a4e0 1391 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
e23e8bc1
KW
1392 {
1393 diag "The warnings were: " . join(", ", @warnings);
1394 }
1395
a82be82b
KW
1396 undef @warnings;
1397 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1398 if ($will_overflow) {
1399 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
1400 }
1401 else {
1402 my $expected_ret = ( $testname =~ /surrogate/
1403 || $allowed_uv > 0x10FFFF)
1404 ? 0
1405 : $length;
1406 is($ret, $expected_ret,
1407 "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1408 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1409 $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1410 is($ret, $expected_ret,
1411 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
a82be82b
KW
1412 }
1413 unless (is(scalar @warnings, 0,
25e3a4e0 1414 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
a82be82b
KW
1415 {
1416 diag "The warnings were: " . join(", ", @warnings);
1417 }
1418
3d56ecbe
KW
1419 # Test partial character handling, for each byte not a full character
1420 for my $j (1.. $length - 1) {
1421
1422 # Skip the test for the interaction between overflow and above-31
1423 # bit. It is really testing other things than the partial
1424 # character tests, for which other tests in this file are
1425 # sufficient
1426 last if $testname =~ /overflow/;
1427
1428 foreach my $disallow_flag (0, $disallow_flags) {
1429 my $partial = substr($bytes, 0, $j);
1430 my $ret_should_be;
1431 my $comment;
1432 if ($disallow_flag) {
1433 $ret_should_be = 0;
1434 $comment = "disallowed";
1435 }
1436 else {
1437 $ret_should_be = 1;
1438 $comment = "allowed";
1439 }
1440
1441 if ($disallow_flag) {
1442 if ($testname =~ /non-character/) {
1443 $ret_should_be = 1;
1444 $comment .= ", but but need full char to discern";
1445 }
1446 elsif ($testname =~ /surrogate/) {
1447 if ($j < 2) {
1448 $ret_should_be = 1;
1449 $comment .= ", but need 2 bytes to discern";
1450 }
1451 }
1452 elsif ($testname =~ /first non_unicode/ && $j < 2) {
1453 $ret_should_be = 1;
1454 $comment .= ", but need 2 bytes to discern";
1455 }
1456 }
1457
1458 undef @warnings;
1459
1460 $ret = test_is_utf8_valid_partial_char_flags($partial, $j, $disallow_flag);
1461 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1462 . display_bytes($partial)
1463 . "), $comment: returns $ret_should_be");
1464 unless (is(scalar @warnings, 0,
1465 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1466 {
1467 diag "The warnings were: " . join(", ", @warnings);
1468 }
1469 }
1470 }
d7874298
KW
1471 }
1472
eb83ed87
KW
1473 # This is more complicated than the malformations tested earlier, as there
1474 # are several orthogonal variables involved. We test all the subclasses
1475 # of utf8 warnings to verify they work with and without the utf8 class,
1476 # and don't have effects on other sublass warnings
54f4afef 1477 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
1478 foreach my $warn_flag (0, $warn_flags) {
1479 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 1480 foreach my $do_warning (0, 1) {
eb83ed87 1481
13d7a909
KW
1482 my $eval_warn = $do_warning
1483 ? "use warnings '$warning'"
1484 : $warning eq "utf8"
7dfd8446
KW
1485 ? "no warnings 'utf8'"
1486 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 1487
13d7a909
KW
1488 # is effectively disallowed if will overflow, even if the
1489 # flag indicates it is allowed, fix up test name to
1490 # indicate this as well
1491 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 1492
046d01eb 1493 my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag)
13d7a909
KW
1494 ? 'disallowed'
1495 : ($disallowed)
1d1c12d9 1496 ? 'ABOVE_31_BIT allowed'
13d7a909
KW
1497 : 'allowed');
1498 $this_name .= ", $eval_warn";
1499 $this_name .= ", " . (($warn_flag)
1500 ? 'with warning flag'
1501 : 'no warning flag');
eb83ed87 1502
13d7a909
KW
1503 undef @warnings;
1504 my $ret_ref;
7dfd8446
KW
1505 my $display_bytes = display_bytes($bytes);
1506 my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
1507 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
13d7a909
KW
1508 eval "$eval_text";
1509 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
7dfd8446 1510 diag "\$!='$!'; eval'd=\"$call\"";
13d7a909 1511 next;
54f4afef 1512 }
13d7a909 1513 if ($disallowed) {
7dfd8446
KW
1514 unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
1515 {
1516 diag $call;
1517 }
2f8f112e
KW
1518 }
1519 else {
7dfd8446 1520 unless (is($ret_ref->[0], $allowed_uv,
9d2d0ecd
KW
1521 "$this_name: Returns expected uv: "
1522 . sprintf("0x%04X", $allowed_uv)))
7dfd8446
KW
1523 {
1524 diag $call;
1525 }
1526 }
1527 unless (is($ret_ref->[1], $expected_len,
9d2d0ecd 1528 "$this_name: Returns expected length: $expected_len"))
7dfd8446
KW
1529 {
1530 diag $call;
13d7a909 1531 }
13d7a909
KW
1532
1533 if (! $do_warning
1534 && ($warning eq 'utf8' || $warning eq $category))
1535 {
1536 if (!is(scalar @warnings, 0,
1537 "$this_name: No warnings generated"))
1538 {
7dfd8446
KW
1539 diag $call;
1540 diag "The warnings were: " . join(", ", @warnings);
2f8f112e
KW
1541 }
1542 }
13d7a909
KW
1543 elsif ($will_overflow
1544 && ! $disallow_flag
1545 && $warning eq 'utf8')
1546 {
1547
1548 # Will get the overflow message instead of the expected
1549 # message under these circumstances, as they would
1550 # otherwise accept an overflowed value, which the code
1551 # should not allow, so falls back to overflow.
1552 if (is(scalar @warnings, 1,
1553 "$this_name: Got a single warning "))
1554 {
7dfd8446
KW
1555 unless (like($warnings[0], qr/overflow/,
1556 "$this_name: Got overflow warning"))
1557 {
1558 diag $call;
1559 }
13d7a909
KW
1560 }
1561 else {
7dfd8446 1562 diag $call;
13d7a909 1563 if (scalar @warnings) {
7dfd8446 1564 diag "The warnings were: "
13d7a909
KW
1565 . join(", ", @warnings);
1566 }
1567 }
eb83ed87 1568 }
13d7a909
KW
1569 elsif ($warn_flag
1570 && ($warning eq 'utf8' || $warning eq $category))
1571 {
1572 if (is(scalar @warnings, 1,
1573 "$this_name: Got a single warning "))
1574 {
7dfd8446
KW
1575 unless (like($warnings[0], $message,
1576 "$this_name: Got expected warning"))
1577 {
1578 diag $call;
1579 }
13d7a909
KW
1580 }
1581 else {
7dfd8446 1582 diag $call;
13d7a909 1583 if (scalar @warnings) {
7dfd8446 1584 diag "The warnings were: "
13d7a909
KW
1585 . join(", ", @warnings);
1586 }
eb83ed87
KW
1587 }
1588 }
eb83ed87 1589
13d7a909
KW
1590 # Check CHECK_ONLY results when the input is disallowed. Do
1591 # this when actually disallowed, not just when the
1592 # $disallow_flag is set
1593 if ($disallowed) {
1594 undef @warnings;
1595 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
1596 $disallow_flag|$UTF8_CHECK_ONLY);
7dfd8446
KW
1597 unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
1598 diag $call;
1599 }
1600 unless (is($ret_ref->[1], -1,
9d2d0ecd 1601 "$this_name: CHECK_ONLY: returns -1 for length"))
7dfd8446
KW
1602 {
1603 diag $call;
1604 }
13d7a909
KW
1605 if (! is(scalar @warnings, 0,
1606 "$this_name, CHECK_ONLY: no warnings generated"))
1607 {
7dfd8446
KW
1608 diag $call;
1609 diag "The warnings were: " . join(", ", @warnings);
13d7a909 1610 }
eb83ed87 1611 }
046d01eb
KW
1612
1613 # Now repeat some of the above, but for
1614 # uvchr_to_utf8_flags(). Since this comes from an
1615 # existing code point, it hasn't overflowed.
1616 next if $will_overflow;
1617
1618 # The warning and disallow flags passed in are for
1619 # utf8n_to_uvchr(). Convert them for
1620 # uvchr_to_utf8_flags().
1621 my $uvchr_warn_flag = 0;
1622 my $uvchr_disallow_flag = 0;
1623 if ($warn_flag) {
1624 if ($warn_flag == $UTF8_WARN_SURROGATE) {
1625 $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
1626 }
1627 elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
1628 $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
1629 }
1630 elsif ($warn_flag == $UTF8_WARN_SUPER) {
1631 $uvchr_warn_flag = $UNICODE_WARN_SUPER
1632 }
1d1c12d9 1633 elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
046d01eb
KW
1634 $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT;
1635 }
1636 else {
1637 fail(sprintf "Unexpected warn flag: %x",
1638 $warn_flag);
1639 next;
1640 }
1641 }
1642 if ($disallow_flag) {
1643 if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) {
1644 $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE
1645 }
1646 elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) {
1647 $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR
1648 }
1649 elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
1650 $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER
1651 }
1d1c12d9 1652 elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) {
046d01eb
KW
1653 $uvchr_disallow_flag =
1654 $UNICODE_DISALLOW_ABOVE_31_BIT;
1655 }
1656 else {
1657 fail(sprintf "Unexpected disallow flag: %x",
1658 $disallow_flag);
1659 next;
1660 }
1661 }
1662
1663 $disallowed = $uvchr_disallow_flag;
1664
1665 $this_name = "uvchr_to_utf8_flags() $testname: "
1666 . (($uvchr_disallow_flag)
1667 ? 'disallowed'
1668 : ($disallowed)
1669 ? 'ABOVE_31_BIT allowed'
1670 : 'allowed');
1671 $this_name .= ", $eval_warn";
1672 $this_name .= ", " . (($uvchr_warn_flag)
1673 ? 'with warning flag'
1674 : 'no warning flag');
1675
1676 undef @warnings;
1677 my $ret;
1678 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
1679 my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag;
1680 $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv;
1681 $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)";
1682 eval "$eval_text";
1683 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
1684 diag "\$!='$!'; eval'd=\"$eval_text\"";
1685 next;
1686 }
1687 if ($disallowed) {
1688 unless (is($ret, undef, "$this_name: Returns undef")) {
1689 diag $call;
1690 }
1691 }
1692 else {
1693 unless (is($ret, $bytes, "$this_name: Returns expected string")) {
1694 diag $call;
1695 }
1696 }
1697 if (! $do_warning
1698 && ($warning eq 'utf8' || $warning eq $category))
1699 {
1700 if (!is(scalar @warnings, 0,
1701 "$this_name: No warnings generated"))
1702 {
1703 diag $call;
1704 diag "The warnings were: " . join(", ", @warnings);
1705 }
1706 }
1707 elsif ($uvchr_warn_flag
1708 && ($warning eq 'utf8' || $warning eq $category))
1709 {
1710 if (is(scalar @warnings, 1,
1711 "$this_name: Got a single warning "))
1712 {
046d01eb
KW
1713 unless (like($warnings[0], $message,
1714 "$this_name: Got expected warning"))
1715 {
1716 diag $call;
1717 }
046d01eb
KW
1718 }
1719 else {
1720 diag $call;
1721 if (scalar @warnings) {
1722 diag "The warnings were: "
1723 . join(", ", @warnings);
1724 }
1725 }
1726 }
eb83ed87
KW
1727 }
1728 }
1729 }
1730 }
1731}
6e3d6c02 1732
fed3ba5d 1733done_testing;