This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8.t: Rename variable
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d 6
760c7c2f
KW
7no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
8 # machines, and that is tested elsewhere
9
fed3ba5d
NC
10use XS::APItest;
11
4deba822
KW
12my $pound_sign = chr utf8::unicode_to_native(163);
13
7dfd8446
KW
14sub isASCII { ord "A" == 65 }
15
16sub display_bytes {
9d2d0ecd 17 use bytes;
7dfd8446
KW
18 my $string = shift;
19 return '"'
20 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
21 . '"';
22}
23
24# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
25# because that uses the same functions we are testing here. So UTF-EBCDIC
26# strings are hard-coded as I8 strings in this file instead, and we use array
27# lookup to translate into the appropriate code page.
28
29my @i8_to_native = ( # Only code page 1047 so far.
30# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
310x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
320x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
330x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
340xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
350x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
360xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
370x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
380x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
390x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
400x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
410x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
420x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
430x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
440x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
450xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
460xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
47);
48
49*I8_to_native = (isASCII)
50 ? sub { return shift }
51 : sub { return join "", map { chr $i8_to_native[ord $_] }
52 split "", shift };
53
54my $is64bit = length sprintf("%x", ~0) > 8;
55
56
57# Test utf8n_to_uvchr(). These provide essentially complete code coverage.
58# Copied from utf8.h
59my $UTF8_ALLOW_EMPTY = 0x0001;
60my $UTF8_ALLOW_CONTINUATION = 0x0002;
61my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
62my $UTF8_ALLOW_SHORT = 0x0008;
63my $UTF8_ALLOW_LONG = 0x0010;
64my $UTF8_DISALLOW_SURROGATE = 0x0020;
65my $UTF8_WARN_SURROGATE = 0x0040;
66my $UTF8_DISALLOW_NONCHAR = 0x0080;
67my $UTF8_WARN_NONCHAR = 0x0100;
68my $UTF8_DISALLOW_SUPER = 0x0200;
69my $UTF8_WARN_SUPER = 0x0400;
1d1c12d9
KW
70my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800;
71my $UTF8_WARN_ABOVE_31_BIT = 0x1000;
7dfd8446 72my $UTF8_CHECK_ONLY = 0x2000;
25e3a4e0
KW
73my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
74 = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
75my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
76 = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
7dfd8446 77
046d01eb
KW
78# Test uvchr_to_utf8().
79my $UNICODE_WARN_SURROGATE = 0x0001;
80my $UNICODE_WARN_NONCHAR = 0x0002;
81my $UNICODE_WARN_SUPER = 0x0004;
82my $UNICODE_WARN_ABOVE_31_BIT = 0x0008;
83my $UNICODE_DISALLOW_SURROGATE = 0x0010;
84my $UNICODE_DISALLOW_NONCHAR = 0x0020;
85my $UNICODE_DISALLOW_SUPER = 0x0040;
86my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
87
88my $look_for_everything_utf8n_to
89 = $UTF8_DISALLOW_SURROGATE
7dfd8446
KW
90 | $UTF8_WARN_SURROGATE
91 | $UTF8_DISALLOW_NONCHAR
92 | $UTF8_WARN_NONCHAR
93 | $UTF8_DISALLOW_SUPER
94 | $UTF8_WARN_SUPER
1d1c12d9
KW
95 | $UTF8_DISALLOW_ABOVE_31_BIT
96 | $UTF8_WARN_ABOVE_31_BIT;
046d01eb
KW
97my $look_for_everything_uvchr_to
98 = $UNICODE_DISALLOW_SURROGATE
99 | $UNICODE_WARN_SURROGATE
100 | $UNICODE_DISALLOW_NONCHAR
101 | $UNICODE_WARN_NONCHAR
102 | $UNICODE_DISALLOW_SUPER
103 | $UNICODE_WARN_SUPER
104 | $UNICODE_DISALLOW_ABOVE_31_BIT
105 | $UNICODE_WARN_ABOVE_31_BIT;
7dfd8446 106
fed3ba5d
NC
107foreach ([0, '', '', 'empty'],
108 [0, 'N', 'N', '1 char'],
109 [1, 'NN', 'N', '1 char substring'],
110 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
111 [0, $pound_sign, $pound_sign, 'pound sign'],
112 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
113 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
114 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
115 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
116 ) {
117 my ($expect, $left, $right, $desc) = @$_;
118 my $copy = $right;
119 utf8::encode($copy);
120 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
121 next if $right =~ tr/\0-\377//c;
122 utf8::encode($left);
123 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
124}
125
7dfd8446
KW
126# The keys to this hash are Unicode code points, their values are the native
127# UTF-8 representations of them. The code points are chosen because they are
128# "interesting" on either or both ASCII and EBCDIC platforms. First we add
129# boundaries where the number of bytes required to represent them increase, or
130# are adjacent to problematic code points, so we want to make sure they aren't
131# considered problematic.
132my %code_points = (
133 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
134 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
135 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
136 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
137 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
138 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
139 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
140 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
141
142 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
143 # as of this writing, considers potentially problematic on EBCDIC
144 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
145
146 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
147
148 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
149 # as of this writing, considers potentially problematic on ASCII
150 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
151
5f8a3d1d 152 # Bracket the surrogates, and include several surrogates
7dfd8446 153 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
5f8a3d1d
KW
154 0xD800 => (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
155 0xDC00 => (isASCII) ? "\xed\xb0\x80" : I8_to_native("\xf1\xb7\xa0\xa0"),
156 0xDFFF => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
157 0xDFFF => (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
7dfd8446
KW
158 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
159
5f8a3d1d 160 # Include the 32 contiguous non characters, and surrounding code points
7dfd8446 161 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
5f8a3d1d
KW
162 0xFDD0 => (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
163 0xFDD1 => (isASCII) ? "\xef\xb7\x91" : I8_to_native("\xf1\xbf\xae\xb1"),
164 0xFDD2 => (isASCII) ? "\xef\xb7\x92" : I8_to_native("\xf1\xbf\xae\xb2"),
165 0xFDD3 => (isASCII) ? "\xef\xb7\x93" : I8_to_native("\xf1\xbf\xae\xb3"),
166 0xFDD4 => (isASCII) ? "\xef\xb7\x94" : I8_to_native("\xf1\xbf\xae\xb4"),
167 0xFDD5 => (isASCII) ? "\xef\xb7\x95" : I8_to_native("\xf1\xbf\xae\xb5"),
168 0xFDD6 => (isASCII) ? "\xef\xb7\x96" : I8_to_native("\xf1\xbf\xae\xb6"),
169 0xFDD7 => (isASCII) ? "\xef\xb7\x97" : I8_to_native("\xf1\xbf\xae\xb7"),
170 0xFDD8 => (isASCII) ? "\xef\xb7\x98" : I8_to_native("\xf1\xbf\xae\xb8"),
171 0xFDD9 => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xae\xb9"),
172 0xFDDA => (isASCII) ? "\xef\xb7\x9a" : I8_to_native("\xf1\xbf\xae\xba"),
173 0xFDDB => (isASCII) ? "\xef\xb7\x9b" : I8_to_native("\xf1\xbf\xae\xbb"),
174 0xFDDC => (isASCII) ? "\xef\xb7\x9c" : I8_to_native("\xf1\xbf\xae\xbc"),
175 0xFDDD => (isASCII) ? "\xef\xb7\x9d" : I8_to_native("\xf1\xbf\xae\xbd"),
176 0xFDDE => (isASCII) ? "\xef\xb7\x9e" : I8_to_native("\xf1\xbf\xae\xbe"),
177 0xFDDF => (isASCII) ? "\xef\xb7\x9f" : I8_to_native("\xf1\xbf\xae\xbf"),
178 0xFDE0 => (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
179 0xFDE1 => (isASCII) ? "\xef\xb7\xa1" : I8_to_native("\xf1\xbf\xaf\xa1"),
180 0xFDE2 => (isASCII) ? "\xef\xb7\xa2" : I8_to_native("\xf1\xbf\xaf\xa2"),
181 0xFDE3 => (isASCII) ? "\xef\xb7\xa3" : I8_to_native("\xf1\xbf\xaf\xa3"),
182 0xFDE4 => (isASCII) ? "\xef\xb7\xa4" : I8_to_native("\xf1\xbf\xaf\xa4"),
183 0xFDE5 => (isASCII) ? "\xef\xb7\xa5" : I8_to_native("\xf1\xbf\xaf\xa5"),
184 0xFDE6 => (isASCII) ? "\xef\xb7\xa6" : I8_to_native("\xf1\xbf\xaf\xa6"),
185 0xFDE7 => (isASCII) ? "\xef\xb7\xa7" : I8_to_native("\xf1\xbf\xaf\xa7"),
186 0xFDE8 => (isASCII) ? "\xef\xb7\xa8" : I8_to_native("\xf1\xbf\xaf\xa8"),
187 0xFDEa => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xaf\xa9"),
188 0xFDEA => (isASCII) ? "\xef\xb7\xaa" : I8_to_native("\xf1\xbf\xaf\xaa"),
189 0xFDEB => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"),
190 0xFDEC => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"),
191 0xFDED => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"),
192 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xae\xae"),
193 0xFDEF => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
7dfd8446
KW
194 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
195
5f8a3d1d 196 # Mostly around non-characters, but some are transitions to longer strings
7dfd8446
KW
197 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
198 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
199 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
200 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
5f8a3d1d
KW
201 0x1FFFE => (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
202 0x1FFFF => (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
7dfd8446
KW
203 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
204 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
5f8a3d1d
KW
205 0x2FFFE => (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
206 0x2FFFF => (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
7dfd8446
KW
207 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
208 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
5f8a3d1d 209 0x3FFFE => (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
7dfd8446
KW
210 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
211 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
212 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
5f8a3d1d
KW
213 0x4FFFE => (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
214 0x4FFFF => (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
7dfd8446
KW
215 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
216 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
5f8a3d1d
KW
217 0x5FFFE => (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
218 0x5FFFF => (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
7dfd8446
KW
219 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
220 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
5f8a3d1d
KW
221 0x6FFFE => (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
222 0x6FFFF => (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
7dfd8446
KW
223 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
224 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
5f8a3d1d
KW
225 0x7FFFE => (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
226 0x7FFFF => (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
7dfd8446
KW
227 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
228 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
5f8a3d1d
KW
229 0x8FFFE => (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
230 0x8FFFF => (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
7dfd8446
KW
231 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
232 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
5f8a3d1d
KW
233 0x9FFFE => (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
234 0x9FFFF => (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
7dfd8446
KW
235 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
236 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
5f8a3d1d
KW
237 0xAFFFE => (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
238 0xAFFFF => (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
7dfd8446
KW
239 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
240 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
5f8a3d1d
KW
241 0xBFFFE => (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
242 0xBFFFF => (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
7dfd8446
KW
243 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
244 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
5f8a3d1d
KW
245 0xCFFFE => (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
246 0xCFFFF => (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
7dfd8446
KW
247 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
248 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
5f8a3d1d
KW
249 0xDFFFE => (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
250 0xDFFFF => (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
7dfd8446
KW
251 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
252 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
5f8a3d1d
KW
253 0xEFFFE => (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
254 0xEFFFF => (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
7dfd8446
KW
255 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
256 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
5f8a3d1d
KW
257 0xFFFFE => (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
258 0xFFFFF => (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
7dfd8446
KW
259 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
260 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
5f8a3d1d
KW
261 0x10FFFE => (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
262 0x10FFFF => (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
7dfd8446
KW
263 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
264
265 # Things that would be noncharacters if they were in Unicode, and might be
266 # mistaken, if the C code is bad, to be nonchars
267 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
268 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
269 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
270 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
271
272 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
273 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
274 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
275 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
276 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
277 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
278 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
279 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
280 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
281 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
282 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
c0236afe
KW
283 0x80000000 => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
284 0xFFFFFFFF => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
7dfd8446 285);
4deba822 286
7dfd8446
KW
287if ($is64bit) {
288 no warnings qw(overflow portable);
9d2d0ecd
KW
289 $code_points{0x100000000} = (isASCII)
290 ? "\xfe\x84\x80\x80\x80\x80\x80"
291 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
292 $code_points{0x1000000000 - 1} = (isASCII)
293 ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf"
294 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
295 $code_points{0x1000000000} = (isASCII)
296 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
297 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
298 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII)
299 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
300 : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
d566bd20
KW
301 if (isASCII) { # These could falsely show as overlongs in a naive implementation
302 $code_points{0x40000000000} = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80";
303 $code_points{0x1000000000000} = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80";
304 $code_points{0x40000000000000} = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80";
305 $code_points{0x1000000000000000} = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
306 # overflows
307 #$code_points{0xfoo} = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
308 }
309}
310elsif (! isASCII) { # 32-bit EBCDIC. 64-bit is clearer to handle, so doesn't need this test case
311 no warnings qw(overflow portable);
312 $code_points{0x40000000} = I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0");
7dfd8446 313}
eb83ed87 314
7dfd8446
KW
315# Now add in entries for each of code points 0-255, which require special
316# handling on EBCDIC. Remember the keys are Unicode values, and the values
317# are the native UTF-8. For invariants, the bytes are just the native chr.
318
319my $cp = 0;
320while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
321 # invariant
322 $code_points{$cp} = chr utf8::unicode_to_native($cp);
323 $cp++;
324}
325
326# Done with the invariants. Now do the variants. All in this range are 2
327# byte. Again, we can't use the internal functions to generate UTF-8, as
328# those are what we are trying to test. In the loop, we know what range the
329# continuation bytes can be in, and what the lowest start byte can be. So we
330# cycle through them.
331
332my $first_continuation = (isASCII) ? 0x80 : 0xA0;
333my $final_continuation = 0xBF;
334my $start = (isASCII) ? 0xC2 : 0xC5;
335
336my $continuation = $first_continuation - 1;
337
338while ($cp < 255) {
339 if (++$continuation > $final_continuation) {
340
341 # Wrap to the next start byte when we reach the final continuation
342 # byte possible
343 $continuation = $first_continuation;
344 $start++;
345 }
346 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
347
348 $cp++;
349}
eb83ed87
KW
350
351my @warnings;
352
353use warnings 'utf8';
354local $SIG{__WARN__} = sub { push @warnings, @_ };
355
7dfd8446
KW
356# This set of tests looks for basic sanity, and lastly tests the bottom level
357# decode routine for the given code point. If the earlier tests for that code
358# point fail, that one probably will too. Malformations are tested in later
359# segments of code.
360for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
361 keys %code_points)
362{
363 my $hex_u = sprintf("0x%02X", $u);
364 my $n = utf8::unicode_to_native($u);
365 my $hex_n = sprintf("0x%02X", $n);
366 my $bytes = $code_points{$u};
367
368 my $offskip_should_be;
369 {
370 no warnings qw(overflow portable);
371 $offskip_should_be = (isASCII)
372 ? ( $u < 0x80 ? 1 :
373 $u < 0x800 ? 2 :
374 $u < 0x10000 ? 3 :
375 $u < 0x200000 ? 4 :
376 $u < 0x4000000 ? 5 :
377 $u < 0x80000000 ? 6 : (($is64bit)
378 ? ($u < 0x1000000000 ? 7 : 13)
379 : 7)
380 )
381 : ($u < 0xA0 ? 1 :
382 $u < 0x400 ? 2 :
383 $u < 0x4000 ? 3 :
384 $u < 0x40000 ? 4 :
385 $u < 0x400000 ? 5 :
c0236afe
KW
386 $u < 0x4000000 ? 6 :
387 $u < 0x40000000 ? 7 : 14 );
7dfd8446
KW
388 }
389
390 # If this test fails, subsequent ones are meaningless.
391 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
392 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
393 my $invariant = $offskip_should_be == 1;
394 my $display_invariant = $invariant || 0;
395 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
396 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
397
398 my $uvchr_skip_should_be = $offskip_should_be;
399 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
400 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
401 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
402 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
403
404 my $n_chr = chr $n;
405 utf8::upgrade $n_chr;
406
407 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
408 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
409
410 use bytes;
3d56ecbe
KW
411 my $byte_length = length $n_chr;
412 for (my $j = 0; $j < $byte_length; $j++) {
413 undef @warnings;
414
415 if ($j == $byte_length - 1) {
416 my $ret = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0);
417 is($ret, 0, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($n_chr) . ") returns 0 for full character");
418 }
419 else {
420 my $bytes_so_far = substr($n_chr, 0, $j + 1);
421 my $ret = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0);
422 is($ret, 1, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($bytes_so_far) . ") returns 1");
423 }
424
425 unless (is(scalar @warnings, 0,
426 " Verify is_utf8_valid_partial_char_flags generated no warnings"))
427 {
428 diag "The warnings were: " . join(", ", @warnings);
429 }
430
7dfd8446
KW
431 my $b = substr($n_chr, $j, 1);
432 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
433
434 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
435 my $display_byte_invariant = $byte_invariant || 0;
436 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
437 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
438 . "is $display_byte_invariant");
439
440 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
441 my $display_is_start = $is_start || 0;
442 next unless is(test_UTF8_IS_START($b), $is_start,
443 " Verify UTF8_IS_START($hex_b) is $display_is_start");
444
445 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
446 my $display_is_continuation = $is_continuation || 0;
447 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
448 " Verify UTF8_IS_CONTINUATION($hex_b) is "
449 . "$display_is_continuation");
450
451 my $is_continued = $uvchr_skip_should_be > 1;
452 my $display_is_continued = $is_continued || 0;
453 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
454 " Verify UTF8_IS_CONTINUED($hex_b) is "
455 . "$display_is_continued");
456
457 my $is_downgradeable_start = $n < 256
458 && $uvchr_skip_should_be > 1
459 && $j == 0;
460 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
461 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
462 $is_downgradeable_start,
463 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
464 . "$display_is_downgradeable_start");
465
466 my $is_above_latin1 = $n > 255 && $j == 0;
467 my $display_is_above_latin1 = $is_above_latin1 || 0;
468 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
469 $is_above_latin1,
470 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
471 . "$display_is_above_latin1");
472
473 my $is_possibly_problematic = $j == 0
474 && $n >= ((isASCII)
475 ? 0xD000
476 : 0x8000);
477 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
478 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
479 $is_possibly_problematic,
480 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
481 . "$display_is_above_latin1");
482 }
483
484 # We are not trying to look for warnings, etc, so if they should occur, it
485 # is an error. But some of the code points here do cause warnings, so we
486 # check here and turn off the ones that apply to such code points. A
487 # later section of the code tests for these kinds of things.
046d01eb 488 my $this_utf8_flags = $look_for_everything_utf8n_to;
7dfd8446 489 my $len = length $bytes;
e23e8bc1
KW
490
491 my $valid_under_strict = 1;
a82be82b 492 my $valid_under_c9strict = 1;
7dfd8446
KW
493 if ($n > 0x10FFFF) {
494 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
e23e8bc1 495 $valid_under_strict = 0;
a82be82b 496 $valid_under_c9strict = 0;
5f8a3d1d
KW
497 if ($n > 2 ** 31 - 1) {
498 $this_utf8_flags &=
499 ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
500 }
7dfd8446 501 }
5f8a3d1d 502 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
7dfd8446 503 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
e23e8bc1 504 $valid_under_strict = 0;
7dfd8446 505 }
5f8a3d1d
KW
506 elsif ($n >= 0xD800 && $n <= 0xDFFF) {
507 $this_utf8_flags &= ~($UTF8_DISALLOW_SURROGATE|$UTF8_WARN_SURROGATE);
508 $valid_under_c9strict = 0;
509 $valid_under_strict = 0;
510 }
7dfd8446
KW
511
512 undef @warnings;
513
514 my $display_flags = sprintf "0x%x", $this_utf8_flags;
7dfd8446 515 my $display_bytes = display_bytes($bytes);
d7874298 516 my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
7dfd8446 517 is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
9d2d0ecd 518 is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len");
7dfd8446
KW
519
520 unless (is(scalar @warnings, 0,
521 "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
522 {
523 diag "The warnings were: " . join(", ", @warnings);
524 }
046d01eb 525
75ffa578
KW
526 undef @warnings;
527
d7874298 528 my $ret = test_isUTF8_CHAR($bytes, $len);
9d2d0ecd 529 is($ret, $len, "Verify isUTF8_CHAR($display_bytes) returns expected length: $len");
d7874298
KW
530
531 unless (is(scalar @warnings, 0,
532 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
533 {
534 diag "The warnings were: " . join(", ", @warnings);
535 }
536
537 undef @warnings;
538
539 $ret = test_isUTF8_CHAR($bytes, $len - 1);
540 is($ret, 0, "Verify isUTF8_CHAR() with too short length parameter returns 0");
541
542 unless (is(scalar @warnings, 0,
543 "Verify isUTF8_CHAR() generated no warnings"))
544 {
545 diag "The warnings were: " . join(", ", @warnings);
546 }
547
548 undef @warnings;
549
25e3a4e0
KW
550 $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
551 is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
552
553 unless (is(scalar @warnings, 0,
554 "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
555 {
556 diag "The warnings were: " . join(", ", @warnings);
557 }
558
559 undef @warnings;
560
561 $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
562 is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
563
564 unless (is(scalar @warnings, 0,
565 "Verify isUTF8_CHAR_flags() generated no warnings"))
566 {
567 diag "The warnings were: " . join(", ", @warnings);
568 }
569
570 undef @warnings;
571
e23e8bc1
KW
572 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
573 my $expected_len = ($valid_under_strict) ? $len : 0;
574 is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
575
576 unless (is(scalar @warnings, 0,
577 "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
578 {
579 diag "The warnings were: " . join(", ", @warnings);
580 }
581
582 undef @warnings;
583
584 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
585 is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
586
587 unless (is(scalar @warnings, 0,
588 "Verify isSTRICT_UTF8_CHAR() generated no warnings"))
589 {
590 diag "The warnings were: " . join(", ", @warnings);
591 }
592
25e3a4e0
KW
593 undef @warnings;
594
595 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
596 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
597
598 unless (is(scalar @warnings, 0,
599 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
600 {
601 diag "The warnings were: " . join(", ", @warnings);
602 }
603
604 undef @warnings;
605
a82be82b
KW
606 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
607 $expected_len = ($valid_under_c9strict) ? $len : 0;
608 is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
609
610 unless (is(scalar @warnings, 0,
611 "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
612 {
613 diag "The warnings were: " . join(", ", @warnings);
614 }
615
616 undef @warnings;
617
618 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
619 is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
620
621 unless (is(scalar @warnings, 0,
622 "Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
623 {
624 diag "The warnings were: " . join(", ", @warnings);
625 }
626
e23e8bc1
KW
627 undef @warnings;
628
25e3a4e0
KW
629 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
630 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
631
632 unless (is(scalar @warnings, 0,
633 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
634 {
635 diag "The warnings were: " . join(", ", @warnings);
636 }
637
638 undef @warnings;
639
75ffa578
KW
640 $ret_ref = test_valid_utf8_to_uvchr($bytes);
641 is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
9d2d0ecd 642 is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
75ffa578
KW
643
644 unless (is(scalar @warnings, 0,
645 "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings"))
646 {
647 diag "The warnings were: " . join(", ", @warnings);
648 }
649
046d01eb
KW
650 # Similarly for uvchr_to_utf8
651 my $this_uvchr_flags = $look_for_everything_uvchr_to;
652 if ($n > 2 ** 31 - 1) {
653 $this_uvchr_flags &=
654 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
655 }
656 if ($n > 0x10FFFF) {
657 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
658 }
5f8a3d1d 659 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
046d01eb
KW
660 $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
661 }
5f8a3d1d
KW
662 elsif ($n >= 0xD800 && $n <= 0xDFFF) {
663 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SURROGATE|$UNICODE_WARN_SURROGATE);
664 }
046d01eb
KW
665 $display_flags = sprintf "0x%x", $this_uvchr_flags;
666
667 undef @warnings;
668
d7874298 669 $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
046d01eb
KW
670 ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
671 is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
672
673 unless (is(scalar @warnings, 0,
674 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
675 {
676 diag "The warnings were: " . join(", ", @warnings);
677 }
7dfd8446
KW
678}
679
680my $REPLACEMENT = 0xFFFD;
681
682# Now test the malformations. All these raise category utf8 warnings.
152c1f4b 683my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
7dfd8446 684my @malformations = (
eb83ed87
KW
685 [ "zero length string malformation", "", 0,
686 $UTF8_ALLOW_EMPTY, 0, 0,
687 qr/empty string/
688 ],
152c1f4b 689 [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
7dfd8446 690 2,
eb83ed87
KW
691 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
692 qr/unexpected continuation byte/
693 ],
7dfd8446 694 [ "premature next character malformation (immediate)",
9d2d0ecd
KW
695 (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
696 3,
eb83ed87
KW
697 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
698 qr/unexpected non-continuation byte.*immediately after start byte/
699 ],
7dfd8446 700 [ "premature next character malformation (non-immediate)",
152c1f4b 701 I8_to_native("\xf0${I8c}a"),
7dfd8446 702 3,
eb83ed87
KW
703 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
704 qr/unexpected non-continuation byte .* 2 bytes after start byte/
705 ],
152c1f4b 706 [ "too short malformation", I8_to_native("\xf0${I8c}a"), 2,
eb83ed87
KW
707 # Having the 'a' after this, but saying there are only 2 bytes also
708 # tests that we pay attention to the passed in length
709 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
710 qr/2 bytes, need 4/
711 ],
d566bd20
KW
712 [ "overlong malformation, lowest 2-byte",
713 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
714 2,
7dfd8446
KW
715 $UTF8_ALLOW_LONG,
716 0, # NUL
717 2,
eb83ed87 718 qr/2 bytes, need 1/
c0236afe 719 ],
d566bd20
KW
720 [ "overlong malformation, highest 2-byte",
721 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
722 2,
723 $UTF8_ALLOW_LONG,
724 (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
725 2,
726 qr/2 bytes, need 1/
727 ],
728 [ "overlong malformation, lowest 3-byte",
729 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
730 3,
731 $UTF8_ALLOW_LONG,
732 0, # NUL
733 3,
734 qr/3 bytes, need 1/
735 ],
736 [ "overlong malformation, highest 3-byte",
737 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
738 3,
739 $UTF8_ALLOW_LONG,
740 (isASCII) ? 0x7FF : 0x3FF,
741 3,
742 qr/3 bytes, need 2/
743 ],
744 [ "overlong malformation, lowest 4-byte",
745 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
746 4,
747 $UTF8_ALLOW_LONG,
748 0, # NUL
749 4,
750 qr/4 bytes, need 1/
751 ],
752 [ "overlong malformation, highest 4-byte",
753 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
754 4,
755 $UTF8_ALLOW_LONG,
756 (isASCII) ? 0xFFFF : 0x3FFF,
757 4,
758 qr/4 bytes, need 3/
759 ],
760 [ "overlong malformation, lowest 5-byte",
761 (isASCII)
762 ? "\xf8\x80\x80\x80\x80"
763 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
764 5,
765 $UTF8_ALLOW_LONG,
766 0, # NUL
767 5,
768 qr/5 bytes, need 1/
769 ],
770 [ "overlong malformation, highest 5-byte",
771 (isASCII)
772 ? "\xf8\x87\xbf\xbf\xbf"
773 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
774 5,
775 $UTF8_ALLOW_LONG,
776 (isASCII) ? 0x1FFFFF : 0x3FFFF,
777 5,
778 qr/5 bytes, need 4/
779 ],
780 [ "overlong malformation, lowest 6-byte",
781 (isASCII)
782 ? "\xfc\x80\x80\x80\x80\x80"
783 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
784 6,
785 $UTF8_ALLOW_LONG,
786 0, # NUL
787 6,
788 qr/6 bytes, need 1/
789 ],
790 [ "overlong malformation, highest 6-byte",
791 (isASCII)
792 ? "\xfc\x83\xbf\xbf\xbf\xbf"
793 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
794 6,
795 $UTF8_ALLOW_LONG,
796 (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
797 6,
798 qr/6 bytes, need 5/
799 ],
800 [ "overlong malformation, lowest 7-byte",
801 (isASCII)
802 ? "\xfe\x80\x80\x80\x80\x80\x80"
803 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
804 7,
805 $UTF8_ALLOW_LONG,
806 0, # NUL
807 7,
808 qr/7 bytes, need 1/
809 ],
810 [ "overlong malformation, highest 7-byte",
811 (isASCII)
812 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
813 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
814 7,
815 $UTF8_ALLOW_LONG,
816 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
817 7,
818 qr/7 bytes, need 6/
c0236afe
KW
819 ],
820);
7dfd8446 821
d566bd20
KW
822if (isASCII && ! $is64bit) { # 32-bit ASCII platform
823 no warnings 'portable';
824 push @malformations,
825 [ "overflow malformation",
826 "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32
827 7,
828 0, # There is no way to allow this malformation
829 $REPLACEMENT,
830 7,
831 qr/overflow/
832 ],
833 [ "overflow malformation, can tell on first byte",
834 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
835 13,
836 0, # There is no way to allow this malformation
837 $REPLACEMENT,
838 13,
839 qr/overflow/
840 ];
841}
842else {
843 # On EBCDIC platforms, another overlong test is needed even on 32-bit
844 # systems, whereas it doesn't happen on ASCII except on 64-bit ones.
845
846 no warnings 'portable';
847 no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles
848 push @malformations,
849 [ "overlong malformation, lowest max-byte",
850 (isASCII)
851 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
852 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
853 (isASCII) ? 13 : 14,
854 $UTF8_ALLOW_LONG,
855 0, # NUL
856 (isASCII) ? 13 : 14,
857 qr/1[34] bytes, need 1/, # 1[34] to work on either ASCII or EBCDIC
858 ],
859 [ "overlong malformation, highest max-byte",
860 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
861 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
862 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
863 (isASCII) ? 13 : 14,
864 $UTF8_ALLOW_LONG,
865 (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
866 (isASCII) ? 13 : 14,
867 qr/1[34] bytes, need 7/,
868 ];
869
870 if (! $is64bit) { # 32-bit EBCDIC
871 push @malformations,
872 [ "overflow malformation",
873 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
874 14,
875 0, # There is no way to allow this malformation
876 $REPLACEMENT,
877 14,
878 qr/overflow/
879 ];
880 }
881 else { # 64-bit
882 push @malformations,
883 [ "overflow malformation",
884 (isASCII)
885 ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
886 : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
887 (isASCII) ? 13 : 14,
888 0, # There is no way to allow this malformation
889 $REPLACEMENT,
890 (isASCII) ? 13 : 14,
891 qr/overflow/
892 ];
893 }
894}
895
7dfd8446 896foreach my $test (@malformations) {
eb83ed87
KW
897 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
898
899 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
900
d7874298
KW
901 undef @warnings;
902
903 my $ret = test_isUTF8_CHAR($bytes, $length);
904 is($ret, 0, "$testname: isUTF8_CHAR returns 0");
905 unless (is(scalar @warnings, 0,
906 "$testname: isUTF8_CHAR() generated no warnings"))
907 {
908 diag "The warnings were: " . join(", ", @warnings);
909 }
910
25e3a4e0
KW
911 undef @warnings;
912
913 $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
914 is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
915 unless (is(scalar @warnings, 0,
916 "$testname: isUTF8_CHAR() generated no warnings"))
917 {
918 diag "The warnings were: " . join(", ", @warnings);
919 }
920
e23e8bc1
KW
921 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
922 is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
923 unless (is(scalar @warnings, 0,
924 "$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
925 {
926 diag "The warnings were: " . join(", ", @warnings);
927 }
928
a82be82b
KW
929 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
930 is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
931 unless (is(scalar @warnings, 0,
932 "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
933 {
934 diag "The warnings were: " . join(", ", @warnings);
935 }
936
3d56ecbe
KW
937 for my $j (1 .. $length - 1) {
938 my $partial = substr($bytes, 0, $j);
939
940 undef @warnings;
941
942 $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0);
943 my $ret_should_be = 0;
944 my $comment = "";
945 if ($testname =~ /premature|short/ && $j < 2) {
946 $ret_should_be = 1;
947 $comment = ", but need 2 bytes to discern:";
948 }
949 elsif ($testname =~ /overlong/ && $length > 2) {
950 if ($length <= 7 && $j < 2) {
951 $ret_should_be = 1;
952 $comment = ", but need 2 bytes to discern:";
953 }
954 elsif ($length > 7 && $j < 7) {
955 $ret_should_be = 1;
956 $comment = ", but need 7 bytes to discern:";
957 }
958 }
959 elsif ($testname =~ /overflow/ && $testname !~ /first byte/) {
960 if (isASCII) {
961 if ($j < (($is64bit) ? 3 : 2)) {
962 $comment = ", but need $j bytes to discern:";
963 $ret_should_be = 1;
964 }
965 }
966 else {
967 if ($j < (($is64bit) ? 2 : 8)) {
968 $comment = ", but need $j bytes to discern:";
969 $ret_should_be = 1;
970 }
971 }
972 }
973 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
974 . display_bytes($partial)
975 . ")$comment returns $ret_should_be");
976 unless (is(scalar @warnings, 0,
977 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
978 {
979 diag "The warnings were: " . join(", ", @warnings);
980 }
981 }
982
d7874298 983
eb83ed87
KW
984 # Test what happens when this malformation is not allowed
985 undef @warnings;
bd70aaaf 986 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87 987 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
9d2d0ecd 988 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: Returns expected length: $expected_len");
eb83ed87
KW
989 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
990 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
991 }
992 else {
993 if (scalar @warnings) {
7dfd8446 994 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
995 }
996 }
997
998 { # Next test when disallowed, and warnings are off.
999 undef @warnings;
1000 no warnings 'utf8';
bd70aaaf 1001 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
9d2d0ecd
KW
1002 is($ret_ref->[0], 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns 0");
1003 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns expected length: $expected_len");
1004 if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': no warnings generated")) {
7dfd8446 1005 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
1006 }
1007 }
1008
1009 # Test with CHECK_ONLY
1010 undef @warnings;
bd70aaaf 1011 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87 1012 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
9d2d0ecd 1013 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
eb83ed87 1014 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
7dfd8446 1015 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
1016 }
1017
1018 next if $allow_flags == 0; # Skip if can't allow this malformation
1019
1020 # Test when the malformation is allowed
1021 undef @warnings;
bd70aaaf 1022 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
9d2d0ecd
KW
1023 is($ret_ref->[0], $allowed_uv, "$testname: utf8n_to_uvchr(), allowed: Returns expected uv: " . sprintf("0x%04X", $allowed_uv));
1024 is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), allowed: Returns expected length: $expected_len");
1025 if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), allowed: no warnings generated"))
eb83ed87 1026 {
7dfd8446 1027 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
1028 }
1029}
1030
eb83ed87
KW
1031# Now test the cases where a legal code point is generated, but may or may not
1032# be allowed/warned on.
2f8f112e 1033my @tests = (
7dfd8446
KW
1034 [ "lowest surrogate",
1035 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
1036 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1037 'surrogate', 0xD800,
1038 (isASCII) ? 3 : 4,
1039 qr/surrogate/
1040 ],
1041 [ "a middle surrogate",
1042 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
1043 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1044 'surrogate', 0xD90D,
1045 (isASCII) ? 3 : 4,
eb83ed87
KW
1046 qr/surrogate/
1047 ],
7dfd8446
KW
1048 [ "highest surrogate",
1049 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
1050 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1051 'surrogate', 0xDFFF,
1052 (isASCII) ? 3 : 4,
1053 qr/surrogate/
1054 ],
1055 [ "first non_unicode",
1056 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
1057 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
1058 'non_unicode', 0x110000,
1059 (isASCII) ? 4 : 5,
1060 qr/not Unicode.* may not be portable/
1061 ],
d566bd20
KW
1062 [ "non_unicode whose first byte tells that",
1063 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
1064 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
1065 'non_unicode',
1066 (isASCII) ? 0x140000 : 0x200000,
1067 (isASCII) ? 4 : 5,
1068 qr/not Unicode.* may not be portable/
1069 ],
7dfd8446
KW
1070 [ "first of 32 consecutive non-character code points",
1071 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
1072 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1073 'nonchar', 0xFDD0,
1074 (isASCII) ? 3 : 4,
1075 qr/Unicode non-character.*is not recommended for open interchange/
1076 ],
1077 [ "a mid non-character code point of the 32 consecutive ones",
1078 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
1079 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1080 'nonchar', 0xFDE0,
1081 (isASCII) ? 3 : 4,
1082 qr/Unicode non-character.*is not recommended for open interchange/
1083 ],
1084 [ "final of 32 consecutive non-character code points",
1085 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
1086 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1087 'nonchar', 0xFDEF,
1088 (isASCII) ? 3 : 4,
1089 qr/Unicode non-character.*is not recommended for open interchange/
1090 ],
1091 [ "non-character code point U+FFFE",
1092 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
1093 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1094 'nonchar', 0xFFFE,
1095 (isASCII) ? 3 : 4,
1096 qr/Unicode non-character.*is not recommended for open interchange/
1097 ],
1098 [ "non-character code point U+FFFF",
1099 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
1100 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1101 'nonchar', 0xFFFF,
1102 (isASCII) ? 3 : 4,
1103 qr/Unicode non-character.*is not recommended for open interchange/
1104 ],
1105 [ "non-character code point U+1FFFE",
1106 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
1107 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1108 'nonchar', 0x1FFFE, 4,
1109 qr/Unicode non-character.*is not recommended for open interchange/
1110 ],
1111 [ "non-character code point U+1FFFF",
1112 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
1113 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1114 'nonchar', 0x1FFFF, 4,
1115 qr/Unicode non-character.*is not recommended for open interchange/
1116 ],
1117 [ "non-character code point U+2FFFE",
1118 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
1119 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1120 'nonchar', 0x2FFFE, 4,
1121 qr/Unicode non-character.*is not recommended for open interchange/
1122 ],
1123 [ "non-character code point U+2FFFF",
1124 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
1125 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1126 'nonchar', 0x2FFFF, 4,
1127 qr/Unicode non-character.*is not recommended for open interchange/
1128 ],
1129 [ "non-character code point U+3FFFE",
1130 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
1131 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1132 'nonchar', 0x3FFFE, 4,
1133 qr/Unicode non-character.*is not recommended for open interchange/
1134 ],
1135 [ "non-character code point U+3FFFF",
1136 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
1137 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1138 'nonchar', 0x3FFFF, 4,
1139 qr/Unicode non-character.*is not recommended for open interchange/
1140 ],
1141 [ "non-character code point U+4FFFE",
1142 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
1143 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1144 'nonchar', 0x4FFFE,
1145 (isASCII) ? 4 : 5,
1146 qr/Unicode non-character.*is not recommended for open interchange/
1147 ],
1148 [ "non-character code point U+4FFFF",
1149 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
1150 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1151 'nonchar', 0x4FFFF,
1152 (isASCII) ? 4 : 5,
1153 qr/Unicode non-character.*is not recommended for open interchange/
1154 ],
1155 [ "non-character code point U+5FFFE",
1156 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
1157 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1158 'nonchar', 0x5FFFE,
1159 (isASCII) ? 4 : 5,
1160 qr/Unicode non-character.*is not recommended for open interchange/
1161 ],
1162 [ "non-character code point U+5FFFF",
1163 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
1164 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1165 'nonchar', 0x5FFFF,
1166 (isASCII) ? 4 : 5,
1167 qr/Unicode non-character.*is not recommended for open interchange/
1168 ],
1169 [ "non-character code point U+6FFFE",
1170 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
1171 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1172 'nonchar', 0x6FFFE,
1173 (isASCII) ? 4 : 5,
1174 qr/Unicode non-character.*is not recommended for open interchange/
1175 ],
1176 [ "non-character code point U+6FFFF",
1177 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
1178 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1179 'nonchar', 0x6FFFF,
1180 (isASCII) ? 4 : 5,
1181 qr/Unicode non-character.*is not recommended for open interchange/
1182 ],
1183 [ "non-character code point U+7FFFE",
1184 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
1185 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1186 'nonchar', 0x7FFFE,
1187 (isASCII) ? 4 : 5,
1188 qr/Unicode non-character.*is not recommended for open interchange/
1189 ],
1190 [ "non-character code point U+7FFFF",
1191 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
1192 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1193 'nonchar', 0x7FFFF,
1194 (isASCII) ? 4 : 5,
1195 qr/Unicode non-character.*is not recommended for open interchange/
1196 ],
1197 [ "non-character code point U+8FFFE",
1198 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
1199 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1200 'nonchar', 0x8FFFE,
1201 (isASCII) ? 4 : 5,
1202 qr/Unicode non-character.*is not recommended for open interchange/
1203 ],
1204 [ "non-character code point U+8FFFF",
1205 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
1206 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1207 'nonchar', 0x8FFFF,
1208 (isASCII) ? 4 : 5,
1209 qr/Unicode non-character.*is not recommended for open interchange/
1210 ],
1211 [ "non-character code point U+9FFFE",
1212 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
1213 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1214 'nonchar', 0x9FFFE,
1215 (isASCII) ? 4 : 5,
1216 qr/Unicode non-character.*is not recommended for open interchange/
1217 ],
1218 [ "non-character code point U+9FFFF",
1219 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
1220 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1221 'nonchar', 0x9FFFF,
1222 (isASCII) ? 4 : 5,
1223 qr/Unicode non-character.*is not recommended for open interchange/
1224 ],
1225 [ "non-character code point U+AFFFE",
1226 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
1227 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1228 'nonchar', 0xAFFFE,
1229 (isASCII) ? 4 : 5,
1230 qr/Unicode non-character.*is not recommended for open interchange/
1231 ],
1232 [ "non-character code point U+AFFFF",
1233 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
1234 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1235 'nonchar', 0xAFFFF,
1236 (isASCII) ? 4 : 5,
1237 qr/Unicode non-character.*is not recommended for open interchange/
1238 ],
1239 [ "non-character code point U+BFFFE",
1240 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
1241 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1242 'nonchar', 0xBFFFE,
1243 (isASCII) ? 4 : 5,
1244 qr/Unicode non-character.*is not recommended for open interchange/
1245 ],
1246 [ "non-character code point U+BFFFF",
1247 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
1248 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1249 'nonchar', 0xBFFFF,
1250 (isASCII) ? 4 : 5,
1251 qr/Unicode non-character.*is not recommended for open interchange/
1252 ],
1253 [ "non-character code point U+CFFFE",
1254 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
1255 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1256 'nonchar', 0xCFFFE,
1257 (isASCII) ? 4 : 5,
1258 qr/Unicode non-character.*is not recommended for open interchange/
1259 ],
1260 [ "non-character code point U+CFFFF",
1261 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
1262 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1263 'nonchar', 0xCFFFF,
1264 (isASCII) ? 4 : 5,
1265 qr/Unicode non-character.*is not recommended for open interchange/
1266 ],
1267 [ "non-character code point U+DFFFE",
1268 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
1269 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1270 'nonchar', 0xDFFFE,
1271 (isASCII) ? 4 : 5,
1272 qr/Unicode non-character.*is not recommended for open interchange/
1273 ],
1274 [ "non-character code point U+DFFFF",
1275 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
1276 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1277 'nonchar', 0xDFFFF,
1278 (isASCII) ? 4 : 5,
1279 qr/Unicode non-character.*is not recommended for open interchange/
1280 ],
1281 [ "non-character code point U+EFFFE",
1282 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
1283 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1284 'nonchar', 0xEFFFE,
1285 (isASCII) ? 4 : 5,
1286 qr/Unicode non-character.*is not recommended for open interchange/
1287 ],
1288 [ "non-character code point U+EFFFF",
1289 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
1290 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1291 'nonchar', 0xEFFFF,
1292 (isASCII) ? 4 : 5,
1293 qr/Unicode non-character.*is not recommended for open interchange/
1294 ],
1295 [ "non-character code point U+FFFFE",
1296 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
1297 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1298 'nonchar', 0xFFFFE,
1299 (isASCII) ? 4 : 5,
1300 qr/Unicode non-character.*is not recommended for open interchange/
1301 ],
1302 [ "non-character code point U+FFFFF",
1303 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
1304 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1305 'nonchar', 0xFFFFF,
1306 (isASCII) ? 4 : 5,
1307 qr/Unicode non-character.*is not recommended for open interchange/
1308 ],
1309 [ "non-character code point U+10FFFE",
1310 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
1311 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1312 'nonchar', 0x10FFFE,
1313 (isASCII) ? 4 : 5,
1314 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1315 ],
7dfd8446
KW
1316 [ "non-character code point U+10FFFF",
1317 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
1318 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1319 'nonchar', 0x10FFFF,
1320 (isASCII) ? 4 : 5,
ba707cdc 1321 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1322 ],
1d1c12d9 1323 [ "requires at least 32 bits",
c0236afe
KW
1324 (isASCII)
1325 ? "\xfe\x82\x80\x80\x80\x80\x80"
1326 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
eb83ed87 1327 # This code point is chosen so that it is representable in a UV on
2f8f112e 1328 # 32-bit machines
1d1c12d9 1329 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 1330 'utf8', 0x80000000, (isASCII) ? 7 :14,
7dfd8446
KW
1331 qr/Code point 0x80000000 is not Unicode, and not portable/
1332 ],
1d1c12d9 1333 [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
c0236afe
KW
1334 (isASCII)
1335 ? "\xfe\x82\x80\x80\x80\x80\x80"
1336 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
7dfd8446 1337 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
c0236afe 1338 'utf8', 0x80000000, (isASCII) ? 7 :14,
ea5ced44 1339 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 1340 ],
1d1c12d9
KW
1341 [ "overflow with warnings/disallow for more than 31 bits",
1342 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
1343 # with overflow. The overflow malformation is never allowed, so
1344 # preventing it takes precedence if the ABOVE_31_BIT options would
c0236afe 1345 # otherwise allow in an overflowing value. The ASCII code points (1
1d1c12d9
KW
1346 # for 32-bits; 1 for 64) were chosen because the old overflow
1347 # detection algorithm did not catch them; this means this test also
c0236afe
KW
1348 # checks for that fix. The EBCDIC are arbitrary overflowing ones
1349 # since we have no reports of failures with it.
1350 (($is64bit)
1351 ? ((isASCII)
9d2d0ecd 1352 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
c0236afe
KW
1353 : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
1354 : ((isASCII)
9d2d0ecd 1355 ? "\xfe\x86\x80\x80\x80\x80\x80"
c0236afe 1356 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
ea5ced44 1357
1d1c12d9
KW
1358 # We include both warning categories to make sure the ABOVE_31_BIT one
1359 # has precedence
1360 "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
1361 "$UTF8_DISALLOW_ABOVE_31_BIT",
7dfd8446 1362 'utf8', 0,
c0236afe 1363 (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
ea5ced44 1364 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 1365 ],
c0236afe 1366);
2f8f112e 1367
c0236afe
KW
1368if ($is64bit) {
1369 no warnings qw{portable overflow};
2f8f112e 1370 push @tests,
c0236afe
KW
1371 [ "More than 32 bits",
1372 (isASCII)
9d2d0ecd 1373 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
c0236afe 1374 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1d1c12d9 1375 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 1376 'utf8', 0x1000000000, (isASCII) ? 13 : 14,
ea5ced44 1377 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e 1378 ];
83dc0f42
KW
1379 if (! isASCII) {
1380 push @tests, # These could falsely show wrongly in a naive implementation
1381 [ "requires at least 32 bits",
1382 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1383 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1384 'utf8', 0x800000000, 14,
1385 qr/Code point 0x800000000 is not Unicode, and not portable/
1386 ],
1387 [ "requires at least 32 bits",
1388 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1389 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1390 'utf8', 0x10000000000, 14,
1391 qr/Code point 0x10000000000 is not Unicode, and not portable/
1392 ],
1393 [ "requires at least 32 bits",
1394 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1395 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1396 'utf8', 0x200000000000, 14,
1397 qr/Code point 0x200000000000 is not Unicode, and not portable/
1398 ],
1399 [ "requires at least 32 bits",
1400 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1401 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1402 'utf8', 0x4000000000000, 14,
1403 qr/Code point 0x4000000000000 is not Unicode, and not portable/
1404 ],
1405 [ "requires at least 32 bits",
1406 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1407 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1408 'utf8', 0x80000000000000, 14,
1409 qr/Code point 0x80000000000000 is not Unicode, and not portable/
1410 ],
1411 [ "requires at least 32 bits",
1412 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1413 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1414 'utf8', 0x1000000000000000, 14,
1415 qr/Code point 0x1000000000000000 is not Unicode, and not portable/
1416 ];
1417 }
2f8f112e
KW
1418}
1419
1420foreach my $test (@tests) {
eb83ed87
KW
1421 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
1422
1423 my $length = length $bytes;
2f8f112e 1424 my $will_overflow = $testname =~ /overflow/;
eb83ed87 1425
d7874298
KW
1426 {
1427 use warnings;
1428 undef @warnings;
1429 my $ret = test_isUTF8_CHAR($bytes, $length);
25e3a4e0 1430 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
d7874298
KW
1431 if ($will_overflow) {
1432 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
25e3a4e0 1433 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
d7874298
KW
1434 }
1435 else {
1436 is($ret, $length,
1437 "isUTF8_CHAR() $testname: returns expected length: $length");
25e3a4e0
KW
1438 is($ret_flags, $length,
1439 "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
d7874298
KW
1440 }
1441 unless (is(scalar @warnings, 0,
25e3a4e0 1442 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
d7874298
KW
1443 {
1444 diag "The warnings were: " . join(", ", @warnings);
1445 }
3d56ecbe 1446
e23e8bc1
KW
1447 undef @warnings;
1448 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1449 if ($will_overflow) {
1450 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
1451 }
1452 else {
1453 my $expected_ret = ( $testname =~ /surrogate|non-character/
1454 || $allowed_uv > 0x10FFFF)
1455 ? 0
1456 : $length;
1457 is($ret, $expected_ret,
1458 "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1459 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1460 $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1461 is($ret, $expected_ret,
1462 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
e23e8bc1
KW
1463 }
1464 unless (is(scalar @warnings, 0,
25e3a4e0 1465 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
e23e8bc1
KW
1466 {
1467 diag "The warnings were: " . join(", ", @warnings);
1468 }
1469
a82be82b
KW
1470 undef @warnings;
1471 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1472 if ($will_overflow) {
1473 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
1474 }
1475 else {
1476 my $expected_ret = ( $testname =~ /surrogate/
1477 || $allowed_uv > 0x10FFFF)
1478 ? 0
1479 : $length;
1480 is($ret, $expected_ret,
1481 "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1482 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1483 $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1484 is($ret, $expected_ret,
1485 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
a82be82b
KW
1486 }
1487 unless (is(scalar @warnings, 0,
25e3a4e0 1488 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
a82be82b
KW
1489 {
1490 diag "The warnings were: " . join(", ", @warnings);
1491 }
1492
3d56ecbe
KW
1493 # Test partial character handling, for each byte not a full character
1494 for my $j (1.. $length - 1) {
1495
1496 # Skip the test for the interaction between overflow and above-31
1497 # bit. It is really testing other things than the partial
1498 # character tests, for which other tests in this file are
1499 # sufficient
1500 last if $testname =~ /overflow/;
1501
1502 foreach my $disallow_flag (0, $disallow_flags) {
1503 my $partial = substr($bytes, 0, $j);
1504 my $ret_should_be;
1505 my $comment;
1506 if ($disallow_flag) {
1507 $ret_should_be = 0;
1508 $comment = "disallowed";
1509 }
1510 else {
1511 $ret_should_be = 1;
1512 $comment = "allowed";
1513 }
1514
1515 if ($disallow_flag) {
1516 if ($testname =~ /non-character/) {
1517 $ret_should_be = 1;
1518 $comment .= ", but but need full char to discern";
1519 }
1520 elsif ($testname =~ /surrogate/) {
1521 if ($j < 2) {
1522 $ret_should_be = 1;
1523 $comment .= ", but need 2 bytes to discern";
1524 }
1525 }
1526 elsif ($testname =~ /first non_unicode/ && $j < 2) {
1527 $ret_should_be = 1;
1528 $comment .= ", but need 2 bytes to discern";
1529 }
1530 }
1531
1532 undef @warnings;
1533
1534 $ret = test_is_utf8_valid_partial_char_flags($partial, $j, $disallow_flag);
1535 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1536 . display_bytes($partial)
1537 . "), $comment: returns $ret_should_be");
1538 unless (is(scalar @warnings, 0,
1539 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1540 {
1541 diag "The warnings were: " . join(", ", @warnings);
1542 }
1543 }
1544 }
d7874298
KW
1545 }
1546
eb83ed87
KW
1547 # This is more complicated than the malformations tested earlier, as there
1548 # are several orthogonal variables involved. We test all the subclasses
1549 # of utf8 warnings to verify they work with and without the utf8 class,
1550 # and don't have effects on other sublass warnings
54f4afef 1551 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
1552 foreach my $warn_flag (0, $warn_flags) {
1553 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 1554 foreach my $do_warning (0, 1) {
eb83ed87 1555
13d7a909
KW
1556 my $eval_warn = $do_warning
1557 ? "use warnings '$warning'"
1558 : $warning eq "utf8"
7dfd8446
KW
1559 ? "no warnings 'utf8'"
1560 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 1561
13d7a909
KW
1562 # is effectively disallowed if will overflow, even if the
1563 # flag indicates it is allowed, fix up test name to
1564 # indicate this as well
1565 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 1566
046d01eb 1567 my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag)
13d7a909
KW
1568 ? 'disallowed'
1569 : ($disallowed)
1d1c12d9 1570 ? 'ABOVE_31_BIT allowed'
13d7a909
KW
1571 : 'allowed');
1572 $this_name .= ", $eval_warn";
1573 $this_name .= ", " . (($warn_flag)
1574 ? 'with warning flag'
1575 : 'no warning flag');
eb83ed87 1576
13d7a909
KW
1577 undef @warnings;
1578 my $ret_ref;
7dfd8446
KW
1579 my $display_bytes = display_bytes($bytes);
1580 my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
1581 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
13d7a909
KW
1582 eval "$eval_text";
1583 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
7dfd8446 1584 diag "\$!='$!'; eval'd=\"$call\"";
13d7a909 1585 next;
54f4afef 1586 }
13d7a909 1587 if ($disallowed) {
7dfd8446
KW
1588 unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
1589 {
1590 diag $call;
1591 }
2f8f112e
KW
1592 }
1593 else {
7dfd8446 1594 unless (is($ret_ref->[0], $allowed_uv,
9d2d0ecd
KW
1595 "$this_name: Returns expected uv: "
1596 . sprintf("0x%04X", $allowed_uv)))
7dfd8446
KW
1597 {
1598 diag $call;
1599 }
1600 }
1601 unless (is($ret_ref->[1], $expected_len,
9d2d0ecd 1602 "$this_name: Returns expected length: $expected_len"))
7dfd8446
KW
1603 {
1604 diag $call;
13d7a909 1605 }
13d7a909
KW
1606
1607 if (! $do_warning
1608 && ($warning eq 'utf8' || $warning eq $category))
1609 {
1610 if (!is(scalar @warnings, 0,
1611 "$this_name: No warnings generated"))
1612 {
7dfd8446
KW
1613 diag $call;
1614 diag "The warnings were: " . join(", ", @warnings);
2f8f112e
KW
1615 }
1616 }
13d7a909
KW
1617 elsif ($will_overflow
1618 && ! $disallow_flag
1619 && $warning eq 'utf8')
1620 {
1621
1622 # Will get the overflow message instead of the expected
1623 # message under these circumstances, as they would
1624 # otherwise accept an overflowed value, which the code
1625 # should not allow, so falls back to overflow.
1626 if (is(scalar @warnings, 1,
1627 "$this_name: Got a single warning "))
1628 {
7dfd8446
KW
1629 unless (like($warnings[0], qr/overflow/,
1630 "$this_name: Got overflow warning"))
1631 {
1632 diag $call;
1633 }
13d7a909
KW
1634 }
1635 else {
7dfd8446 1636 diag $call;
13d7a909 1637 if (scalar @warnings) {
7dfd8446 1638 diag "The warnings were: "
13d7a909
KW
1639 . join(", ", @warnings);
1640 }
1641 }
eb83ed87 1642 }
13d7a909
KW
1643 elsif ($warn_flag
1644 && ($warning eq 'utf8' || $warning eq $category))
1645 {
1646 if (is(scalar @warnings, 1,
1647 "$this_name: Got a single warning "))
1648 {
7dfd8446
KW
1649 unless (like($warnings[0], $message,
1650 "$this_name: Got expected warning"))
1651 {
1652 diag $call;
1653 }
13d7a909
KW
1654 }
1655 else {
7dfd8446 1656 diag $call;
13d7a909 1657 if (scalar @warnings) {
7dfd8446 1658 diag "The warnings were: "
13d7a909
KW
1659 . join(", ", @warnings);
1660 }
eb83ed87
KW
1661 }
1662 }
eb83ed87 1663
13d7a909
KW
1664 # Check CHECK_ONLY results when the input is disallowed. Do
1665 # this when actually disallowed, not just when the
1666 # $disallow_flag is set
1667 if ($disallowed) {
1668 undef @warnings;
1669 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
1670 $disallow_flag|$UTF8_CHECK_ONLY);
7dfd8446
KW
1671 unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
1672 diag $call;
1673 }
1674 unless (is($ret_ref->[1], -1,
9d2d0ecd 1675 "$this_name: CHECK_ONLY: returns -1 for length"))
7dfd8446
KW
1676 {
1677 diag $call;
1678 }
13d7a909
KW
1679 if (! is(scalar @warnings, 0,
1680 "$this_name, CHECK_ONLY: no warnings generated"))
1681 {
7dfd8446
KW
1682 diag $call;
1683 diag "The warnings were: " . join(", ", @warnings);
13d7a909 1684 }
eb83ed87 1685 }
046d01eb
KW
1686
1687 # Now repeat some of the above, but for
1688 # uvchr_to_utf8_flags(). Since this comes from an
1689 # existing code point, it hasn't overflowed.
1690 next if $will_overflow;
1691
1692 # The warning and disallow flags passed in are for
1693 # utf8n_to_uvchr(). Convert them for
1694 # uvchr_to_utf8_flags().
1695 my $uvchr_warn_flag = 0;
1696 my $uvchr_disallow_flag = 0;
1697 if ($warn_flag) {
1698 if ($warn_flag == $UTF8_WARN_SURROGATE) {
1699 $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
1700 }
1701 elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
1702 $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
1703 }
1704 elsif ($warn_flag == $UTF8_WARN_SUPER) {
1705 $uvchr_warn_flag = $UNICODE_WARN_SUPER
1706 }
1d1c12d9 1707 elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
046d01eb
KW
1708 $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT;
1709 }
1710 else {
1711 fail(sprintf "Unexpected warn flag: %x",
1712 $warn_flag);
1713 next;
1714 }
1715 }
1716 if ($disallow_flag) {
1717 if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) {
1718 $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE
1719 }
1720 elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) {
1721 $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR
1722 }
1723 elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
1724 $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER
1725 }
1d1c12d9 1726 elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) {
046d01eb
KW
1727 $uvchr_disallow_flag =
1728 $UNICODE_DISALLOW_ABOVE_31_BIT;
1729 }
1730 else {
1731 fail(sprintf "Unexpected disallow flag: %x",
1732 $disallow_flag);
1733 next;
1734 }
1735 }
1736
1737 $disallowed = $uvchr_disallow_flag;
1738
1739 $this_name = "uvchr_to_utf8_flags() $testname: "
1740 . (($uvchr_disallow_flag)
1741 ? 'disallowed'
1742 : ($disallowed)
1743 ? 'ABOVE_31_BIT allowed'
1744 : 'allowed');
1745 $this_name .= ", $eval_warn";
1746 $this_name .= ", " . (($uvchr_warn_flag)
1747 ? 'with warning flag'
1748 : 'no warning flag');
1749
1750 undef @warnings;
1751 my $ret;
1752 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
1753 my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag;
1754 $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv;
1755 $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)";
1756 eval "$eval_text";
1757 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
1758 diag "\$!='$!'; eval'd=\"$eval_text\"";
1759 next;
1760 }
1761 if ($disallowed) {
1762 unless (is($ret, undef, "$this_name: Returns undef")) {
1763 diag $call;
1764 }
1765 }
1766 else {
1767 unless (is($ret, $bytes, "$this_name: Returns expected string")) {
1768 diag $call;
1769 }
1770 }
1771 if (! $do_warning
1772 && ($warning eq 'utf8' || $warning eq $category))
1773 {
1774 if (!is(scalar @warnings, 0,
1775 "$this_name: No warnings generated"))
1776 {
1777 diag $call;
1778 diag "The warnings were: " . join(", ", @warnings);
1779 }
1780 }
1781 elsif ($uvchr_warn_flag
1782 && ($warning eq 'utf8' || $warning eq $category))
1783 {
1784 if (is(scalar @warnings, 1,
1785 "$this_name: Got a single warning "))
1786 {
046d01eb
KW
1787 unless (like($warnings[0], $message,
1788 "$this_name: Got expected warning"))
1789 {
1790 diag $call;
1791 }
046d01eb
KW
1792 }
1793 else {
1794 diag $call;
1795 if (scalar @warnings) {
1796 diag "The warnings were: "
1797 . join(", ", @warnings);
1798 }
1799 }
1800 }
eb83ed87
KW
1801 }
1802 }
1803 }
1804 }
1805}
6e3d6c02 1806
fed3ba5d 1807done_testing;