This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Backport is_utf8_invariant_string, is_invariant_string
[perl5.git] / dist / Devel-PPPort / parts / inc / utf8
CommitLineData
7899b636
KW
1=provides
2
3__UNDEFINED__
4utf8_to_uvchr_buf
5
6=implementation
7
8#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
9
10__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
11
12__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
13
14__UNDEFINED__ UTF8_ALLOW_ANYUV 0
15__UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001
16__UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002
17__UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
18__UNDEFINED__ UTF8_ALLOW_SHORT 0x0008
19__UNDEFINED__ UTF8_ALLOW_LONG 0x0010
20__UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080
21__UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
22 |UTF8_ALLOW_NON_CONTINUATION \
23 |UTF8_ALLOW_SHORT \
24 |UTF8_ALLOW_LONG \
25 |UTF8_ALLOW_OVERFLOW)
26
27#if defined UTF8SKIP
28
29/* Don't use official version because it uses MIN, which may not be available */
30#undef UTF8_SAFE_SKIP
31
32__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
33 ((((e) - (s)) <= 0) \
34 ? 0 \
35 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
36#endif
37
d2b44e2b
KW
38#ifdef is_ascii_string
39__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
40__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
41
42/* Hint: is_utf8_invariant_string
43 Please use this instead of is_ascii_string or is_invariant_string
44*/
45#endif
46
7899b636
KW
47#if defined(is_utf8_string) && defined(UTF8SKIP)
48__UNDEFINED__ isUTF8_CHAR(s0, e) ( \
49 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
50 ? 0 \
51 : UTF8SKIP(s0))
52#endif
53
8fc7db65
KW
54#if 'A' == 65
55__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
56__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
57#elif '^' == 95
58__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
59__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
60#elif '^' == 176
61__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
62__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
63#else
64# error Unknown character set
65#endif
66
7899b636
KW
67#if { VERSION < 5.31.3 }
68 /* Versions prior to this accepted things that are now considered
69 * malformations, and didn't return -1 on error with warnings enabled
70 * */
71# undef utf8_to_uvchr_buf
72#endif
73
74/* This implementation brings modern, generally more restricted standards to
75 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
76 * be done. But its arguable that the others need not, and hence should not.
77 * The reason they're here is that a module that intends to play with the
78 * latest perls should be able to work the same in all releases. An example is
79 * that perl no longer accepts any UV for a code point, but limits them to
80 * IV_MAX or below. This is for future internal use of the larger code points.
81 * If it turns out that some of these changes are breaking code that isn't
82 * intended to work with modern perls, the tighter restrictions could be
83 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
84
85/* 5.6.0 is the first release with UTF-8, and we don't implement this function
86 * there due to its likely lack of still being in use, and the underlying
87 * implementation is very different from later ones, without the later
88 * safeguards, so would require extra work to deal with */
89#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
90 /* Choose which underlying implementation to use. At least one must be
91 * present or the perl is too early to handle this function */
92# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
93# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
94# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
95# else /* Must be at least 5.6.1 from #if above */
96# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
97# endif
98# endif
99
100# if { NEED utf8_to_uvchr_buf }
101
102UV
103utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
104{
105 UV ret;
106 STRLEN curlen;
107 bool overflows = 0;
108 const U8 *cur_s = s;
109 const bool do_warnings = ckWARN_d(WARN_UTF8);
110# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
111 STRLEN overflow_length = 0;
112# endif
113
114 if (send > s) {
115 curlen = send - s;
116 }
117 else {
118 assert(0); /* Modern perls die under this circumstance */
119 curlen = 0;
120 if (! do_warnings) { /* Handle empty here if no warnings needed */
121 if (retlen) *retlen = 0;
122 return UNICODE_REPLACEMENT;
123 }
124 }
125
126# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
127
128 /* Perl did not properly detect overflow for much of its history on
129 * non-EBCDIC platforms, often returning an overlong value which may or may
130 * not have been tolerated in the call. Also, earlier versions, when they
131 * did detect overflow, may have disallowed it completely. Modern ones can
132 * replace it with the REPLACEMENT CHARACTER, depending on calling
133 * parameters. Therefore detect it ourselves in releases it was
134 * problematic in. */
135
136 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
137
138 /* First, on a 32-bit machine the first byte being at least \xFE
139 * automatically is overflow, as it indicates something requiring more
140 * than 31 bits */
141 if (sizeof(ret) < 8) {
142 overflows = 1;
143 overflow_length = 7;
144 }
145 else {
146 const U8 highest[] = /* 2*63-1 */
147 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
148 const U8 *cur_h = highest;
149
150 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
151 if (UNLIKELY(*cur_s == *cur_h)) {
152 continue;
153 }
154
155 /* If this byte is larger than the corresponding highest UTF-8
156 * byte, the sequence overflows; otherwise the byte is less
157 * than (as we handled the equality case above), and so the
158 * sequence doesn't overflow */
159 overflows = *cur_s > *cur_h;
160 break;
161
162 }
163
164 /* Here, either we set the bool and broke out of the loop, or got
165 * to the end and all bytes are the same which indicates it doesn't
166 * overflow. If it did overflow, it would be this number of bytes
167 * */
168 overflow_length = 13;
169 }
170 }
171
172 if (UNLIKELY(overflows)) {
173 ret = 0;
174
175 if (! do_warnings && retlen) {
176 *retlen = overflow_length;
177 }
178 }
179 else
180
181# endif /* < 5.26 */
182
183 /* Here, we are either in a release that properly detects overflow, or
184 * we have checked for overflow and the next statement is executing as
185 * part of the above conditional where we know we don't have overflow.
186 *
187 * The modern versions allow anything that evaluates to a legal UV, but
188 * not overlongs nor an empty input */
189 ret = D_PPP_utf8_to_uvchr_buf_callee(
190 s, curlen, retlen, (UTF8_ALLOW_ANYUV
191 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
192
193# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
194
195 /* But actually, more modern versions restrict the UV to being no more than
196 * what * an IV can hold, so it could, so it could still have gotten it
197 * wrong about overflowing. */
198 if (UNLIKELY(ret > IV_MAX)) {
199 overflows = 1;
200 }
201
202# endif
203
204 if (UNLIKELY(overflows)) {
205 if (! do_warnings) {
206 if (retlen) {
207 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
208 *retlen = D_PPP_MIN(*retlen, curlen);
209 }
210 return UNICODE_REPLACEMENT;
211 }
212 else {
213
214 /* We use the error message in use from 5.8-5.26 */
215 Perl_warner(aTHX_ packWARN(WARN_UTF8),
216 "Malformed UTF-8 character (overflow at 0x%" UVxf
217 ", byte 0x%02x, after start byte 0x%02x)",
218 ret, *cur_s, *s);
219 if (retlen) {
220 *retlen = (STRLEN) -1;
221 }
222 return 0;
223 }
224 }
225
226 /* Here, did not overflow, but if it failed for some other reason, and
227 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
228 * try again, allowing anything. (Note a return of 0 is ok if the input
229 * was '\0') */
230 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
231
232 /* If curlen is 0, we already handled the case where warnings are
233 * disabled, so this 'if' will be true, and so later on, we know that
234 * 's' is dereferencible */
235 if (do_warnings) {
236 *retlen = (STRLEN) -1;
237 }
238 else {
239 ret = D_PPP_utf8_to_uvchr_buf_callee(
240 s, curlen, retlen, UTF8_ALLOW_ANY);
241 /* Override with the REPLACEMENT character, as that is what the
242 * modern version of this function returns */
243 ret = UNICODE_REPLACEMENT;
244
245# if { VERSION < 5.16.0 }
246
247 /* Versions earlier than this don't necessarily return the proper
248 * length. It should not extend past the end of string, nor past
249 * what the first byte indicates the length is, nor past the
250 * continuation characters */
251 if (retlen && *retlen >= 0) {
252 unsigned int i = 1;
253
254 *retlen = D_PPP_MIN(*retlen, curlen);
255 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
256 do {
257 if (s[i] < 0x80 || s[i] > 0xBF) {
258 *retlen = i;
259 break;
260 }
261 } while (++i < *retlen);
262 }
263
264# endif
265
266 }
267 }
268
269 return ret;
270}
271
272# endif
273#endif
274
275#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
276#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
277 to read past a NUL, making it much less likely to read
278 off the end of the buffer. A NUL indicates the start
279 of the next character anyway. If the input isn't
280 NUL-terminated, the function remains unsafe, as it
281 always has been. */
282
283__UNDEFINED__ utf8_to_uvchr(s, lp) \
284 ((*(s) == '\0') \
285 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
286 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
287
288#endif
289
290=xsinit
291
292#define NEED_utf8_to_uvchr_buf
293
294=xsubs
295
296#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
297
298STRLEN
299UTF8_SAFE_SKIP(s, adjustment)
300 char * s
301 int adjustment
302 PREINIT:
303 const char *const_s;
304 CODE:
305 const_s = s;
306 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
307 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
308 OUTPUT:
309 RETVAL
310
311#endif
312
313#ifdef isUTF8_CHAR
314
315STRLEN
316isUTF8_CHAR(s, adjustment)
317 unsigned char * s
318 int adjustment
319 PREINIT:
320 const unsigned char *const_s;
321 const unsigned char *const_e;
322 CODE:
323 const_s = s;
324 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
325 const_e = const_s + UTF8SKIP(const_s) + adjustment;
326 RETVAL = isUTF8_CHAR(const_s, const_e);
327 OUTPUT:
328 RETVAL
329
330#endif
331
332#ifdef utf8_to_uvchr_buf
333
334AV *
335utf8_to_uvchr_buf(s, adjustment)
336 unsigned char *s
337 int adjustment
338 PREINIT:
339 AV *av;
340 STRLEN len;
341 const unsigned char *const_s;
342 CODE:
343 av = newAV();
344 const_s = s;
345 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
346 s + UTF8SKIP(s) + adjustment,
347 &len)));
348 if (len == (STRLEN) -1) {
349 av_push(av, newSViv(-1));
350 }
351 else {
352 av_push(av, newSVuv(len));
353 }
354 RETVAL = av;
355 OUTPUT:
356 RETVAL
357
358#endif
359
360#ifdef utf8_to_uvchr
361
362AV *
363utf8_to_uvchr(s)
364 unsigned char *s
365 PREINIT:
366 AV *av;
367 STRLEN len;
368 const unsigned char *const_s;
369 CODE:
370 av = newAV();
371 const_s = s;
372 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
373 if (len == (STRLEN) -1) {
374 av_push(av, newSViv(-1));
375 }
376 else {
377 av_push(av, newSVuv(len));
378 }
379 RETVAL = av;
380 OUTPUT:
381 RETVAL
382
383#endif
384
385=tests plan => 55
386
387BEGIN { require warnings if "$]" gt '5.006' }
388
389# skip tests on 5.6.0 and earlier
390if ("$]" le '5.006') {
391 skip 'skip: broken utf8 support', 0 for 1..55;
392 exit;
393}
394
395ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
396ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
397
398ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
399ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
400ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
401ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
402
403my $ret = &Devel::PPPort::utf8_to_uvchr("A");
404ok($ret->[0], ord("A"));
405ok($ret->[1], 1);
406
407$ret = &Devel::PPPort::utf8_to_uvchr("\0");
408ok($ret->[0], 0);
409ok($ret->[1], 1);
410
411$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
412ok($ret->[0], ord("A"));
413ok($ret->[1], 1);
414
415$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
416ok($ret->[0], 0);
417ok($ret->[1], 1);
418
419if (ord("A") != 65) { # tests not valid for EBCDIC
420 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
421}
422else {
423 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
424 ok($ret->[0], 0x100);
425 ok($ret->[1], 2);
426
427 my @warnings;
428 local $SIG{__WARN__} = sub { push @warnings, @_; };
429
430 {
431 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
432 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
433 ok($ret->[0], 0);
434 ok($ret->[1], -1);
435
436 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
437 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
438 ok($ret->[0], 0xFFFD);
439 ok($ret->[1], 1);
440 }
441
442 my @buf_tests = (
443 {
444 input => "A",
445 adjustment => -1,
446 warning => qr/empty/,
447 no_warnings_returned_length => 0,
448 },
449 {
450 input => "\xc4\xc5",
451 adjustment => 0,
452 warning => qr/non-continuation/,
453 no_warnings_returned_length => 1,
454 },
455 {
456 input => "\xc4\x80",
457 adjustment => -1,
458 warning => qr/short|1 byte, need 2/,
459 no_warnings_returned_length => 1,
460 },
461 {
462 input => "\xc0\x81",
463 adjustment => 0,
464 warning => qr/overlong|2 bytes, need 1/,
465 no_warnings_returned_length => 2,
466 },
467 {
468 input => "\xe0\x80\x81",
469 adjustment => 0,
470 warning => qr/overlong|3 bytes, need 1/,
471 no_warnings_returned_length => 3,
472 },
473 {
474 input => "\xf0\x80\x80\x81",
475 adjustment => 0,
476 warning => qr/overlong|4 bytes, need 1/,
477 no_warnings_returned_length => 4,
478 },
479 { # Old algorithm failed to detect this
480 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
481 adjustment => 0,
482 warning => qr/overflow/,
483 no_warnings_returned_length => 13,
484 },
485 );
486
487 # An empty input is an assertion failure on debugging builds. It is
488 # deliberately the first test.
489 require Config; import Config;
490 use vars '%Config';
491 if ($Config{ccflags} =~ /-DDEBUGGING/) {
492 shift @buf_tests;
493 ok(1, 1) for 1..5;
494 }
495
496 for my $test (@buf_tests) {
497 my $input = $test->{'input'};
498 my $adjustment = $test->{'adjustment'};
499 my $display = 'utf8_to_uvchr_buf("';
500 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
501 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
502 }
503
504 $display .= '")';
505 my $warning = $test->{'warning'};
506
507 undef @warnings;
508 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
509 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
510 ok($ret->[0], 0, "returned value $display; warnings enabled");
511 ok($ret->[1], -1, "returned length $display; warnings enabled");
512 my $all_warnings = join "; ", @warnings;
513 my $contains = grep { $_ =~ $warning } $all_warnings;
514 ok($contains, 1, $display
515 . "; Got: '$all_warnings', which should contain '$warning'");
516
517 undef @warnings;
518 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
519 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
520 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
521 ok($ret->[1], $test->{'no_warnings_returned_length'},
522 "returned length $display; warnings disabled");
523 }
524}