This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Backport foldEQ_utf8 using ibcmp_utf8
[perl5.git] / dist / Devel-PPPort / parts / inc / utf8
CommitLineData
7899b636
KW
1=provides
2
3__UNDEFINED__
4utf8_to_uvchr_buf
5
6=implementation
7
8#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
9
10__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
11
12__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
13
14__UNDEFINED__ UTF8_ALLOW_ANYUV 0
15__UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001
16__UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002
17__UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
18__UNDEFINED__ UTF8_ALLOW_SHORT 0x0008
19__UNDEFINED__ UTF8_ALLOW_LONG 0x0010
20__UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080
21__UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
22 |UTF8_ALLOW_NON_CONTINUATION \
23 |UTF8_ALLOW_SHORT \
24 |UTF8_ALLOW_LONG \
25 |UTF8_ALLOW_OVERFLOW)
26
27#if defined UTF8SKIP
28
29/* Don't use official version because it uses MIN, which may not be available */
30#undef UTF8_SAFE_SKIP
31
32__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
33 ((((e) - (s)) <= 0) \
34 ? 0 \
35 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
36#endif
37
d2b44e2b
KW
38#ifdef is_ascii_string
39__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
40__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
41
42/* Hint: is_utf8_invariant_string
43 Please use this instead of is_ascii_string or is_invariant_string
44*/
45#endif
46
2ff9e5e8
KW
47#ifdef ibcmp_utf8
48__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \
49 cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
50#endif
51
7899b636
KW
52#if defined(is_utf8_string) && defined(UTF8SKIP)
53__UNDEFINED__ isUTF8_CHAR(s0, e) ( \
54 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
55 ? 0 \
56 : UTF8SKIP(s0))
57#endif
58
8fc7db65
KW
59#if 'A' == 65
60__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
61__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
62#elif '^' == 95
63__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
64__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
65#elif '^' == 176
66__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
67__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
68#else
69# error Unknown character set
70#endif
71
7899b636
KW
72#if { VERSION < 5.31.3 }
73 /* Versions prior to this accepted things that are now considered
74 * malformations, and didn't return -1 on error with warnings enabled
75 * */
76# undef utf8_to_uvchr_buf
77#endif
78
79/* This implementation brings modern, generally more restricted standards to
80 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
81 * be done. But its arguable that the others need not, and hence should not.
82 * The reason they're here is that a module that intends to play with the
83 * latest perls should be able to work the same in all releases. An example is
84 * that perl no longer accepts any UV for a code point, but limits them to
85 * IV_MAX or below. This is for future internal use of the larger code points.
86 * If it turns out that some of these changes are breaking code that isn't
87 * intended to work with modern perls, the tighter restrictions could be
88 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
89
90/* 5.6.0 is the first release with UTF-8, and we don't implement this function
91 * there due to its likely lack of still being in use, and the underlying
92 * implementation is very different from later ones, without the later
93 * safeguards, so would require extra work to deal with */
94#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
95 /* Choose which underlying implementation to use. At least one must be
96 * present or the perl is too early to handle this function */
97# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
98# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
99# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
100# else /* Must be at least 5.6.1 from #if above */
101# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
102# endif
103# endif
104
105# if { NEED utf8_to_uvchr_buf }
106
107UV
108utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
109{
110 UV ret;
111 STRLEN curlen;
112 bool overflows = 0;
113 const U8 *cur_s = s;
114 const bool do_warnings = ckWARN_d(WARN_UTF8);
115# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
116 STRLEN overflow_length = 0;
117# endif
118
119 if (send > s) {
120 curlen = send - s;
121 }
122 else {
123 assert(0); /* Modern perls die under this circumstance */
124 curlen = 0;
125 if (! do_warnings) { /* Handle empty here if no warnings needed */
126 if (retlen) *retlen = 0;
127 return UNICODE_REPLACEMENT;
128 }
129 }
130
131# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
132
133 /* Perl did not properly detect overflow for much of its history on
134 * non-EBCDIC platforms, often returning an overlong value which may or may
135 * not have been tolerated in the call. Also, earlier versions, when they
136 * did detect overflow, may have disallowed it completely. Modern ones can
137 * replace it with the REPLACEMENT CHARACTER, depending on calling
138 * parameters. Therefore detect it ourselves in releases it was
139 * problematic in. */
140
141 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
142
143 /* First, on a 32-bit machine the first byte being at least \xFE
144 * automatically is overflow, as it indicates something requiring more
145 * than 31 bits */
146 if (sizeof(ret) < 8) {
147 overflows = 1;
148 overflow_length = 7;
149 }
150 else {
151 const U8 highest[] = /* 2*63-1 */
152 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
153 const U8 *cur_h = highest;
154
155 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
156 if (UNLIKELY(*cur_s == *cur_h)) {
157 continue;
158 }
159
160 /* If this byte is larger than the corresponding highest UTF-8
161 * byte, the sequence overflows; otherwise the byte is less
162 * than (as we handled the equality case above), and so the
163 * sequence doesn't overflow */
164 overflows = *cur_s > *cur_h;
165 break;
166
167 }
168
169 /* Here, either we set the bool and broke out of the loop, or got
170 * to the end and all bytes are the same which indicates it doesn't
171 * overflow. If it did overflow, it would be this number of bytes
172 * */
173 overflow_length = 13;
174 }
175 }
176
177 if (UNLIKELY(overflows)) {
178 ret = 0;
179
180 if (! do_warnings && retlen) {
181 *retlen = overflow_length;
182 }
183 }
184 else
185
186# endif /* < 5.26 */
187
188 /* Here, we are either in a release that properly detects overflow, or
189 * we have checked for overflow and the next statement is executing as
190 * part of the above conditional where we know we don't have overflow.
191 *
192 * The modern versions allow anything that evaluates to a legal UV, but
193 * not overlongs nor an empty input */
194 ret = D_PPP_utf8_to_uvchr_buf_callee(
195 s, curlen, retlen, (UTF8_ALLOW_ANYUV
196 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
197
198# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
199
200 /* But actually, more modern versions restrict the UV to being no more than
201 * what * an IV can hold, so it could, so it could still have gotten it
202 * wrong about overflowing. */
203 if (UNLIKELY(ret > IV_MAX)) {
204 overflows = 1;
205 }
206
207# endif
208
209 if (UNLIKELY(overflows)) {
210 if (! do_warnings) {
211 if (retlen) {
212 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
213 *retlen = D_PPP_MIN(*retlen, curlen);
214 }
215 return UNICODE_REPLACEMENT;
216 }
217 else {
218
219 /* We use the error message in use from 5.8-5.26 */
220 Perl_warner(aTHX_ packWARN(WARN_UTF8),
221 "Malformed UTF-8 character (overflow at 0x%" UVxf
222 ", byte 0x%02x, after start byte 0x%02x)",
223 ret, *cur_s, *s);
224 if (retlen) {
225 *retlen = (STRLEN) -1;
226 }
227 return 0;
228 }
229 }
230
231 /* Here, did not overflow, but if it failed for some other reason, and
232 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
233 * try again, allowing anything. (Note a return of 0 is ok if the input
234 * was '\0') */
235 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
236
237 /* If curlen is 0, we already handled the case where warnings are
238 * disabled, so this 'if' will be true, and so later on, we know that
239 * 's' is dereferencible */
240 if (do_warnings) {
241 *retlen = (STRLEN) -1;
242 }
243 else {
244 ret = D_PPP_utf8_to_uvchr_buf_callee(
245 s, curlen, retlen, UTF8_ALLOW_ANY);
246 /* Override with the REPLACEMENT character, as that is what the
247 * modern version of this function returns */
248 ret = UNICODE_REPLACEMENT;
249
250# if { VERSION < 5.16.0 }
251
252 /* Versions earlier than this don't necessarily return the proper
253 * length. It should not extend past the end of string, nor past
254 * what the first byte indicates the length is, nor past the
255 * continuation characters */
256 if (retlen && *retlen >= 0) {
257 unsigned int i = 1;
258
259 *retlen = D_PPP_MIN(*retlen, curlen);
260 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
261 do {
262 if (s[i] < 0x80 || s[i] > 0xBF) {
263 *retlen = i;
264 break;
265 }
266 } while (++i < *retlen);
267 }
268
269# endif
270
271 }
272 }
273
274 return ret;
275}
276
277# endif
278#endif
279
280#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
281#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
282 to read past a NUL, making it much less likely to read
283 off the end of the buffer. A NUL indicates the start
284 of the next character anyway. If the input isn't
285 NUL-terminated, the function remains unsafe, as it
286 always has been. */
287
288__UNDEFINED__ utf8_to_uvchr(s, lp) \
289 ((*(s) == '\0') \
290 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
291 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
292
293#endif
294
295=xsinit
296
297#define NEED_utf8_to_uvchr_buf
298
299=xsubs
300
301#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
302
303STRLEN
304UTF8_SAFE_SKIP(s, adjustment)
305 char * s
306 int adjustment
307 PREINIT:
308 const char *const_s;
309 CODE:
310 const_s = s;
311 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
312 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
313 OUTPUT:
314 RETVAL
315
316#endif
317
318#ifdef isUTF8_CHAR
319
320STRLEN
321isUTF8_CHAR(s, adjustment)
322 unsigned char * s
323 int adjustment
324 PREINIT:
325 const unsigned char *const_s;
326 const unsigned char *const_e;
327 CODE:
328 const_s = s;
329 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
330 const_e = const_s + UTF8SKIP(const_s) + adjustment;
331 RETVAL = isUTF8_CHAR(const_s, const_e);
332 OUTPUT:
333 RETVAL
334
335#endif
336
2ff9e5e8
KW
337
338#ifdef foldEQ_utf8
339
340STRLEN
341foldEQ_utf8(s1, l1, u1, s2, l2, u2)
342 char *s1
343 UV l1
344 bool u1
345 char *s2
346 UV l2
347 bool u2
348 PREINIT:
349 const char *const_s1;
350 const char *const_s2;
351 CODE:
352 const_s1 = s1;
353 const_s2 = s2;
354 RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
355 OUTPUT:
356 RETVAL
357
358#endif
359
7899b636
KW
360#ifdef utf8_to_uvchr_buf
361
362AV *
363utf8_to_uvchr_buf(s, adjustment)
364 unsigned char *s
365 int adjustment
366 PREINIT:
367 AV *av;
368 STRLEN len;
369 const unsigned char *const_s;
370 CODE:
371 av = newAV();
372 const_s = s;
373 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
374 s + UTF8SKIP(s) + adjustment,
375 &len)));
376 if (len == (STRLEN) -1) {
377 av_push(av, newSViv(-1));
378 }
379 else {
380 av_push(av, newSVuv(len));
381 }
382 RETVAL = av;
383 OUTPUT:
384 RETVAL
385
386#endif
387
388#ifdef utf8_to_uvchr
389
390AV *
391utf8_to_uvchr(s)
392 unsigned char *s
393 PREINIT:
394 AV *av;
395 STRLEN len;
396 const unsigned char *const_s;
397 CODE:
398 av = newAV();
399 const_s = s;
400 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
401 if (len == (STRLEN) -1) {
402 av_push(av, newSViv(-1));
403 }
404 else {
405 av_push(av, newSVuv(len));
406 }
407 RETVAL = av;
408 OUTPUT:
409 RETVAL
410
411#endif
412
2ff9e5e8 413=tests plan => 58
7899b636
KW
414
415BEGIN { require warnings if "$]" gt '5.006' }
416
417# skip tests on 5.6.0 and earlier
418if ("$]" le '5.006') {
2ff9e5e8 419 skip 'skip: broken utf8 support', 0 for 1..58;
7899b636
KW
420 exit;
421}
422
423ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
424ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
425
426ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
427ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
428ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
429ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
430
2ff9e5e8
KW
431if ("$]" lt '5.008') {
432 ok(1, 1) for 1 ..3
433}
434else {
435 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
436 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
437 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
438}
439
7899b636
KW
440my $ret = &Devel::PPPort::utf8_to_uvchr("A");
441ok($ret->[0], ord("A"));
442ok($ret->[1], 1);
443
444$ret = &Devel::PPPort::utf8_to_uvchr("\0");
445ok($ret->[0], 0);
446ok($ret->[1], 1);
447
448$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
449ok($ret->[0], ord("A"));
450ok($ret->[1], 1);
451
452$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
453ok($ret->[0], 0);
454ok($ret->[1], 1);
455
456if (ord("A") != 65) { # tests not valid for EBCDIC
457 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
458}
459else {
460 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
461 ok($ret->[0], 0x100);
462 ok($ret->[1], 2);
463
464 my @warnings;
465 local $SIG{__WARN__} = sub { push @warnings, @_; };
466
467 {
468 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
469 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
470 ok($ret->[0], 0);
471 ok($ret->[1], -1);
472
473 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
474 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
475 ok($ret->[0], 0xFFFD);
476 ok($ret->[1], 1);
477 }
478
479 my @buf_tests = (
480 {
481 input => "A",
482 adjustment => -1,
483 warning => qr/empty/,
484 no_warnings_returned_length => 0,
485 },
486 {
487 input => "\xc4\xc5",
488 adjustment => 0,
489 warning => qr/non-continuation/,
490 no_warnings_returned_length => 1,
491 },
492 {
493 input => "\xc4\x80",
494 adjustment => -1,
495 warning => qr/short|1 byte, need 2/,
496 no_warnings_returned_length => 1,
497 },
498 {
499 input => "\xc0\x81",
500 adjustment => 0,
501 warning => qr/overlong|2 bytes, need 1/,
502 no_warnings_returned_length => 2,
503 },
504 {
505 input => "\xe0\x80\x81",
506 adjustment => 0,
507 warning => qr/overlong|3 bytes, need 1/,
508 no_warnings_returned_length => 3,
509 },
510 {
511 input => "\xf0\x80\x80\x81",
512 adjustment => 0,
513 warning => qr/overlong|4 bytes, need 1/,
514 no_warnings_returned_length => 4,
515 },
516 { # Old algorithm failed to detect this
517 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
518 adjustment => 0,
519 warning => qr/overflow/,
520 no_warnings_returned_length => 13,
521 },
522 );
523
524 # An empty input is an assertion failure on debugging builds. It is
525 # deliberately the first test.
526 require Config; import Config;
527 use vars '%Config';
528 if ($Config{ccflags} =~ /-DDEBUGGING/) {
529 shift @buf_tests;
530 ok(1, 1) for 1..5;
531 }
532
533 for my $test (@buf_tests) {
534 my $input = $test->{'input'};
535 my $adjustment = $test->{'adjustment'};
536 my $display = 'utf8_to_uvchr_buf("';
537 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
538 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
539 }
540
541 $display .= '")';
542 my $warning = $test->{'warning'};
543
544 undef @warnings;
545 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
546 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
547 ok($ret->[0], 0, "returned value $display; warnings enabled");
548 ok($ret->[1], -1, "returned length $display; warnings enabled");
549 my $all_warnings = join "; ", @warnings;
550 my $contains = grep { $_ =~ $warning } $all_warnings;
551 ok($contains, 1, $display
552 . "; Got: '$all_warnings', which should contain '$warning'");
553
554 undef @warnings;
555 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
556 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
557 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
558 ok($ret->[1], $test->{'no_warnings_returned_length'},
559 "returned length $display; warnings disabled");
560 }
561}