This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Backport BOM_UTF8 REPLACEMENT_CHARACTER_UTF8
[perl5.git] / dist / Devel-PPPort / parts / inc / utf8
CommitLineData
7899b636
KW
1=provides
2
3__UNDEFINED__
4utf8_to_uvchr_buf
5
6=implementation
7
8#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
9
10__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
11
12__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
13
14__UNDEFINED__ UTF8_ALLOW_ANYUV 0
15__UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001
16__UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002
17__UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
18__UNDEFINED__ UTF8_ALLOW_SHORT 0x0008
19__UNDEFINED__ UTF8_ALLOW_LONG 0x0010
20__UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080
21__UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
22 |UTF8_ALLOW_NON_CONTINUATION \
23 |UTF8_ALLOW_SHORT \
24 |UTF8_ALLOW_LONG \
25 |UTF8_ALLOW_OVERFLOW)
26
27#if defined UTF8SKIP
28
29/* Don't use official version because it uses MIN, which may not be available */
30#undef UTF8_SAFE_SKIP
31
32__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
33 ((((e) - (s)) <= 0) \
34 ? 0 \
35 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
36#endif
37
38#if defined(is_utf8_string) && defined(UTF8SKIP)
39__UNDEFINED__ isUTF8_CHAR(s0, e) ( \
40 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
41 ? 0 \
42 : UTF8SKIP(s0))
43#endif
44
8fc7db65
KW
45#if 'A' == 65
46__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
47__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
48#elif '^' == 95
49__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
50__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
51#elif '^' == 176
52__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
53__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
54#else
55# error Unknown character set
56#endif
57
7899b636
KW
58#if { VERSION < 5.31.3 }
59 /* Versions prior to this accepted things that are now considered
60 * malformations, and didn't return -1 on error with warnings enabled
61 * */
62# undef utf8_to_uvchr_buf
63#endif
64
65/* This implementation brings modern, generally more restricted standards to
66 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
67 * be done. But its arguable that the others need not, and hence should not.
68 * The reason they're here is that a module that intends to play with the
69 * latest perls should be able to work the same in all releases. An example is
70 * that perl no longer accepts any UV for a code point, but limits them to
71 * IV_MAX or below. This is for future internal use of the larger code points.
72 * If it turns out that some of these changes are breaking code that isn't
73 * intended to work with modern perls, the tighter restrictions could be
74 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
75
76/* 5.6.0 is the first release with UTF-8, and we don't implement this function
77 * there due to its likely lack of still being in use, and the underlying
78 * implementation is very different from later ones, without the later
79 * safeguards, so would require extra work to deal with */
80#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
81 /* Choose which underlying implementation to use. At least one must be
82 * present or the perl is too early to handle this function */
83# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
84# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
85# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
86# else /* Must be at least 5.6.1 from #if above */
87# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
88# endif
89# endif
90
91# if { NEED utf8_to_uvchr_buf }
92
93UV
94utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
95{
96 UV ret;
97 STRLEN curlen;
98 bool overflows = 0;
99 const U8 *cur_s = s;
100 const bool do_warnings = ckWARN_d(WARN_UTF8);
101# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
102 STRLEN overflow_length = 0;
103# endif
104
105 if (send > s) {
106 curlen = send - s;
107 }
108 else {
109 assert(0); /* Modern perls die under this circumstance */
110 curlen = 0;
111 if (! do_warnings) { /* Handle empty here if no warnings needed */
112 if (retlen) *retlen = 0;
113 return UNICODE_REPLACEMENT;
114 }
115 }
116
117# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
118
119 /* Perl did not properly detect overflow for much of its history on
120 * non-EBCDIC platforms, often returning an overlong value which may or may
121 * not have been tolerated in the call. Also, earlier versions, when they
122 * did detect overflow, may have disallowed it completely. Modern ones can
123 * replace it with the REPLACEMENT CHARACTER, depending on calling
124 * parameters. Therefore detect it ourselves in releases it was
125 * problematic in. */
126
127 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
128
129 /* First, on a 32-bit machine the first byte being at least \xFE
130 * automatically is overflow, as it indicates something requiring more
131 * than 31 bits */
132 if (sizeof(ret) < 8) {
133 overflows = 1;
134 overflow_length = 7;
135 }
136 else {
137 const U8 highest[] = /* 2*63-1 */
138 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
139 const U8 *cur_h = highest;
140
141 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
142 if (UNLIKELY(*cur_s == *cur_h)) {
143 continue;
144 }
145
146 /* If this byte is larger than the corresponding highest UTF-8
147 * byte, the sequence overflows; otherwise the byte is less
148 * than (as we handled the equality case above), and so the
149 * sequence doesn't overflow */
150 overflows = *cur_s > *cur_h;
151 break;
152
153 }
154
155 /* Here, either we set the bool and broke out of the loop, or got
156 * to the end and all bytes are the same which indicates it doesn't
157 * overflow. If it did overflow, it would be this number of bytes
158 * */
159 overflow_length = 13;
160 }
161 }
162
163 if (UNLIKELY(overflows)) {
164 ret = 0;
165
166 if (! do_warnings && retlen) {
167 *retlen = overflow_length;
168 }
169 }
170 else
171
172# endif /* < 5.26 */
173
174 /* Here, we are either in a release that properly detects overflow, or
175 * we have checked for overflow and the next statement is executing as
176 * part of the above conditional where we know we don't have overflow.
177 *
178 * The modern versions allow anything that evaluates to a legal UV, but
179 * not overlongs nor an empty input */
180 ret = D_PPP_utf8_to_uvchr_buf_callee(
181 s, curlen, retlen, (UTF8_ALLOW_ANYUV
182 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
183
184# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
185
186 /* But actually, more modern versions restrict the UV to being no more than
187 * what * an IV can hold, so it could, so it could still have gotten it
188 * wrong about overflowing. */
189 if (UNLIKELY(ret > IV_MAX)) {
190 overflows = 1;
191 }
192
193# endif
194
195 if (UNLIKELY(overflows)) {
196 if (! do_warnings) {
197 if (retlen) {
198 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
199 *retlen = D_PPP_MIN(*retlen, curlen);
200 }
201 return UNICODE_REPLACEMENT;
202 }
203 else {
204
205 /* We use the error message in use from 5.8-5.26 */
206 Perl_warner(aTHX_ packWARN(WARN_UTF8),
207 "Malformed UTF-8 character (overflow at 0x%" UVxf
208 ", byte 0x%02x, after start byte 0x%02x)",
209 ret, *cur_s, *s);
210 if (retlen) {
211 *retlen = (STRLEN) -1;
212 }
213 return 0;
214 }
215 }
216
217 /* Here, did not overflow, but if it failed for some other reason, and
218 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
219 * try again, allowing anything. (Note a return of 0 is ok if the input
220 * was '\0') */
221 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
222
223 /* If curlen is 0, we already handled the case where warnings are
224 * disabled, so this 'if' will be true, and so later on, we know that
225 * 's' is dereferencible */
226 if (do_warnings) {
227 *retlen = (STRLEN) -1;
228 }
229 else {
230 ret = D_PPP_utf8_to_uvchr_buf_callee(
231 s, curlen, retlen, UTF8_ALLOW_ANY);
232 /* Override with the REPLACEMENT character, as that is what the
233 * modern version of this function returns */
234 ret = UNICODE_REPLACEMENT;
235
236# if { VERSION < 5.16.0 }
237
238 /* Versions earlier than this don't necessarily return the proper
239 * length. It should not extend past the end of string, nor past
240 * what the first byte indicates the length is, nor past the
241 * continuation characters */
242 if (retlen && *retlen >= 0) {
243 unsigned int i = 1;
244
245 *retlen = D_PPP_MIN(*retlen, curlen);
246 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
247 do {
248 if (s[i] < 0x80 || s[i] > 0xBF) {
249 *retlen = i;
250 break;
251 }
252 } while (++i < *retlen);
253 }
254
255# endif
256
257 }
258 }
259
260 return ret;
261}
262
263# endif
264#endif
265
266#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
267#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
268 to read past a NUL, making it much less likely to read
269 off the end of the buffer. A NUL indicates the start
270 of the next character anyway. If the input isn't
271 NUL-terminated, the function remains unsafe, as it
272 always has been. */
273
274__UNDEFINED__ utf8_to_uvchr(s, lp) \
275 ((*(s) == '\0') \
276 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
277 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
278
279#endif
280
281=xsinit
282
283#define NEED_utf8_to_uvchr_buf
284
285=xsubs
286
287#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
288
289STRLEN
290UTF8_SAFE_SKIP(s, adjustment)
291 char * s
292 int adjustment
293 PREINIT:
294 const char *const_s;
295 CODE:
296 const_s = s;
297 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
298 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
299 OUTPUT:
300 RETVAL
301
302#endif
303
304#ifdef isUTF8_CHAR
305
306STRLEN
307isUTF8_CHAR(s, adjustment)
308 unsigned char * s
309 int adjustment
310 PREINIT:
311 const unsigned char *const_s;
312 const unsigned char *const_e;
313 CODE:
314 const_s = s;
315 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
316 const_e = const_s + UTF8SKIP(const_s) + adjustment;
317 RETVAL = isUTF8_CHAR(const_s, const_e);
318 OUTPUT:
319 RETVAL
320
321#endif
322
323#ifdef utf8_to_uvchr_buf
324
325AV *
326utf8_to_uvchr_buf(s, adjustment)
327 unsigned char *s
328 int adjustment
329 PREINIT:
330 AV *av;
331 STRLEN len;
332 const unsigned char *const_s;
333 CODE:
334 av = newAV();
335 const_s = s;
336 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
337 s + UTF8SKIP(s) + adjustment,
338 &len)));
339 if (len == (STRLEN) -1) {
340 av_push(av, newSViv(-1));
341 }
342 else {
343 av_push(av, newSVuv(len));
344 }
345 RETVAL = av;
346 OUTPUT:
347 RETVAL
348
349#endif
350
351#ifdef utf8_to_uvchr
352
353AV *
354utf8_to_uvchr(s)
355 unsigned char *s
356 PREINIT:
357 AV *av;
358 STRLEN len;
359 const unsigned char *const_s;
360 CODE:
361 av = newAV();
362 const_s = s;
363 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
364 if (len == (STRLEN) -1) {
365 av_push(av, newSViv(-1));
366 }
367 else {
368 av_push(av, newSVuv(len));
369 }
370 RETVAL = av;
371 OUTPUT:
372 RETVAL
373
374#endif
375
376=tests plan => 55
377
378BEGIN { require warnings if "$]" gt '5.006' }
379
380# skip tests on 5.6.0 and earlier
381if ("$]" le '5.006') {
382 skip 'skip: broken utf8 support', 0 for 1..55;
383 exit;
384}
385
386ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
387ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
388
389ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
390ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
391ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
392ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
393
394my $ret = &Devel::PPPort::utf8_to_uvchr("A");
395ok($ret->[0], ord("A"));
396ok($ret->[1], 1);
397
398$ret = &Devel::PPPort::utf8_to_uvchr("\0");
399ok($ret->[0], 0);
400ok($ret->[1], 1);
401
402$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
403ok($ret->[0], ord("A"));
404ok($ret->[1], 1);
405
406$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
407ok($ret->[0], 0);
408ok($ret->[1], 1);
409
410if (ord("A") != 65) { # tests not valid for EBCDIC
411 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
412}
413else {
414 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
415 ok($ret->[0], 0x100);
416 ok($ret->[1], 2);
417
418 my @warnings;
419 local $SIG{__WARN__} = sub { push @warnings, @_; };
420
421 {
422 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
423 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
424 ok($ret->[0], 0);
425 ok($ret->[1], -1);
426
427 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
428 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
429 ok($ret->[0], 0xFFFD);
430 ok($ret->[1], 1);
431 }
432
433 my @buf_tests = (
434 {
435 input => "A",
436 adjustment => -1,
437 warning => qr/empty/,
438 no_warnings_returned_length => 0,
439 },
440 {
441 input => "\xc4\xc5",
442 adjustment => 0,
443 warning => qr/non-continuation/,
444 no_warnings_returned_length => 1,
445 },
446 {
447 input => "\xc4\x80",
448 adjustment => -1,
449 warning => qr/short|1 byte, need 2/,
450 no_warnings_returned_length => 1,
451 },
452 {
453 input => "\xc0\x81",
454 adjustment => 0,
455 warning => qr/overlong|2 bytes, need 1/,
456 no_warnings_returned_length => 2,
457 },
458 {
459 input => "\xe0\x80\x81",
460 adjustment => 0,
461 warning => qr/overlong|3 bytes, need 1/,
462 no_warnings_returned_length => 3,
463 },
464 {
465 input => "\xf0\x80\x80\x81",
466 adjustment => 0,
467 warning => qr/overlong|4 bytes, need 1/,
468 no_warnings_returned_length => 4,
469 },
470 { # Old algorithm failed to detect this
471 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
472 adjustment => 0,
473 warning => qr/overflow/,
474 no_warnings_returned_length => 13,
475 },
476 );
477
478 # An empty input is an assertion failure on debugging builds. It is
479 # deliberately the first test.
480 require Config; import Config;
481 use vars '%Config';
482 if ($Config{ccflags} =~ /-DDEBUGGING/) {
483 shift @buf_tests;
484 ok(1, 1) for 1..5;
485 }
486
487 for my $test (@buf_tests) {
488 my $input = $test->{'input'};
489 my $adjustment = $test->{'adjustment'};
490 my $display = 'utf8_to_uvchr_buf("';
491 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
492 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
493 }
494
495 $display .= '")';
496 my $warning = $test->{'warning'};
497
498 undef @warnings;
499 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
500 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
501 ok($ret->[0], 0, "returned value $display; warnings enabled");
502 ok($ret->[1], -1, "returned length $display; warnings enabled");
503 my $all_warnings = join "; ", @warnings;
504 my $contains = grep { $_ =~ $warning } $all_warnings;
505 ok($contains, 1, $display
506 . "; Got: '$all_warnings', which should contain '$warning'");
507
508 undef @warnings;
509 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
510 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
511 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
512 ok($ret->[1], $test->{'no_warnings_returned_length'},
513 "returned length $display; warnings disabled");
514 }
515}