8 #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
10 __UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
12 __UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
14 __UNDEFINED__ UTF8_ALLOW_ANYUV 0
15 __UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001
16 __UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002
17 __UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
18 __UNDEFINED__ UTF8_ALLOW_SHORT 0x0008
19 __UNDEFINED__ UTF8_ALLOW_LONG 0x0010
20 __UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080
21 __UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
22 |UTF8_ALLOW_NON_CONTINUATION \
29 /* Don't use official version because it uses MIN, which may not be available */
32 __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
35 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
38 #ifdef is_ascii_string
39 __UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
40 __UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
42 /* Hint: is_utf8_invariant_string
43 Please use this instead of is_ascii_string or is_invariant_string
48 __UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \
49 cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
52 #if defined(is_utf8_string) && defined(UTF8SKIP)
53 __UNDEFINED__ isUTF8_CHAR(s0, e) ( \
54 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
60 __UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
61 __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
63 __UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
64 __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
66 __UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
67 __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
69 # error Unknown character set
72 #if { VERSION < 5.31.3 }
73 /* Versions prior to this accepted things that are now considered
74 * malformations, and didn't return -1 on error with warnings enabled
76 # undef utf8_to_uvchr_buf
79 /* This implementation brings modern, generally more restricted standards to
80 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
81 * be done. But its arguable that the others need not, and hence should not.
82 * The reason they're here is that a module that intends to play with the
83 * latest perls should be able to work the same in all releases. An example is
84 * that perl no longer accepts any UV for a code point, but limits them to
85 * IV_MAX or below. This is for future internal use of the larger code points.
86 * If it turns out that some of these changes are breaking code that isn't
87 * intended to work with modern perls, the tighter restrictions could be
88 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
90 /* 5.6.0 is the first release with UTF-8, and we don't implement this function
91 * there due to its likely lack of still being in use, and the underlying
92 * implementation is very different from later ones, without the later
93 * safeguards, so would require extra work to deal with */
94 #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
95 /* Choose which underlying implementation to use. At least one must be
96 * present or the perl is too early to handle this function */
97 # if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
98 # if defined(utf8n_to_uvchr) /* This is the preferred implementation */
99 # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
100 # else /* Must be at least 5.6.1 from #if above */
101 # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
105 # if { NEED utf8_to_uvchr_buf }
108 utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
114 const bool do_warnings = ckWARN_d(WARN_UTF8);
115 # if { VERSION < 5.26.0 } && ! defined(EBCDIC)
116 STRLEN overflow_length = 0;
123 assert(0); /* Modern perls die under this circumstance */
125 if (! do_warnings) { /* Handle empty here if no warnings needed */
126 if (retlen) *retlen = 0;
127 return UNICODE_REPLACEMENT;
131 # if { VERSION < 5.26.0 } && ! defined(EBCDIC)
133 /* Perl did not properly detect overflow for much of its history on
134 * non-EBCDIC platforms, often returning an overlong value which may or may
135 * not have been tolerated in the call. Also, earlier versions, when they
136 * did detect overflow, may have disallowed it completely. Modern ones can
137 * replace it with the REPLACEMENT CHARACTER, depending on calling
138 * parameters. Therefore detect it ourselves in releases it was
141 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
143 /* First, on a 32-bit machine the first byte being at least \xFE
144 * automatically is overflow, as it indicates something requiring more
146 if (sizeof(ret) < 8) {
151 const U8 highest[] = /* 2*63-1 */
152 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
153 const U8 *cur_h = highest;
155 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
156 if (UNLIKELY(*cur_s == *cur_h)) {
160 /* If this byte is larger than the corresponding highest UTF-8
161 * byte, the sequence overflows; otherwise the byte is less
162 * than (as we handled the equality case above), and so the
163 * sequence doesn't overflow */
164 overflows = *cur_s > *cur_h;
169 /* Here, either we set the bool and broke out of the loop, or got
170 * to the end and all bytes are the same which indicates it doesn't
171 * overflow. If it did overflow, it would be this number of bytes
173 overflow_length = 13;
177 if (UNLIKELY(overflows)) {
180 if (! do_warnings && retlen) {
181 *retlen = overflow_length;
188 /* Here, we are either in a release that properly detects overflow, or
189 * we have checked for overflow and the next statement is executing as
190 * part of the above conditional where we know we don't have overflow.
192 * The modern versions allow anything that evaluates to a legal UV, but
193 * not overlongs nor an empty input */
194 ret = D_PPP_utf8_to_uvchr_buf_callee(
195 s, curlen, retlen, (UTF8_ALLOW_ANYUV
196 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
198 # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
200 /* But actually, more modern versions restrict the UV to being no more than
201 * what * an IV can hold, so it could, so it could still have gotten it
202 * wrong about overflowing. */
203 if (UNLIKELY(ret > IV_MAX)) {
209 if (UNLIKELY(overflows)) {
212 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
213 *retlen = D_PPP_MIN(*retlen, curlen);
215 return UNICODE_REPLACEMENT;
219 /* We use the error message in use from 5.8-5.26 */
220 Perl_warner(aTHX_ packWARN(WARN_UTF8),
221 "Malformed UTF-8 character (overflow at 0x%" UVxf
222 ", byte 0x%02x, after start byte 0x%02x)",
225 *retlen = (STRLEN) -1;
231 /* Here, did not overflow, but if it failed for some other reason, and
232 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
233 * try again, allowing anything. (Note a return of 0 is ok if the input
235 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
237 /* If curlen is 0, we already handled the case where warnings are
238 * disabled, so this 'if' will be true, and so later on, we know that
239 * 's' is dereferencible */
241 *retlen = (STRLEN) -1;
244 ret = D_PPP_utf8_to_uvchr_buf_callee(
245 s, curlen, retlen, UTF8_ALLOW_ANY);
246 /* Override with the REPLACEMENT character, as that is what the
247 * modern version of this function returns */
248 ret = UNICODE_REPLACEMENT;
250 # if { VERSION < 5.16.0 }
252 /* Versions earlier than this don't necessarily return the proper
253 * length. It should not extend past the end of string, nor past
254 * what the first byte indicates the length is, nor past the
255 * continuation characters */
256 if (retlen && *retlen >= 0) {
259 *retlen = D_PPP_MIN(*retlen, curlen);
260 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
262 if (s[i] < 0x80 || s[i] > 0xBF) {
266 } while (++i < *retlen);
280 #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
281 #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
282 to read past a NUL, making it much less likely to read
283 off the end of the buffer. A NUL indicates the start
284 of the next character anyway. If the input isn't
285 NUL-terminated, the function remains unsafe, as it
288 __UNDEFINED__ utf8_to_uvchr(s, lp) \
290 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
291 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
297 #define NEED_utf8_to_uvchr_buf
301 #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
304 UTF8_SAFE_SKIP(s, adjustment)
311 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
312 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
321 isUTF8_CHAR(s, adjustment)
325 const unsigned char *const_s;
326 const unsigned char *const_e;
329 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
330 const_e = const_s + UTF8SKIP(const_s) + adjustment;
331 RETVAL = isUTF8_CHAR(const_s, const_e);
341 foldEQ_utf8(s1, l1, u1, s2, l2, u2)
349 const char *const_s1;
350 const char *const_s2;
354 RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
360 #ifdef utf8_to_uvchr_buf
363 utf8_to_uvchr_buf(s, adjustment)
369 const unsigned char *const_s;
373 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
374 s + UTF8SKIP(s) + adjustment,
376 if (len == (STRLEN) -1) {
377 av_push(av, newSViv(-1));
380 av_push(av, newSVuv(len));
396 const unsigned char *const_s;
400 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
401 if (len == (STRLEN) -1) {
402 av_push(av, newSViv(-1));
405 av_push(av, newSVuv(len));
415 BEGIN { require warnings if "$]" gt '5.006' }
417 # skip tests on 5.6.0 and earlier
418 if ("$]" le '5.006') {
419 skip 'skip: broken utf8 support', 0 for 1..58;
423 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
424 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
426 ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
427 ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
428 ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
429 ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
431 if ("$]" lt '5.008') {
435 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
436 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
437 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
440 my $ret = &Devel::PPPort::utf8_to_uvchr("A");
441 ok($ret->[0], ord("A"));
444 $ret = &Devel::PPPort::utf8_to_uvchr("\0");
448 $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
449 ok($ret->[0], ord("A"));
452 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
456 if (ord("A") != 65) { # tests not valid for EBCDIC
457 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
460 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
461 ok($ret->[0], 0x100);
465 local $SIG{__WARN__} = sub { push @warnings, @_; };
468 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
469 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
473 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
474 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
475 ok($ret->[0], 0xFFFD);
483 warning => qr/empty/,
484 no_warnings_returned_length => 0,
489 warning => qr/non-continuation/,
490 no_warnings_returned_length => 1,
495 warning => qr/short|1 byte, need 2/,
496 no_warnings_returned_length => 1,
501 warning => qr/overlong|2 bytes, need 1/,
502 no_warnings_returned_length => 2,
505 input => "\xe0\x80\x81",
507 warning => qr/overlong|3 bytes, need 1/,
508 no_warnings_returned_length => 3,
511 input => "\xf0\x80\x80\x81",
513 warning => qr/overlong|4 bytes, need 1/,
514 no_warnings_returned_length => 4,
516 { # Old algorithm failed to detect this
517 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
519 warning => qr/overflow/,
520 no_warnings_returned_length => 13,
524 # An empty input is an assertion failure on debugging builds. It is
525 # deliberately the first test.
526 require Config; import Config;
528 if ($Config{ccflags} =~ /-DDEBUGGING/) {
533 for my $test (@buf_tests) {
534 my $input = $test->{'input'};
535 my $adjustment = $test->{'adjustment'};
536 my $display = 'utf8_to_uvchr_buf("';
537 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
538 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
542 my $warning = $test->{'warning'};
545 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
546 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
547 ok($ret->[0], 0, "returned value $display; warnings enabled");
548 ok($ret->[1], -1, "returned length $display; warnings enabled");
549 my $all_warnings = join "; ", @warnings;
550 my $contains = grep { $_ =~ $warning } $all_warnings;
551 ok($contains, 1, $display
552 . "; Got: '$all_warnings', which should contain '$warning'");
555 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
556 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
557 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
558 ok($ret->[1], $test->{'no_warnings_returned_length'},
559 "returned length $display; warnings disabled");