1 ################################################################################
3 ## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
4 ## Version 2.x, Copyright (C) 2001, Paul Marquess.
5 ## Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
7 ## This program is free software; you can redistribute it and/or
8 ## modify it under the same terms as Perl itself.
10 ################################################################################
20 __UNDEFINED__ sv_setuv(sv, uv) \
23 if (TeMpUv <= IV_MAX) \
24 sv_setiv(sv, TeMpUv); \
26 sv_setnv(sv, (double)TeMpUv); \
29 __UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
31 #if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
32 __UNDEFINED__ sv_2uv(sv) ({ SV *_sv = (sv); (UV) (SvNOK(_sv) ? SvNV(_sv) : sv_2nv(_sv)); })
34 __UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
37 __UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv))
38 __UNDEFINED__ SvUVXx(sv) SvUVX(sv)
39 __UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
41 #if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
42 __UNDEFINED__ SvUVx(sv) ({ SV *_sv = (sv)); SvUV(_sv); })
44 __UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv))
48 * Always use the SvUVx() macro instead of sv_uv().
50 /* Replace sv_uv with SvUVx */
51 __UNDEFINED__ sv_uv(sv) SvUVx(sv)
53 #if !defined(SvUOK) && defined(SvIOK_UV)
54 # define SvUOK(sv) SvIOK_UV(sv)
57 __UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) )
58 __UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END
60 __UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END
61 __UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
63 #if !defined(my_strnlen)
64 #if { NEED my_strnlen }
67 my_strnlen(const char *str, Size_t maxlen)
80 #if { VERSION < 5.31.4 }
81 /* Versions prior to this accepted things that are now considered
82 * malformations, and didn't return -1 on error with warnings enabled
84 # undef utf8_to_uvchr_buf
87 /* This implementation brings modern, generally more restricted standards to
88 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
89 * be done. But its arguable that the others need not, and hence should not.
90 * The reason they're here is that a module that intends to play with the
91 * latest perls should be able to work the same in all releases. An example is
92 * that perl no longer accepts any UV for a code point, but limits them to
93 * IV_MAX or below. This is for future internal use of the larger code points.
94 * If it turns out that some of these changes are breaking code that isn't
95 * intended to work with modern perls, the tighter restrictions could be
96 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
98 /* 5.6.0 is the first release with UTF-8, and we don't implement this function
99 * there due to its likely lack of still being in use, and the underlying
100 * implementation is very different from later ones, without the later
101 * safeguards, so would require extra work to deal with */
102 #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
103 /* Choose which underlying implementation to use. At least one must be
104 * present or the perl is too early to handle this function */
105 # if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
106 # if defined(utf8n_to_uvchr) /* This is the preferred implementation */
107 # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
108 # else /* Must be at least 5.6.1 from #if above */
109 # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
113 # if { NEED utf8_to_uvchr_buf }
116 utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
122 const bool do_warnings = ckWARN_d(WARN_UTF8);
123 # if { VERSION < 5.26.0 } && ! defined(EBCDIC)
124 STRLEN overflow_length = 0;
131 assert(0); /* Modern perls die under this circumstance */
133 if (! do_warnings) { /* Handle empty here if no warnings needed */
134 if (retlen) *retlen = 0;
135 return UNICODE_REPLACEMENT;
139 # if { VERSION < 5.26.0 } && ! defined(EBCDIC)
141 /* Perl did not properly detect overflow for much of its history on
142 * non-EBCDIC platforms, often returning an overlong value which may or may
143 * not have been tolerated in the call. Also, earlier versions, when they
144 * did detect overflow, may have disallowed it completely. Modern ones can
145 * replace it with the REPLACEMENT CHARACTER, depending on calling
146 * parameters. Therefore detect it ourselves in releases it was
149 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
151 /* First, on a 32-bit machine the first byte being at least \xFE
152 * automatically is overflow, as it indicates something requiring more
154 if (sizeof(ret) < 8) {
159 const U8 highest[] = /* 2*63-1 */
160 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
161 const U8 *cur_h = highest;
163 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
164 if (UNLIKELY(*cur_s == *cur_h)) {
168 /* If this byte is larger than the corresponding highest UTF-8
169 * byte, the sequence overflows; otherwise the byte is less
170 * than (as we handled the equality case above), and so the
171 * sequence doesn't overflow */
172 overflows = *cur_s > *cur_h;
177 /* Here, either we set the bool and broke out of the loop, or got
178 * to the end and all bytes are the same which indicates it doesn't
179 * overflow. If it did overflow, it would be this number of bytes
181 overflow_length = 13;
185 if (UNLIKELY(overflows)) {
188 if (! do_warnings && retlen) {
189 *retlen = overflow_length;
196 /* Here, we are either in a release that properly detects overflow, or
197 * we have checked for overflow and the next statement is executing as
198 * part of the above conditional where we know we don't have overflow.
200 * The modern versions allow anything that evaluates to a legal UV, but
201 * not overlongs nor an empty input */
202 ret = D_PPP_utf8_to_uvchr_buf_callee(
203 s, curlen, retlen, (UTF8_ALLOW_ANYUV
204 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
206 # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
208 /* But actually, more modern versions restrict the UV to being no more than
209 * what * an IV can hold, so it could, so it could still have gotten it
210 * wrong about overflowing. */
211 if (UNLIKELY(ret > IV_MAX)) {
217 if (UNLIKELY(overflows)) {
220 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
221 *retlen = D_PPP_MIN(*retlen, curlen);
223 return UNICODE_REPLACEMENT;
227 /* We use the error message in use from 5.8-5.26 */
228 Perl_warner(aTHX_ packWARN(WARN_UTF8),
229 "Malformed UTF-8 character (overflow at 0x%" UVxf
230 ", byte 0x%02x, after start byte 0x%02x)",
233 *retlen = (STRLEN) -1;
239 /* Here, did not overflow, but if it failed for some other reason, and
240 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
241 * try again, allowing anything. (Note a return of 0 is ok if the input
243 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
245 /* If curlen is 0, we already handled the case where warnings are
246 * disabled, so this 'if' will be true, and so later on, we know that
247 * 's' is dereferencible */
249 *retlen = (STRLEN) -1;
252 ret = D_PPP_utf8_to_uvchr_buf_callee(
253 s, curlen, retlen, UTF8_ALLOW_ANY);
254 /* Override with the REPLACEMENT character, as that is what the
255 * modern version of this function returns */
256 ret = UNICODE_REPLACEMENT;
258 # if { VERSION < 5.16.0 }
260 /* Versions earlier than this don't necessarily return the proper
261 * length. It should not extend past the end of string, nor past
262 * what the first byte indicates the length is, nor past the
263 * continuation characters */
264 if (retlen && *retlen >= 0) {
267 *retlen = D_PPP_MIN(*retlen, curlen);
268 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
270 if (s[i] < 0x80 || s[i] > 0xBF) {
274 } while (++i < *retlen);
288 #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
289 #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
290 to read past a NUL, making it much less likely to read
291 off the end of the buffer. A NUL indicates the start
292 of the next character anyway. If the input isn't
293 NUL-terminated, the function remains unsafe, as it
296 __UNDEFINED__ utf8_to_uvchr(s, lp) \
298 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
299 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
305 #define NEED_my_strnlen
306 #define NEED_utf8_to_uvchr_buf
315 sv_setuv(RETVAL, uv);
323 RETVAL = newSVuv(uv);
340 RETVAL = SvUVx(++sv);
354 TARG = sv_newmortal();
364 TARG = sv_newmortal();
373 RETVAL= my_strnlen(s, max);
379 BEGIN { require warnings if "$]" gt '5.006' }
381 ok(&Devel::PPPort::sv_setuv(42), 42);
382 ok(&Devel::PPPort::newSVuv(123), 123);
383 ok(&Devel::PPPort::sv_2uv("4711"), 4711);
384 ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
385 ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
386 ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
387 ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
388 ok(&Devel::PPPort::XSRETURN_UV(), 42);
389 ok(&Devel::PPPort::PUSHu(), 42);
390 ok(&Devel::PPPort::XPUSHu(), 43);
391 ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
393 # skip tests on 5.6.0 and earlier
394 if ("$]" le '5.006') {
395 skip 'skip: broken utf8 support', 0 for 1..51;
399 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
400 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
402 my $ret = &Devel::PPPort::utf8_to_uvchr("A");
403 ok($ret->[0], ord("A"));
406 $ret = &Devel::PPPort::utf8_to_uvchr("\0");
410 $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
411 ok($ret->[0], ord("A"));
414 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
418 if (ord("A") != 65) { # tests not valid for EBCDIC
419 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
422 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
423 ok($ret->[0], 0x100);
427 local $SIG{__WARN__} = sub { push @warnings, @_; };
430 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
431 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
435 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
436 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
437 ok($ret->[0], 0xFFFD);
445 warning => qr/empty/,
446 no_warnings_returned_length => 0,
451 warning => qr/non-continuation/,
452 no_warnings_returned_length => 1,
457 warning => qr/short|1 byte, need 2/,
458 no_warnings_returned_length => 1,
463 warning => qr/overlong|2 bytes, need 1/,
464 no_warnings_returned_length => 2,
467 input => "\xe0\x80\x81",
469 warning => qr/overlong|3 bytes, need 1/,
470 no_warnings_returned_length => 3,
473 input => "\xf0\x80\x80\x81",
475 warning => qr/overlong|4 bytes, need 1/,
476 no_warnings_returned_length => 4,
478 { # Old algorithm failed to detect this
479 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
481 warning => qr/overflow/,
482 no_warnings_returned_length => 13,
486 # An empty input is an assertion failure on debugging builds. It is
487 # deliberately the first test.
488 require Config; import Config;
490 if ($Config{ccflags} =~ /-DDEBUGGING/
491 || $^O eq 'VMS' && $Config{usedebugging_perl} eq 'Y') {
496 for my $test (@buf_tests) {
497 my $input = $test->{'input'};
498 my $adjustment = $test->{'adjustment'};
499 my $display = 'utf8_to_uvchr_buf("';
500 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
501 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
505 my $warning = $test->{'warning'};
508 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
509 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
510 ok($ret->[0], 0, "returned value $display; warnings enabled");
511 ok($ret->[1], -1, "returned length $display; warnings enabled");
512 my $all_warnings = join "; ", @warnings;
513 my $contains = grep { $_ =~ $warning } $all_warnings;
514 ok($contains, 1, $display
515 . "; Got: '$all_warnings', which should contain '$warning'");
518 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
519 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
520 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
521 ok($ret->[1], $test->{'no_warnings_returned_length'},
522 "returned length $display; warnings disabled");