This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/inc/uv: Use > to compare $]
[perl5.git] / dist / Devel-PPPort / parts / inc / uv
CommitLineData
adfe19db
MHM
1################################################################################
2##
b2049988 3## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
adfe19db
MHM
4## Version 2.x, Copyright (C) 2001, Paul Marquess.
5## Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
6##
7## This program is free software; you can redistribute it and/or
8## modify it under the same terms as Perl itself.
9##
10################################################################################
11
12=provides
13
adfe19db 14__UNDEFINED__
e28192fd 15my_strnlen
679ad62d 16SvUOK
aadf4f9e 17
adfe19db
MHM
18=implementation
19
0d0f8426
MHM
20__UNDEFINED__ sv_setuv(sv, uv) \
21 STMT_START { \
22 UV TeMpUv = uv; \
23 if (TeMpUv <= IV_MAX) \
24 sv_setiv(sv, TeMpUv); \
25 else \
26 sv_setnv(sv, (double)TeMpUv); \
27 } STMT_END
28
29__UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
adfe19db 30
4cda2410
P
31#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
32__UNDEFINED__ sv_2uv(sv) ({ SV *_sv = (sv); (UV) (SvNOK(_sv) ? SvNV(_sv) : sv_2nv(_sv)); })
33#else
adfe19db 34__UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
4cda2410
P
35#endif
36
adfe19db
MHM
37__UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv))
38__UNDEFINED__ SvUVXx(sv) SvUVX(sv)
39__UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
4cda2410
P
40
41#if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
42__UNDEFINED__ SvUVx(sv) ({ SV *_sv = (sv)); SvUV(_sv); })
43#else
adfe19db 44__UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv))
4cda2410 45#endif
adfe19db
MHM
46
47/* Hint: sv_uv
48 * Always use the SvUVx() macro instead of sv_uv().
49 */
c4e3740a 50/* Replace sv_uv with SvUVx */
adfe19db
MHM
51__UNDEFINED__ sv_uv(sv) SvUVx(sv)
52
679ad62d
MHM
53#if !defined(SvUOK) && defined(SvIOK_UV)
54# define SvUOK(sv) SvIOK_UV(sv)
55#endif
56
adfe19db
MHM
57__UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) )
58__UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END
59
96ad942f
MHM
60__UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END
61__UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
62
e28192fd
KW
63#if !defined(my_strnlen)
64#if { NEED my_strnlen }
65
66STRLEN
67my_strnlen(const char *str, Size_t maxlen)
68{
69 const char *p = str;
70
71 while(maxlen-- && *p)
72 p++;
73
74 return p - str;
75}
76
77#endif
78#endif
4c28bdc5 79
4a5ab54a 80#if { VERSION < 5.31.4 }
aadf4f9e
N
81 /* Versions prior to this accepted things that are now considered
82 * malformations, and didn't return -1 on error with warnings enabled
83 * */
84# undef utf8_to_uvchr_buf
85#endif
4c28bdc5 86
aadf4f9e
N
87/* This implementation brings modern, generally more restricted standards to
88 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
89 * be done. But its arguable that the others need not, and hence should not.
90 * The reason they're here is that a module that intends to play with the
6858ac42 91 * latest perls should be able to work the same in all releases. An example is
aadf4f9e
N
92 * that perl no longer accepts any UV for a code point, but limits them to
93 * IV_MAX or below. This is for future internal use of the larger code points.
94 * If it turns out that some of these changes are breaking code that isn't
95 * intended to work with modern perls, the tighter restrictions could be
96 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
97
6858ac42
KW
98/* 5.6.0 is the first release with UTF-8, and we don't implement this function
99 * there due to its likely lack of still being in use, and the underlying
100 * implementation is very different from later ones, without the later
101 * safeguards, so would require extra work to deal with */
e4a76720 102#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
aadf4f9e
N
103 /* Choose which underlying implementation to use. At least one must be
104 * present or the perl is too early to handle this function */
105# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
106# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
bbe8b705 107# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
e4a76720 108# else /* Must be at least 5.6.1 from #if above */
612c5515 109# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
aadf4f9e 110# endif
aadf4f9e
N
111# endif
112
aadf4f9e 113# if { NEED utf8_to_uvchr_buf }
4c28bdc5 114
aadf4f9e 115UV
612c5515 116utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
aadf4f9e
N
117{
118 UV ret;
119 STRLEN curlen;
120 bool overflows = 0;
121 const U8 *cur_s = s;
122 const bool do_warnings = ckWARN_d(WARN_UTF8);
19e23593 123# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
05fc829b 124 STRLEN overflow_length = 0;
19e23593 125# endif
aadf4f9e
N
126
127 if (send > s) {
128 curlen = send - s;
129 }
130 else {
131 assert(0); /* Modern perls die under this circumstance */
132 curlen = 0;
133 if (! do_warnings) { /* Handle empty here if no warnings needed */
134 if (retlen) *retlen = 0;
135 return UNICODE_REPLACEMENT;
136 }
137 }
138
8ec545d9 139# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
aadf4f9e 140
6858ac42
KW
141 /* Perl did not properly detect overflow for much of its history on
142 * non-EBCDIC platforms, often returning an overlong value which may or may
143 * not have been tolerated in the call. Also, earlier versions, when they
144 * did detect overflow, may have disallowed it completely. Modern ones can
145 * replace it with the REPLACEMENT CHARACTER, depending on calling
146 * parameters. Therefore detect it ourselves in releases it was
147 * problematic in. */
aadf4f9e 148
05fc829b 149 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
aadf4f9e 150
6858ac42
KW
151 /* First, on a 32-bit machine the first byte being at least \xFE
152 * automatically is overflow, as it indicates something requiring more
153 * than 31 bits */
aadf4f9e
N
154 if (sizeof(ret) < 8) {
155 overflows = 1;
05fc829b 156 overflow_length = 7;
aadf4f9e
N
157 }
158 else {
159 const U8 highest[] = /* 2*63-1 */
160 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
161 const U8 *cur_h = highest;
162
163 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
164 if (UNLIKELY(*cur_s == *cur_h)) {
165 continue;
166 }
167
168 /* If this byte is larger than the corresponding highest UTF-8
6858ac42
KW
169 * byte, the sequence overflows; otherwise the byte is less
170 * than (as we handled the equality case above), and so the
171 * sequence doesn't overflow */
aadf4f9e
N
172 overflows = *cur_s > *cur_h;
173 break;
174
175 }
176
177 /* Here, either we set the bool and broke out of the loop, or got
178 * to the end and all bytes are the same which indicates it doesn't
05fc829b
KW
179 * overflow. If it did overflow, it would be this number of bytes
180 * */
181 overflow_length = 13;
182 }
183 }
184
185 if (UNLIKELY(overflows)) {
186 ret = 0;
187
188 if (! do_warnings && retlen) {
189 *retlen = overflow_length;
aadf4f9e
N
190 }
191 }
05fc829b 192 else
aadf4f9e 193
aadf4f9e
N
194# endif /* < 5.26 */
195
05fc829b
KW
196 /* Here, we are either in a release that properly detects overflow, or
197 * we have checked for overflow and the next statement is executing as
198 * part of the above conditional where we know we don't have overflow.
199 *
200 * The modern versions allow anything that evaluates to a legal UV, but
201 * not overlongs nor an empty input */
bbe8b705 202 ret = D_PPP_utf8_to_uvchr_buf_callee(
05fc829b
KW
203 s, curlen, retlen, (UTF8_ALLOW_ANYUV
204 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
205
206# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
207
208 /* But actually, more modern versions restrict the UV to being no more than
209 * what * an IV can hold, so it could, so it could still have gotten it
210 * wrong about overflowing. */
211 if (UNLIKELY(ret > IV_MAX)) {
212 overflows = 1;
213 }
214
215# endif
216
aadf4f9e
N
217 if (UNLIKELY(overflows)) {
218 if (! do_warnings) {
219 if (retlen) {
bbe8b705
KW
220 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
221 *retlen = D_PPP_MIN(*retlen, curlen);
aadf4f9e
N
222 }
223 return UNICODE_REPLACEMENT;
224 }
225 else {
226
6858ac42 227 /* We use the error message in use from 5.8-5.26 */
de222e8b
KW
228 Perl_warner(aTHX_ packWARN(WARN_UTF8),
229 "Malformed UTF-8 character (overflow at 0x%" UVxf
230 ", byte 0x%02x, after start byte 0x%02x)",
231 ret, *cur_s, *s);
aadf4f9e
N
232 if (retlen) {
233 *retlen = (STRLEN) -1;
234 }
235 return 0;
236 }
237 }
238
6858ac42
KW
239 /* Here, did not overflow, but if it failed for some other reason, and
240 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
241 * try again, allowing anything. (Note a return of 0 is ok if the input
242 * was '\0') */
aadf4f9e
N
243 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
244
245 /* If curlen is 0, we already handled the case where warnings are
6858ac42
KW
246 * disabled, so this 'if' will be true, and so later on, we know that
247 * 's' is dereferencible */
aadf4f9e
N
248 if (do_warnings) {
249 *retlen = (STRLEN) -1;
250 }
251 else {
bbe8b705 252 ret = D_PPP_utf8_to_uvchr_buf_callee(
aadf4f9e
N
253 s, curlen, retlen, UTF8_ALLOW_ANY);
254 /* Override with the REPLACEMENT character, as that is what the
255 * modern version of this function returns */
256 ret = UNICODE_REPLACEMENT;
257
de222e8b 258# if { VERSION < 5.16.0 }
aadf4f9e
N
259
260 /* Versions earlier than this don't necessarily return the proper
261 * length. It should not extend past the end of string, nor past
262 * what the first byte indicates the length is, nor past the
263 * continuation characters */
264 if (retlen && *retlen >= 0) {
f1590d76 265 unsigned int i = 1;
bbe8b705
KW
266
267 *retlen = D_PPP_MIN(*retlen, curlen);
268 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
aadf4f9e
N
269 do {
270 if (s[i] < 0x80 || s[i] > 0xBF) {
271 *retlen = i;
272 break;
273 }
274 } while (++i < *retlen);
275 }
276
de222e8b 277# endif
aadf4f9e
N
278
279 }
280 }
281
282 return ret;
283}
e28192fd 284
aadf4f9e
N
285# endif
286#endif
39d7245c 287
aadf4f9e
N
288#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
289#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
290 to read past a NUL, making it much less likely to read
291 off the end of the buffer. A NUL indicates the start
292 of the next character anyway. If the input isn't
293 NUL-terminated, the function remains unsafe, as it
294 always has been. */
39d7245c
KW
295
296__UNDEFINED__ utf8_to_uvchr(s, lp) \
297 ((*(s) == '\0') \
298 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
299 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
300
aadf4f9e
N
301#endif
302
e28192fd
KW
303=xsinit
304
305#define NEED_my_strnlen
aadf4f9e 306#define NEED_utf8_to_uvchr_buf
e28192fd 307
adfe19db
MHM
308=xsubs
309
310SV *
311sv_setuv(uv)
b2049988
MHM
312 UV uv
313 CODE:
314 RETVAL = newSViv(1);
315 sv_setuv(RETVAL, uv);
316 OUTPUT:
317 RETVAL
adfe19db
MHM
318
319SV *
320newSVuv(uv)
b2049988
MHM
321 UV uv
322 CODE:
323 RETVAL = newSVuv(uv);
324 OUTPUT:
325 RETVAL
adfe19db
MHM
326
327UV
328sv_2uv(sv)
b2049988
MHM
329 SV *sv
330 CODE:
331 RETVAL = sv_2uv(sv);
332 OUTPUT:
333 RETVAL
adfe19db
MHM
334
335UV
336SvUVx(sv)
b2049988
MHM
337 SV *sv
338 CODE:
339 sv--;
340 RETVAL = SvUVx(++sv);
341 OUTPUT:
342 RETVAL
adfe19db
MHM
343
344void
345XSRETURN_UV()
b2049988
MHM
346 PPCODE:
347 XSRETURN_UV(42);
adfe19db 348
96ad942f
MHM
349void
350PUSHu()
b2049988
MHM
351 PREINIT:
352 dTARG;
353 PPCODE:
354 TARG = sv_newmortal();
355 EXTEND(SP, 1);
356 PUSHu(42);
357 XSRETURN(1);
96ad942f
MHM
358
359void
360XPUSHu()
b2049988
MHM
361 PREINIT:
362 dTARG;
363 PPCODE:
364 TARG = sv_newmortal();
365 XPUSHu(43);
366 XSRETURN(1);
96ad942f 367
e28192fd
KW
368STRLEN
369my_strnlen(s, max)
370 char * s
371 STRLEN max
372 CODE:
373 RETVAL= my_strnlen(s, max);
374 OUTPUT:
375 RETVAL
376
5ef77e59 377=tests plan => 11
adfe19db 378
f5feb9e7 379BEGIN { require warnings if "$]" > '5.006' }
933ffe6b 380
adfe19db
MHM
381ok(&Devel::PPPort::sv_setuv(42), 42);
382ok(&Devel::PPPort::newSVuv(123), 123);
383ok(&Devel::PPPort::sv_2uv("4711"), 4711);
384ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
385ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
386ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
387ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
388ok(&Devel::PPPort::XSRETURN_UV(), 42);
96ad942f
MHM
389ok(&Devel::PPPort::PUSHu(), 42);
390ok(&Devel::PPPort::XPUSHu(), 43);
e28192fd 391ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
aadf4f9e 392
89d8c7cd
KW
393# skip tests on 5.6.0 and earlier
394if ("$]" le '5.006') {
933ffe6b 395 skip 'skip: broken utf8 support', 0 for 1..51;
89d8c7cd
KW
396 exit;
397}
398
933ffe6b
P
399ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
400ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
401
aadf4f9e 402my $ret = &Devel::PPPort::utf8_to_uvchr("A");
4c28bdc5
KW
403ok($ret->[0], ord("A"));
404ok($ret->[1], 1);
aadf4f9e
N
405
406$ret = &Devel::PPPort::utf8_to_uvchr("\0");
4c28bdc5
KW
407ok($ret->[0], 0);
408ok($ret->[1], 1);
aadf4f9e
N
409
410$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
39d7245c
KW
411ok($ret->[0], ord("A"));
412ok($ret->[1], 1);
aadf4f9e
N
413
414$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
39d7245c
KW
415ok($ret->[0], 0);
416ok($ret->[1], 1);
aadf4f9e
N
417
418if (ord("A") != 65) { # tests not valid for EBCDIC
5f93efa0 419 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
aadf4f9e
N
420}
421else {
422 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
423 ok($ret->[0], 0x100);
424 ok($ret->[1], 2);
425
426 my @warnings;
427 local $SIG{__WARN__} = sub { push @warnings, @_; };
428
429 {
933ffe6b 430 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
aadf4f9e
N
431 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
432 ok($ret->[0], 0);
433 ok($ret->[1], -1);
434
933ffe6b 435 BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
aadf4f9e
N
436 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
437 ok($ret->[0], 0xFFFD);
438 ok($ret->[1], 1);
439 }
440
441 my @buf_tests = (
442 {
443 input => "A",
444 adjustment => -1,
445 warning => qr/empty/,
446 no_warnings_returned_length => 0,
447 },
448 {
449 input => "\xc4\xc5",
450 adjustment => 0,
451 warning => qr/non-continuation/,
452 no_warnings_returned_length => 1,
453 },
454 {
455 input => "\xc4\x80",
456 adjustment => -1,
457 warning => qr/short|1 byte, need 2/,
458 no_warnings_returned_length => 1,
459 },
460 {
461 input => "\xc0\x81",
462 adjustment => 0,
463 warning => qr/overlong|2 bytes, need 1/,
464 no_warnings_returned_length => 2,
465 },
5f93efa0
KW
466 {
467 input => "\xe0\x80\x81",
468 adjustment => 0,
469 warning => qr/overlong|3 bytes, need 1/,
470 no_warnings_returned_length => 3,
471 },
472 {
473 input => "\xf0\x80\x80\x81",
474 adjustment => 0,
475 warning => qr/overlong|4 bytes, need 1/,
476 no_warnings_returned_length => 4,
477 },
6858ac42 478 { # Old algorithm failed to detect this
aadf4f9e
N
479 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
480 adjustment => 0,
05fc829b 481 warning => qr/overflow/,
aadf4f9e
N
482 no_warnings_returned_length => 13,
483 },
484 );
485
486 # An empty input is an assertion failure on debugging builds. It is
487 # deliberately the first test.
488 require Config; import Config;
489 use vars '%Config';
72c9c989
CB
490 if ($Config{ccflags} =~ /-DDEBUGGING/
491 || $^O eq 'VMS' && $Config{usedebugging_perl} eq 'Y') {
aadf4f9e
N
492 shift @buf_tests;
493 ok(1, 1) for 1..5;
494 }
495
496 for my $test (@buf_tests) {
497 my $input = $test->{'input'};
498 my $adjustment = $test->{'adjustment'};
499 my $display = 'utf8_to_uvchr_buf("';
500 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
501 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
502 }
503
504 $display .= '")';
505 my $warning = $test->{'warning'};
506
507 undef @warnings;
933ffe6b 508 BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
aadf4f9e
N
509 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
510 ok($ret->[0], 0, "returned value $display; warnings enabled");
511 ok($ret->[1], -1, "returned length $display; warnings enabled");
512 my $all_warnings = join "; ", @warnings;
513 my $contains = grep { $_ =~ $warning } $all_warnings;
17c135f5
KW
514 ok($contains, 1, $display
515 . "; Got: '$all_warnings', which should contain '$warning'");
aadf4f9e
N
516
517 undef @warnings;
933ffe6b 518 BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
aadf4f9e
N
519 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
520 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
521 ok($ret->[1], $test->{'no_warnings_returned_length'},
522 "returned length $display; warnings disabled");
523 }
524}