This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/inc/uv: Add two tests of overlongs
[perl5.git] / dist / Devel-PPPort / parts / inc / uv
CommitLineData
adfe19db
MHM
1################################################################################
2##
b2049988 3## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
adfe19db
MHM
4## Version 2.x, Copyright (C) 2001, Paul Marquess.
5## Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
6##
7## This program is free software; you can redistribute it and/or
8## modify it under the same terms as Perl itself.
9##
10################################################################################
11
12=provides
13
adfe19db 14__UNDEFINED__
e28192fd 15my_strnlen
679ad62d 16SvUOK
aadf4f9e
N
17utf8_to_uvchr_buf
18
19=dontwarn
20
21_ppport_utf8_to_uvchr_buf_callee
22_ppport_MIN
adfe19db
MHM
23
24=implementation
25
aadf4f9e
N
26#define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b))
27
0d0f8426
MHM
28__UNDEFINED__ sv_setuv(sv, uv) \
29 STMT_START { \
30 UV TeMpUv = uv; \
31 if (TeMpUv <= IV_MAX) \
32 sv_setiv(sv, TeMpUv); \
33 else \
34 sv_setnv(sv, (double)TeMpUv); \
35 } STMT_END
36
37__UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
adfe19db
MHM
38
39__UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
40__UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv))
41__UNDEFINED__ SvUVXx(sv) SvUVX(sv)
42__UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
43__UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv))
44
45/* Hint: sv_uv
46 * Always use the SvUVx() macro instead of sv_uv().
47 */
48__UNDEFINED__ sv_uv(sv) SvUVx(sv)
49
679ad62d
MHM
50#if !defined(SvUOK) && defined(SvIOK_UV)
51# define SvUOK(sv) SvIOK_UV(sv)
52#endif
53
adfe19db
MHM
54__UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) )
55__UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END
56
96ad942f
MHM
57__UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END
58__UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
59
f2e3e4ce 60#if defined UTF8SKIP
94a2c4f7
KW
61
62/* Don't use official version because it uses MIN, which may not be available */
63#undef UTF8_SAFE_SKIP
64
aadf4f9e
N
65__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
66 ((((e) - (s)) <= 0) \
f2e3e4ce 67 ? 0 \
aadf4f9e 68 : _ppport_MIN(((e) - (s)), UTF8SKIP(s))))
f2e3e4ce
KW
69#endif
70
e28192fd
KW
71#if !defined(my_strnlen)
72#if { NEED my_strnlen }
73
74STRLEN
75my_strnlen(const char *str, Size_t maxlen)
76{
77 const char *p = str;
78
79 while(maxlen-- && *p)
80 p++;
81
82 return p - str;
83}
84
85#endif
86#endif
4c28bdc5 87
97d83eb2 88#if { VERSION < 5.31.3 }
aadf4f9e
N
89 /* Versions prior to this accepted things that are now considered
90 * malformations, and didn't return -1 on error with warnings enabled
91 * */
92# undef utf8_to_uvchr_buf
93#endif
4c28bdc5 94
aadf4f9e
N
95/* This implementation brings modern, generally more restricted standards to
96 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
97 * be done. But its arguable that the others need not, and hence should not.
98 * The reason they're here is that a module that intends to play with the
99 * latest perls shoud be able to work the same in all releases. An example is
100 * that perl no longer accepts any UV for a code point, but limits them to
101 * IV_MAX or below. This is for future internal use of the larger code points.
102 * If it turns out that some of these changes are breaking code that isn't
103 * intended to work with modern perls, the tighter restrictions could be
104 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
105
9aa6a863
N
106#if { VERSION < 5.10.0 }
107# define D_PPP_CU8 U8
108#else
109# define D_PPP_CU8 const U8
110#endif
111
aadf4f9e
N
112#ifndef utf8_to_uvchr_buf
113 /* Choose which underlying implementation to use. At least one must be
114 * present or the perl is too early to handle this function */
115# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
116# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
117# define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr
9aa6a863 118# elif { VERSION >= 5.6.1 }
aadf4f9e
N
119# define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv
120# endif
121
122# endif
123
124#ifdef _ppport_utf8_to_uvchr_buf_callee
125# if { NEED utf8_to_uvchr_buf }
4c28bdc5 126
aadf4f9e 127UV
9aa6a863 128utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen)
aadf4f9e
N
129{
130 UV ret;
131 STRLEN curlen;
132 bool overflows = 0;
133 const U8 *cur_s = s;
134 const bool do_warnings = ckWARN_d(WARN_UTF8);
135
136 if (send > s) {
137 curlen = send - s;
138 }
139 else {
140 assert(0); /* Modern perls die under this circumstance */
141 curlen = 0;
142 if (! do_warnings) { /* Handle empty here if no warnings needed */
143 if (retlen) *retlen = 0;
144 return UNICODE_REPLACEMENT;
145 }
146 }
147
148 /* The modern version allows anything that evaluates to a legal UV, but not
149 * overlongs nor an empty input */
150 ret = _ppport_utf8_to_uvchr_buf_callee(
151 s, curlen, retlen, (UTF8_ALLOW_ANYUV
152 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
153
154 /* But actually, modern versions restrict the UV to being no more than what
155 * an IV can hold */
156 if (ret > PERL_INT_MAX) {
157 overflows = 1;
158 }
159
160# if { VERSION < 5.26.0 }
161# ifndef EBCDIC
162
163 /* There are bugs in versions earlier than this on non-EBCDIC platforms
164 * in which it did not detect all instances of overflow, which could be
165 * a security hole. Also, earlier versions did not allow the overflow
166 * malformation under any circumstances, and modern ones do. So we
167 * need to check here. */
168
169 else if (curlen > 0 && *s >= 0xFE) {
170
171 /* If the main routine detected overflow, great; it returned 0. But if the
172 * input's first byte indicates it could overflow, we need to verify.
173 * First, on a 32-bit machine the first byte being at least \xFE
174 * automatically is overflow */
175 if (sizeof(ret) < 8) {
176 overflows = 1;
177 }
178 else {
179 const U8 highest[] = /* 2*63-1 */
180 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
181 const U8 *cur_h = highest;
182
183 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
184 if (UNLIKELY(*cur_s == *cur_h)) {
185 continue;
186 }
187
188 /* If this byte is larger than the corresponding highest UTF-8
189 * byte, the sequence overflows; otherwise the byte is less than
190 * (as we handled the equality case above), and so the sequence
191 * doesn't overflow */
192 overflows = *cur_s > *cur_h;
193 break;
194
195 }
196
197 /* Here, either we set the bool and broke out of the loop, or got
198 * to the end and all bytes are the same which indicates it doesn't
199 * overflow. */
200 }
201 }
202
203# endif
204# endif /* < 5.26 */
205
206 if (UNLIKELY(overflows)) {
207 if (! do_warnings) {
208 if (retlen) {
209 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
210 *retlen = _ppport_MIN(*retlen, curlen);
211 }
212 return UNICODE_REPLACEMENT;
213 }
214 else {
215
216 /* On versions that correctly detect overflow, but forbid it
217 * always, 0 will be returned, but also a warning will have been
218 * raised. Don't repeat it */
219 if (ret != 0) {
220 /* We use the error message in use from 5.8-5.14 */
221 Perl_warner(aTHX_ packWARN(WARN_UTF8),
222 "Malformed UTF-8 character (overflow at 0x%" UVxf
223 ", byte 0x%02x, after start byte 0x%02x)",
224 ret, *cur_s, *s);
225 }
226 if (retlen) {
227 *retlen = (STRLEN) -1;
228 }
229 return 0;
230 }
231 }
232
233 /* If failed and warnings are off, to emulate the behavior of the real
234 * utf8_to_uvchr(), try again, allowing anything. (Note a return of 0 is
235 * ok if the input was '\0') */
236 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
237
238 /* If curlen is 0, we already handled the case where warnings are
239 * disabled, so this 'if' will be true, and we won't look at the
240 * contents of 's' */
241 if (do_warnings) {
242 *retlen = (STRLEN) -1;
243 }
244 else {
245 ret = _ppport_utf8_to_uvchr_buf_callee(
246 s, curlen, retlen, UTF8_ALLOW_ANY);
247 /* Override with the REPLACEMENT character, as that is what the
248 * modern version of this function returns */
249 ret = UNICODE_REPLACEMENT;
250
251# if { VERSION < 5.16.0 }
252
253 /* Versions earlier than this don't necessarily return the proper
254 * length. It should not extend past the end of string, nor past
255 * what the first byte indicates the length is, nor past the
256 * continuation characters */
257 if (retlen && *retlen >= 0) {
f1590d76 258 unsigned int i = 1;
aadf4f9e
N
259 *retlen = _ppport_MIN(*retlen, curlen);
260 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
aadf4f9e
N
261 do {
262 if (s[i] < 0x80 || s[i] > 0xBF) {
263 *retlen = i;
264 break;
265 }
266 } while (++i < *retlen);
267 }
268
269# endif
270
271 }
272 }
273
274 return ret;
275}
e28192fd 276
aadf4f9e
N
277# endif
278#endif
279#endif
39d7245c 280
aadf4f9e
N
281#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
282#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
283 to read past a NUL, making it much less likely to read
284 off the end of the buffer. A NUL indicates the start
285 of the next character anyway. If the input isn't
286 NUL-terminated, the function remains unsafe, as it
287 always has been. */
39d7245c
KW
288
289__UNDEFINED__ utf8_to_uvchr(s, lp) \
290 ((*(s) == '\0') \
291 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
292 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
293
aadf4f9e
N
294#endif
295
e28192fd
KW
296=xsinit
297
298#define NEED_my_strnlen
aadf4f9e 299#define NEED_utf8_to_uvchr_buf
e28192fd 300
adfe19db
MHM
301=xsubs
302
303SV *
304sv_setuv(uv)
b2049988
MHM
305 UV uv
306 CODE:
307 RETVAL = newSViv(1);
308 sv_setuv(RETVAL, uv);
309 OUTPUT:
310 RETVAL
adfe19db
MHM
311
312SV *
313newSVuv(uv)
b2049988
MHM
314 UV uv
315 CODE:
316 RETVAL = newSVuv(uv);
317 OUTPUT:
318 RETVAL
adfe19db
MHM
319
320UV
321sv_2uv(sv)
b2049988
MHM
322 SV *sv
323 CODE:
324 RETVAL = sv_2uv(sv);
325 OUTPUT:
326 RETVAL
adfe19db
MHM
327
328UV
329SvUVx(sv)
b2049988
MHM
330 SV *sv
331 CODE:
332 sv--;
333 RETVAL = SvUVx(++sv);
334 OUTPUT:
335 RETVAL
adfe19db
MHM
336
337void
338XSRETURN_UV()
b2049988
MHM
339 PPCODE:
340 XSRETURN_UV(42);
adfe19db 341
96ad942f
MHM
342void
343PUSHu()
b2049988
MHM
344 PREINIT:
345 dTARG;
346 PPCODE:
347 TARG = sv_newmortal();
348 EXTEND(SP, 1);
349 PUSHu(42);
350 XSRETURN(1);
96ad942f
MHM
351
352void
353XPUSHu()
b2049988
MHM
354 PREINIT:
355 dTARG;
356 PPCODE:
357 TARG = sv_newmortal();
358 XPUSHu(43);
359 XSRETURN(1);
96ad942f 360
f2e3e4ce
KW
361STRLEN
362UTF8_SAFE_SKIP(s, adjustment)
363 unsigned char * s
364 int adjustment
365 CODE:
366 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
9aa6a863 367#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
f2e3e4ce 368 RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment);
9aa6a863
N
369#else
370 RETVAL = 0;
371#endif
f2e3e4ce
KW
372 OUTPUT:
373 RETVAL
374
e28192fd
KW
375STRLEN
376my_strnlen(s, max)
377 char * s
378 STRLEN max
379 CODE:
380 RETVAL= my_strnlen(s, max);
381 OUTPUT:
382 RETVAL
383
4c28bdc5 384AV *
aadf4f9e 385utf8_to_uvchr_buf(s, adjustment)
4c28bdc5 386 unsigned char *s
aadf4f9e 387 int adjustment
4c28bdc5
KW
388 PREINIT:
389 AV *av;
390 STRLEN len;
391 CODE:
392 av = newAV();
9aa6a863 393#ifdef utf8_to_uvchr_buf
aadf4f9e
N
394 av_push(av, newSVuv(utf8_to_uvchr_buf(s,
395 s + UTF8SKIP(s) + adjustment,
396 &len)));
9aa6a863
N
397#else
398 av_push(av, newSVuv(0));
399 len = (STRLEN) -1;
400#endif
721db7b0
N
401 if (len == (STRLEN) -1) {
402 av_push(av, newSViv(-1));
403 }
404 else {
405 av_push(av, newSVuv(len));
406 }
4c28bdc5
KW
407 RETVAL = av;
408 OUTPUT:
409 RETVAL
410
39d7245c
KW
411AV *
412utf8_to_uvchr(s)
413 unsigned char *s
414 PREINIT:
415 AV *av;
416 STRLEN len;
417 CODE:
418 av = newAV();
9aa6a863 419#ifdef utf8_to_uvchr
39d7245c 420 av_push(av, newSVuv(utf8_to_uvchr(s, &len)));
9aa6a863
N
421#else
422 av_push(av, newSVuv(0));
423 len = (STRLEN) -1;
424#endif
721db7b0
N
425 if (len == (STRLEN) -1) {
426 av_push(av, newSViv(-1));
427 }
428 else {
429 av_push(av, newSVuv(len));
430 }
39d7245c
KW
431 RETVAL = av;
432 OUTPUT:
433 RETVAL
434
5f93efa0 435=tests plan => 62
adfe19db
MHM
436
437ok(&Devel::PPPort::sv_setuv(42), 42);
438ok(&Devel::PPPort::newSVuv(123), 123);
439ok(&Devel::PPPort::sv_2uv("4711"), 4711);
440ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
441ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
442ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
443ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
444ok(&Devel::PPPort::XSRETURN_UV(), 42);
96ad942f
MHM
445ok(&Devel::PPPort::PUSHu(), 42);
446ok(&Devel::PPPort::XPUSHu(), 43);
f2e3e4ce
KW
447ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
448ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
e28192fd 449ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
aadf4f9e 450
89d8c7cd
KW
451# skip tests on 5.6.0 and earlier
452if ("$]" le '5.006') {
5f93efa0 453 skip 'skip: broken utf8 support', 0 for 1..49;
89d8c7cd
KW
454 exit;
455}
456
aadf4f9e 457my $ret = &Devel::PPPort::utf8_to_uvchr("A");
4c28bdc5
KW
458ok($ret->[0], ord("A"));
459ok($ret->[1], 1);
aadf4f9e
N
460
461$ret = &Devel::PPPort::utf8_to_uvchr("\0");
4c28bdc5
KW
462ok($ret->[0], 0);
463ok($ret->[1], 1);
aadf4f9e
N
464
465$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
39d7245c
KW
466ok($ret->[0], ord("A"));
467ok($ret->[1], 1);
aadf4f9e
N
468
469$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
39d7245c
KW
470ok($ret->[0], 0);
471ok($ret->[1], 1);
aadf4f9e
N
472
473if (ord("A") != 65) { # tests not valid for EBCDIC
5f93efa0 474 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
aadf4f9e
N
475}
476else {
477 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
478 ok($ret->[0], 0x100);
479 ok($ret->[1], 2);
480
481 my @warnings;
482 local $SIG{__WARN__} = sub { push @warnings, @_; };
483
484 {
485 use warnings 'utf8';
486 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
487 ok($ret->[0], 0);
488 ok($ret->[1], -1);
489
490 no warnings;
491 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
492 ok($ret->[0], 0xFFFD);
493 ok($ret->[1], 1);
494 }
495
496 my @buf_tests = (
497 {
498 input => "A",
499 adjustment => -1,
500 warning => qr/empty/,
501 no_warnings_returned_length => 0,
502 },
503 {
504 input => "\xc4\xc5",
505 adjustment => 0,
506 warning => qr/non-continuation/,
507 no_warnings_returned_length => 1,
508 },
509 {
510 input => "\xc4\x80",
511 adjustment => -1,
512 warning => qr/short|1 byte, need 2/,
513 no_warnings_returned_length => 1,
514 },
515 {
516 input => "\xc0\x81",
517 adjustment => 0,
518 warning => qr/overlong|2 bytes, need 1/,
519 no_warnings_returned_length => 2,
520 },
5f93efa0
KW
521 {
522 input => "\xe0\x80\x81",
523 adjustment => 0,
524 warning => qr/overlong|3 bytes, need 1/,
525 no_warnings_returned_length => 3,
526 },
527 {
528 input => "\xf0\x80\x80\x81",
529 adjustment => 0,
530 warning => qr/overlong|4 bytes, need 1/,
531 no_warnings_returned_length => 4,
532 },
aadf4f9e
N
533 { # Old algorithm supposedly failed to detect this
534 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
535 adjustment => 0,
9aa6a863 536 warning => ("$]" le 5.008006) ? qr/Malformed UTF-8 character/ : qr/overflow/,
aadf4f9e
N
537 no_warnings_returned_length => 13,
538 },
539 );
540
541 # An empty input is an assertion failure on debugging builds. It is
542 # deliberately the first test.
543 require Config; import Config;
544 use vars '%Config';
545 if ($Config{ccflags} =~ /-DDEBUGGING/) {
546 shift @buf_tests;
547 ok(1, 1) for 1..5;
548 }
549
550 for my $test (@buf_tests) {
551 my $input = $test->{'input'};
552 my $adjustment = $test->{'adjustment'};
553 my $display = 'utf8_to_uvchr_buf("';
554 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
555 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
556 }
557
558 $display .= '")';
559 my $warning = $test->{'warning'};
560
561 undef @warnings;
562 use warnings 'utf8';
563 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
564 ok($ret->[0], 0, "returned value $display; warnings enabled");
565 ok($ret->[1], -1, "returned length $display; warnings enabled");
566 my $all_warnings = join "; ", @warnings;
567 my $contains = grep { $_ =~ $warning } $all_warnings;
17c135f5
KW
568 ok($contains, 1, $display
569 . "; Got: '$all_warnings', which should contain '$warning'");
aadf4f9e
N
570
571 undef @warnings;
572 no warnings 'utf8';
573 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
574 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
575 ok($ret->[1], $test->{'no_warnings_returned_length'},
576 "returned length $display; warnings disabled");
577 }
578}