This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/inc/uv: White-space changes only
[perl5.git] / dist / Devel-PPPort / parts / inc / uv
CommitLineData
adfe19db
MHM
1################################################################################
2##
b2049988 3## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
adfe19db
MHM
4## Version 2.x, Copyright (C) 2001, Paul Marquess.
5## Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
6##
7## This program is free software; you can redistribute it and/or
8## modify it under the same terms as Perl itself.
9##
10################################################################################
11
12=provides
13
adfe19db 14__UNDEFINED__
e28192fd 15my_strnlen
679ad62d 16SvUOK
aadf4f9e
N
17utf8_to_uvchr_buf
18
19=dontwarn
20
21_ppport_utf8_to_uvchr_buf_callee
22_ppport_MIN
adfe19db
MHM
23
24=implementation
25
aadf4f9e
N
26#define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b))
27
0d0f8426
MHM
28__UNDEFINED__ sv_setuv(sv, uv) \
29 STMT_START { \
30 UV TeMpUv = uv; \
31 if (TeMpUv <= IV_MAX) \
32 sv_setiv(sv, TeMpUv); \
33 else \
34 sv_setnv(sv, (double)TeMpUv); \
35 } STMT_END
36
37__UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
adfe19db
MHM
38
39__UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
40__UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv))
41__UNDEFINED__ SvUVXx(sv) SvUVX(sv)
42__UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
43__UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv))
44
45/* Hint: sv_uv
46 * Always use the SvUVx() macro instead of sv_uv().
47 */
48__UNDEFINED__ sv_uv(sv) SvUVx(sv)
49
679ad62d
MHM
50#if !defined(SvUOK) && defined(SvIOK_UV)
51# define SvUOK(sv) SvIOK_UV(sv)
52#endif
53
adfe19db
MHM
54__UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) )
55__UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END
56
96ad942f
MHM
57__UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END
58__UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
59
f2e3e4ce 60#if defined UTF8SKIP
94a2c4f7
KW
61
62/* Don't use official version because it uses MIN, which may not be available */
63#undef UTF8_SAFE_SKIP
64
aadf4f9e
N
65__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
66 ((((e) - (s)) <= 0) \
f2e3e4ce 67 ? 0 \
aadf4f9e 68 : _ppport_MIN(((e) - (s)), UTF8SKIP(s))))
f2e3e4ce
KW
69#endif
70
e28192fd
KW
71#if !defined(my_strnlen)
72#if { NEED my_strnlen }
73
74STRLEN
75my_strnlen(const char *str, Size_t maxlen)
76{
77 const char *p = str;
78
79 while(maxlen-- && *p)
80 p++;
81
82 return p - str;
83}
84
85#endif
86#endif
4c28bdc5 87
97d83eb2 88#if { VERSION < 5.31.3 }
aadf4f9e
N
89 /* Versions prior to this accepted things that are now considered
90 * malformations, and didn't return -1 on error with warnings enabled
91 * */
92# undef utf8_to_uvchr_buf
93#endif
4c28bdc5 94
aadf4f9e
N
95/* This implementation brings modern, generally more restricted standards to
96 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
97 * be done. But its arguable that the others need not, and hence should not.
98 * The reason they're here is that a module that intends to play with the
6858ac42 99 * latest perls should be able to work the same in all releases. An example is
aadf4f9e
N
100 * that perl no longer accepts any UV for a code point, but limits them to
101 * IV_MAX or below. This is for future internal use of the larger code points.
102 * If it turns out that some of these changes are breaking code that isn't
103 * intended to work with modern perls, the tighter restrictions could be
104 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
105
6858ac42
KW
106/* 5.6.0 is the first release with UTF-8, and we don't implement this function
107 * there due to its likely lack of still being in use, and the underlying
108 * implementation is very different from later ones, without the later
109 * safeguards, so would require extra work to deal with */
e4a76720 110#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
de222e8b
KW
111# if { VERSION < 5.10.0 } /* Was non-const before this */
112# define D_PPP_CU8 U8
113# else
114# define D_PPP_CU8 const U8
115# endif
9aa6a863 116
aadf4f9e
N
117 /* Choose which underlying implementation to use. At least one must be
118 * present or the perl is too early to handle this function */
119# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
120# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
121# define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr
e4a76720 122# else /* Must be at least 5.6.1 from #if above */
aadf4f9e
N
123# define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv
124# endif
aadf4f9e
N
125# endif
126
aadf4f9e 127# if { NEED utf8_to_uvchr_buf }
4c28bdc5 128
aadf4f9e 129UV
9aa6a863 130utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen)
aadf4f9e
N
131{
132 UV ret;
133 STRLEN curlen;
134 bool overflows = 0;
135 const U8 *cur_s = s;
136 const bool do_warnings = ckWARN_d(WARN_UTF8);
05fc829b 137 STRLEN overflow_length = 0;
aadf4f9e
N
138
139 if (send > s) {
140 curlen = send - s;
141 }
142 else {
143 assert(0); /* Modern perls die under this circumstance */
144 curlen = 0;
145 if (! do_warnings) { /* Handle empty here if no warnings needed */
146 if (retlen) *retlen = 0;
147 return UNICODE_REPLACEMENT;
148 }
149 }
150
8ec545d9 151# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
aadf4f9e 152
6858ac42
KW
153 /* Perl did not properly detect overflow for much of its history on
154 * non-EBCDIC platforms, often returning an overlong value which may or may
155 * not have been tolerated in the call. Also, earlier versions, when they
156 * did detect overflow, may have disallowed it completely. Modern ones can
157 * replace it with the REPLACEMENT CHARACTER, depending on calling
158 * parameters. Therefore detect it ourselves in releases it was
159 * problematic in. */
aadf4f9e 160
05fc829b 161 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
aadf4f9e 162
6858ac42
KW
163 /* First, on a 32-bit machine the first byte being at least \xFE
164 * automatically is overflow, as it indicates something requiring more
165 * than 31 bits */
aadf4f9e
N
166 if (sizeof(ret) < 8) {
167 overflows = 1;
05fc829b 168 overflow_length = 7;
aadf4f9e
N
169 }
170 else {
171 const U8 highest[] = /* 2*63-1 */
172 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
173 const U8 *cur_h = highest;
174
175 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
176 if (UNLIKELY(*cur_s == *cur_h)) {
177 continue;
178 }
179
180 /* If this byte is larger than the corresponding highest UTF-8
6858ac42
KW
181 * byte, the sequence overflows; otherwise the byte is less
182 * than (as we handled the equality case above), and so the
183 * sequence doesn't overflow */
aadf4f9e
N
184 overflows = *cur_s > *cur_h;
185 break;
186
187 }
188
189 /* Here, either we set the bool and broke out of the loop, or got
190 * to the end and all bytes are the same which indicates it doesn't
05fc829b
KW
191 * overflow. If it did overflow, it would be this number of bytes
192 * */
193 overflow_length = 13;
194 }
195 }
196
197 if (UNLIKELY(overflows)) {
198 ret = 0;
199
200 if (! do_warnings && retlen) {
201 *retlen = overflow_length;
aadf4f9e
N
202 }
203 }
05fc829b 204 else
aadf4f9e 205
aadf4f9e
N
206# endif /* < 5.26 */
207
05fc829b
KW
208 /* Here, we are either in a release that properly detects overflow, or
209 * we have checked for overflow and the next statement is executing as
210 * part of the above conditional where we know we don't have overflow.
211 *
212 * The modern versions allow anything that evaluates to a legal UV, but
213 * not overlongs nor an empty input */
214 ret = _ppport_utf8_to_uvchr_buf_callee(
215 s, curlen, retlen, (UTF8_ALLOW_ANYUV
216 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
217
218# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
219
220 /* But actually, more modern versions restrict the UV to being no more than
221 * what * an IV can hold, so it could, so it could still have gotten it
222 * wrong about overflowing. */
223 if (UNLIKELY(ret > IV_MAX)) {
224 overflows = 1;
225 }
226
227# endif
228
aadf4f9e
N
229 if (UNLIKELY(overflows)) {
230 if (! do_warnings) {
231 if (retlen) {
232 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
233 *retlen = _ppport_MIN(*retlen, curlen);
234 }
235 return UNICODE_REPLACEMENT;
236 }
237 else {
238
6858ac42 239 /* We use the error message in use from 5.8-5.26 */
de222e8b
KW
240 Perl_warner(aTHX_ packWARN(WARN_UTF8),
241 "Malformed UTF-8 character (overflow at 0x%" UVxf
242 ", byte 0x%02x, after start byte 0x%02x)",
243 ret, *cur_s, *s);
aadf4f9e
N
244 if (retlen) {
245 *retlen = (STRLEN) -1;
246 }
247 return 0;
248 }
249 }
250
6858ac42
KW
251 /* Here, did not overflow, but if it failed for some other reason, and
252 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
253 * try again, allowing anything. (Note a return of 0 is ok if the input
254 * was '\0') */
aadf4f9e
N
255 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
256
257 /* If curlen is 0, we already handled the case where warnings are
6858ac42
KW
258 * disabled, so this 'if' will be true, and so later on, we know that
259 * 's' is dereferencible */
aadf4f9e
N
260 if (do_warnings) {
261 *retlen = (STRLEN) -1;
262 }
263 else {
264 ret = _ppport_utf8_to_uvchr_buf_callee(
265 s, curlen, retlen, UTF8_ALLOW_ANY);
266 /* Override with the REPLACEMENT character, as that is what the
267 * modern version of this function returns */
268 ret = UNICODE_REPLACEMENT;
269
de222e8b 270# if { VERSION < 5.16.0 }
aadf4f9e
N
271
272 /* Versions earlier than this don't necessarily return the proper
273 * length. It should not extend past the end of string, nor past
274 * what the first byte indicates the length is, nor past the
275 * continuation characters */
276 if (retlen && *retlen >= 0) {
f1590d76 277 unsigned int i = 1;
aadf4f9e
N
278 *retlen = _ppport_MIN(*retlen, curlen);
279 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
aadf4f9e
N
280 do {
281 if (s[i] < 0x80 || s[i] > 0xBF) {
282 *retlen = i;
283 break;
284 }
285 } while (++i < *retlen);
286 }
287
de222e8b 288# endif
aadf4f9e
N
289
290 }
291 }
292
293 return ret;
294}
e28192fd 295
aadf4f9e
N
296# endif
297#endif
39d7245c 298
aadf4f9e
N
299#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
300#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
301 to read past a NUL, making it much less likely to read
302 off the end of the buffer. A NUL indicates the start
303 of the next character anyway. If the input isn't
304 NUL-terminated, the function remains unsafe, as it
305 always has been. */
39d7245c
KW
306
307__UNDEFINED__ utf8_to_uvchr(s, lp) \
308 ((*(s) == '\0') \
309 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
310 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
311
aadf4f9e
N
312#endif
313
e28192fd
KW
314=xsinit
315
316#define NEED_my_strnlen
aadf4f9e 317#define NEED_utf8_to_uvchr_buf
e28192fd 318
adfe19db
MHM
319=xsubs
320
321SV *
322sv_setuv(uv)
b2049988
MHM
323 UV uv
324 CODE:
325 RETVAL = newSViv(1);
326 sv_setuv(RETVAL, uv);
327 OUTPUT:
328 RETVAL
adfe19db
MHM
329
330SV *
331newSVuv(uv)
b2049988
MHM
332 UV uv
333 CODE:
334 RETVAL = newSVuv(uv);
335 OUTPUT:
336 RETVAL
adfe19db
MHM
337
338UV
339sv_2uv(sv)
b2049988
MHM
340 SV *sv
341 CODE:
342 RETVAL = sv_2uv(sv);
343 OUTPUT:
344 RETVAL
adfe19db
MHM
345
346UV
347SvUVx(sv)
b2049988
MHM
348 SV *sv
349 CODE:
350 sv--;
351 RETVAL = SvUVx(++sv);
352 OUTPUT:
353 RETVAL
adfe19db
MHM
354
355void
356XSRETURN_UV()
b2049988
MHM
357 PPCODE:
358 XSRETURN_UV(42);
adfe19db 359
96ad942f
MHM
360void
361PUSHu()
b2049988
MHM
362 PREINIT:
363 dTARG;
364 PPCODE:
365 TARG = sv_newmortal();
366 EXTEND(SP, 1);
367 PUSHu(42);
368 XSRETURN(1);
96ad942f
MHM
369
370void
371XPUSHu()
b2049988
MHM
372 PREINIT:
373 dTARG;
374 PPCODE:
375 TARG = sv_newmortal();
376 XPUSHu(43);
377 XSRETURN(1);
96ad942f 378
f2e3e4ce
KW
379STRLEN
380UTF8_SAFE_SKIP(s, adjustment)
381 unsigned char * s
382 int adjustment
383 CODE:
384 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
9aa6a863 385#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
f2e3e4ce 386 RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment);
9aa6a863
N
387#else
388 RETVAL = 0;
389#endif
f2e3e4ce
KW
390 OUTPUT:
391 RETVAL
392
e28192fd
KW
393STRLEN
394my_strnlen(s, max)
395 char * s
396 STRLEN max
397 CODE:
398 RETVAL= my_strnlen(s, max);
399 OUTPUT:
400 RETVAL
401
4c28bdc5 402AV *
aadf4f9e 403utf8_to_uvchr_buf(s, adjustment)
4c28bdc5 404 unsigned char *s
aadf4f9e 405 int adjustment
4c28bdc5
KW
406 PREINIT:
407 AV *av;
408 STRLEN len;
409 CODE:
410 av = newAV();
9aa6a863 411#ifdef utf8_to_uvchr_buf
aadf4f9e
N
412 av_push(av, newSVuv(utf8_to_uvchr_buf(s,
413 s + UTF8SKIP(s) + adjustment,
414 &len)));
9aa6a863
N
415#else
416 av_push(av, newSVuv(0));
417 len = (STRLEN) -1;
418#endif
721db7b0
N
419 if (len == (STRLEN) -1) {
420 av_push(av, newSViv(-1));
421 }
422 else {
423 av_push(av, newSVuv(len));
424 }
4c28bdc5
KW
425 RETVAL = av;
426 OUTPUT:
427 RETVAL
428
39d7245c
KW
429AV *
430utf8_to_uvchr(s)
431 unsigned char *s
432 PREINIT:
433 AV *av;
434 STRLEN len;
435 CODE:
436 av = newAV();
9aa6a863 437#ifdef utf8_to_uvchr
39d7245c 438 av_push(av, newSVuv(utf8_to_uvchr(s, &len)));
9aa6a863
N
439#else
440 av_push(av, newSVuv(0));
441 len = (STRLEN) -1;
442#endif
721db7b0
N
443 if (len == (STRLEN) -1) {
444 av_push(av, newSViv(-1));
445 }
446 else {
447 av_push(av, newSVuv(len));
448 }
39d7245c
KW
449 RETVAL = av;
450 OUTPUT:
451 RETVAL
452
5f93efa0 453=tests plan => 62
adfe19db
MHM
454
455ok(&Devel::PPPort::sv_setuv(42), 42);
456ok(&Devel::PPPort::newSVuv(123), 123);
457ok(&Devel::PPPort::sv_2uv("4711"), 4711);
458ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
459ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
460ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
461ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
462ok(&Devel::PPPort::XSRETURN_UV(), 42);
96ad942f
MHM
463ok(&Devel::PPPort::PUSHu(), 42);
464ok(&Devel::PPPort::XPUSHu(), 43);
f2e3e4ce
KW
465ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
466ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
e28192fd 467ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
aadf4f9e 468
89d8c7cd
KW
469# skip tests on 5.6.0 and earlier
470if ("$]" le '5.006') {
5f93efa0 471 skip 'skip: broken utf8 support', 0 for 1..49;
89d8c7cd
KW
472 exit;
473}
474
aadf4f9e 475my $ret = &Devel::PPPort::utf8_to_uvchr("A");
4c28bdc5
KW
476ok($ret->[0], ord("A"));
477ok($ret->[1], 1);
aadf4f9e
N
478
479$ret = &Devel::PPPort::utf8_to_uvchr("\0");
4c28bdc5
KW
480ok($ret->[0], 0);
481ok($ret->[1], 1);
aadf4f9e
N
482
483$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
39d7245c
KW
484ok($ret->[0], ord("A"));
485ok($ret->[1], 1);
aadf4f9e
N
486
487$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
39d7245c
KW
488ok($ret->[0], 0);
489ok($ret->[1], 1);
aadf4f9e
N
490
491if (ord("A") != 65) { # tests not valid for EBCDIC
5f93efa0 492 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
aadf4f9e
N
493}
494else {
495 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
496 ok($ret->[0], 0x100);
497 ok($ret->[1], 2);
498
499 my @warnings;
500 local $SIG{__WARN__} = sub { push @warnings, @_; };
501
502 {
503 use warnings 'utf8';
504 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
505 ok($ret->[0], 0);
506 ok($ret->[1], -1);
507
508 no warnings;
509 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
510 ok($ret->[0], 0xFFFD);
511 ok($ret->[1], 1);
512 }
513
514 my @buf_tests = (
515 {
516 input => "A",
517 adjustment => -1,
518 warning => qr/empty/,
519 no_warnings_returned_length => 0,
520 },
521 {
522 input => "\xc4\xc5",
523 adjustment => 0,
524 warning => qr/non-continuation/,
525 no_warnings_returned_length => 1,
526 },
527 {
528 input => "\xc4\x80",
529 adjustment => -1,
530 warning => qr/short|1 byte, need 2/,
531 no_warnings_returned_length => 1,
532 },
533 {
534 input => "\xc0\x81",
535 adjustment => 0,
536 warning => qr/overlong|2 bytes, need 1/,
537 no_warnings_returned_length => 2,
538 },
5f93efa0
KW
539 {
540 input => "\xe0\x80\x81",
541 adjustment => 0,
542 warning => qr/overlong|3 bytes, need 1/,
543 no_warnings_returned_length => 3,
544 },
545 {
546 input => "\xf0\x80\x80\x81",
547 adjustment => 0,
548 warning => qr/overlong|4 bytes, need 1/,
549 no_warnings_returned_length => 4,
550 },
6858ac42 551 { # Old algorithm failed to detect this
aadf4f9e
N
552 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
553 adjustment => 0,
05fc829b 554 warning => qr/overflow/,
aadf4f9e
N
555 no_warnings_returned_length => 13,
556 },
557 );
558
559 # An empty input is an assertion failure on debugging builds. It is
560 # deliberately the first test.
561 require Config; import Config;
562 use vars '%Config';
563 if ($Config{ccflags} =~ /-DDEBUGGING/) {
564 shift @buf_tests;
565 ok(1, 1) for 1..5;
566 }
567
568 for my $test (@buf_tests) {
569 my $input = $test->{'input'};
570 my $adjustment = $test->{'adjustment'};
571 my $display = 'utf8_to_uvchr_buf("';
572 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
573 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
574 }
575
576 $display .= '")';
577 my $warning = $test->{'warning'};
578
579 undef @warnings;
580 use warnings 'utf8';
581 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
582 ok($ret->[0], 0, "returned value $display; warnings enabled");
583 ok($ret->[1], -1, "returned length $display; warnings enabled");
584 my $all_warnings = join "; ", @warnings;
585 my $contains = grep { $_ =~ $warning } $all_warnings;
17c135f5
KW
586 ok($contains, 1, $display
587 . "; Got: '$all_warnings', which should contain '$warning'");
aadf4f9e
N
588
589 undef @warnings;
590 no warnings 'utf8';
591 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
592 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
593 ok($ret->[1], $test->{'no_warnings_returned_length'},
594 "returned length $display; warnings disabled");
595 }
596}