This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/inc/uv: Use D_PP_ prefix for internal symbols
[perl5.git] / dist / Devel-PPPort / parts / inc / uv
CommitLineData
adfe19db
MHM
1################################################################################
2##
b2049988 3## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
adfe19db
MHM
4## Version 2.x, Copyright (C) 2001, Paul Marquess.
5## Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
6##
7## This program is free software; you can redistribute it and/or
8## modify it under the same terms as Perl itself.
9##
10################################################################################
11
12=provides
13
adfe19db 14__UNDEFINED__
e28192fd 15my_strnlen
679ad62d 16SvUOK
aadf4f9e
N
17utf8_to_uvchr_buf
18
adfe19db
MHM
19=implementation
20
bbe8b705 21#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
aadf4f9e 22
0d0f8426
MHM
23__UNDEFINED__ sv_setuv(sv, uv) \
24 STMT_START { \
25 UV TeMpUv = uv; \
26 if (TeMpUv <= IV_MAX) \
27 sv_setiv(sv, TeMpUv); \
28 else \
29 sv_setnv(sv, (double)TeMpUv); \
30 } STMT_END
31
32__UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
adfe19db
MHM
33
34__UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
35__UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv))
36__UNDEFINED__ SvUVXx(sv) SvUVX(sv)
37__UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
38__UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv))
39
40/* Hint: sv_uv
41 * Always use the SvUVx() macro instead of sv_uv().
42 */
43__UNDEFINED__ sv_uv(sv) SvUVx(sv)
44
679ad62d
MHM
45#if !defined(SvUOK) && defined(SvIOK_UV)
46# define SvUOK(sv) SvIOK_UV(sv)
47#endif
48
adfe19db
MHM
49__UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) )
50__UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END
51
96ad942f
MHM
52__UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END
53__UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
54
f2e3e4ce 55#if defined UTF8SKIP
94a2c4f7
KW
56
57/* Don't use official version because it uses MIN, which may not be available */
58#undef UTF8_SAFE_SKIP
59
aadf4f9e
N
60__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
61 ((((e) - (s)) <= 0) \
f2e3e4ce 62 ? 0 \
bbe8b705 63 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
f2e3e4ce
KW
64#endif
65
e28192fd
KW
66#if !defined(my_strnlen)
67#if { NEED my_strnlen }
68
69STRLEN
70my_strnlen(const char *str, Size_t maxlen)
71{
72 const char *p = str;
73
74 while(maxlen-- && *p)
75 p++;
76
77 return p - str;
78}
79
80#endif
81#endif
4c28bdc5 82
97d83eb2 83#if { VERSION < 5.31.3 }
aadf4f9e
N
84 /* Versions prior to this accepted things that are now considered
85 * malformations, and didn't return -1 on error with warnings enabled
86 * */
87# undef utf8_to_uvchr_buf
88#endif
4c28bdc5 89
aadf4f9e
N
90/* This implementation brings modern, generally more restricted standards to
91 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
92 * be done. But its arguable that the others need not, and hence should not.
93 * The reason they're here is that a module that intends to play with the
6858ac42 94 * latest perls should be able to work the same in all releases. An example is
aadf4f9e
N
95 * that perl no longer accepts any UV for a code point, but limits them to
96 * IV_MAX or below. This is for future internal use of the larger code points.
97 * If it turns out that some of these changes are breaking code that isn't
98 * intended to work with modern perls, the tighter restrictions could be
99 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
100
6858ac42
KW
101/* 5.6.0 is the first release with UTF-8, and we don't implement this function
102 * there due to its likely lack of still being in use, and the underlying
103 * implementation is very different from later ones, without the later
104 * safeguards, so would require extra work to deal with */
e4a76720 105#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
de222e8b
KW
106# if { VERSION < 5.10.0 } /* Was non-const before this */
107# define D_PPP_CU8 U8
108# else
109# define D_PPP_CU8 const U8
110# endif
9aa6a863 111
aadf4f9e
N
112 /* Choose which underlying implementation to use. At least one must be
113 * present or the perl is too early to handle this function */
114# if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
115# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
bbe8b705 116# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
e4a76720 117# else /* Must be at least 5.6.1 from #if above */
bbe8b705 118# define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
aadf4f9e 119# endif
aadf4f9e
N
120# endif
121
aadf4f9e 122# if { NEED utf8_to_uvchr_buf }
4c28bdc5 123
aadf4f9e 124UV
9aa6a863 125utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen)
aadf4f9e
N
126{
127 UV ret;
128 STRLEN curlen;
129 bool overflows = 0;
130 const U8 *cur_s = s;
131 const bool do_warnings = ckWARN_d(WARN_UTF8);
05fc829b 132 STRLEN overflow_length = 0;
aadf4f9e
N
133
134 if (send > s) {
135 curlen = send - s;
136 }
137 else {
138 assert(0); /* Modern perls die under this circumstance */
139 curlen = 0;
140 if (! do_warnings) { /* Handle empty here if no warnings needed */
141 if (retlen) *retlen = 0;
142 return UNICODE_REPLACEMENT;
143 }
144 }
145
8ec545d9 146# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
aadf4f9e 147
6858ac42
KW
148 /* Perl did not properly detect overflow for much of its history on
149 * non-EBCDIC platforms, often returning an overlong value which may or may
150 * not have been tolerated in the call. Also, earlier versions, when they
151 * did detect overflow, may have disallowed it completely. Modern ones can
152 * replace it with the REPLACEMENT CHARACTER, depending on calling
153 * parameters. Therefore detect it ourselves in releases it was
154 * problematic in. */
aadf4f9e 155
05fc829b 156 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
aadf4f9e 157
6858ac42
KW
158 /* First, on a 32-bit machine the first byte being at least \xFE
159 * automatically is overflow, as it indicates something requiring more
160 * than 31 bits */
aadf4f9e
N
161 if (sizeof(ret) < 8) {
162 overflows = 1;
05fc829b 163 overflow_length = 7;
aadf4f9e
N
164 }
165 else {
166 const U8 highest[] = /* 2*63-1 */
167 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
168 const U8 *cur_h = highest;
169
170 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
171 if (UNLIKELY(*cur_s == *cur_h)) {
172 continue;
173 }
174
175 /* If this byte is larger than the corresponding highest UTF-8
6858ac42
KW
176 * byte, the sequence overflows; otherwise the byte is less
177 * than (as we handled the equality case above), and so the
178 * sequence doesn't overflow */
aadf4f9e
N
179 overflows = *cur_s > *cur_h;
180 break;
181
182 }
183
184 /* Here, either we set the bool and broke out of the loop, or got
185 * to the end and all bytes are the same which indicates it doesn't
05fc829b
KW
186 * overflow. If it did overflow, it would be this number of bytes
187 * */
188 overflow_length = 13;
189 }
190 }
191
192 if (UNLIKELY(overflows)) {
193 ret = 0;
194
195 if (! do_warnings && retlen) {
196 *retlen = overflow_length;
aadf4f9e
N
197 }
198 }
05fc829b 199 else
aadf4f9e 200
aadf4f9e
N
201# endif /* < 5.26 */
202
05fc829b
KW
203 /* Here, we are either in a release that properly detects overflow, or
204 * we have checked for overflow and the next statement is executing as
205 * part of the above conditional where we know we don't have overflow.
206 *
207 * The modern versions allow anything that evaluates to a legal UV, but
208 * not overlongs nor an empty input */
bbe8b705 209 ret = D_PPP_utf8_to_uvchr_buf_callee(
05fc829b
KW
210 s, curlen, retlen, (UTF8_ALLOW_ANYUV
211 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
212
213# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
214
215 /* But actually, more modern versions restrict the UV to being no more than
216 * what * an IV can hold, so it could, so it could still have gotten it
217 * wrong about overflowing. */
218 if (UNLIKELY(ret > IV_MAX)) {
219 overflows = 1;
220 }
221
222# endif
223
aadf4f9e
N
224 if (UNLIKELY(overflows)) {
225 if (! do_warnings) {
226 if (retlen) {
bbe8b705
KW
227 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
228 *retlen = D_PPP_MIN(*retlen, curlen);
aadf4f9e
N
229 }
230 return UNICODE_REPLACEMENT;
231 }
232 else {
233
6858ac42 234 /* We use the error message in use from 5.8-5.26 */
de222e8b
KW
235 Perl_warner(aTHX_ packWARN(WARN_UTF8),
236 "Malformed UTF-8 character (overflow at 0x%" UVxf
237 ", byte 0x%02x, after start byte 0x%02x)",
238 ret, *cur_s, *s);
aadf4f9e
N
239 if (retlen) {
240 *retlen = (STRLEN) -1;
241 }
242 return 0;
243 }
244 }
245
6858ac42
KW
246 /* Here, did not overflow, but if it failed for some other reason, and
247 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
248 * try again, allowing anything. (Note a return of 0 is ok if the input
249 * was '\0') */
aadf4f9e
N
250 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
251
252 /* If curlen is 0, we already handled the case where warnings are
6858ac42
KW
253 * disabled, so this 'if' will be true, and so later on, we know that
254 * 's' is dereferencible */
aadf4f9e
N
255 if (do_warnings) {
256 *retlen = (STRLEN) -1;
257 }
258 else {
bbe8b705 259 ret = D_PPP_utf8_to_uvchr_buf_callee(
aadf4f9e
N
260 s, curlen, retlen, UTF8_ALLOW_ANY);
261 /* Override with the REPLACEMENT character, as that is what the
262 * modern version of this function returns */
263 ret = UNICODE_REPLACEMENT;
264
de222e8b 265# if { VERSION < 5.16.0 }
aadf4f9e
N
266
267 /* Versions earlier than this don't necessarily return the proper
268 * length. It should not extend past the end of string, nor past
269 * what the first byte indicates the length is, nor past the
270 * continuation characters */
271 if (retlen && *retlen >= 0) {
f1590d76 272 unsigned int i = 1;
bbe8b705
KW
273
274 *retlen = D_PPP_MIN(*retlen, curlen);
275 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
aadf4f9e
N
276 do {
277 if (s[i] < 0x80 || s[i] > 0xBF) {
278 *retlen = i;
279 break;
280 }
281 } while (++i < *retlen);
282 }
283
de222e8b 284# endif
aadf4f9e
N
285
286 }
287 }
288
289 return ret;
290}
e28192fd 291
aadf4f9e
N
292# endif
293#endif
39d7245c 294
aadf4f9e
N
295#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
296#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
297 to read past a NUL, making it much less likely to read
298 off the end of the buffer. A NUL indicates the start
299 of the next character anyway. If the input isn't
300 NUL-terminated, the function remains unsafe, as it
301 always has been. */
39d7245c
KW
302
303__UNDEFINED__ utf8_to_uvchr(s, lp) \
304 ((*(s) == '\0') \
305 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
306 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
307
aadf4f9e
N
308#endif
309
e28192fd
KW
310=xsinit
311
312#define NEED_my_strnlen
aadf4f9e 313#define NEED_utf8_to_uvchr_buf
e28192fd 314
adfe19db
MHM
315=xsubs
316
317SV *
318sv_setuv(uv)
b2049988
MHM
319 UV uv
320 CODE:
321 RETVAL = newSViv(1);
322 sv_setuv(RETVAL, uv);
323 OUTPUT:
324 RETVAL
adfe19db
MHM
325
326SV *
327newSVuv(uv)
b2049988
MHM
328 UV uv
329 CODE:
330 RETVAL = newSVuv(uv);
331 OUTPUT:
332 RETVAL
adfe19db
MHM
333
334UV
335sv_2uv(sv)
b2049988
MHM
336 SV *sv
337 CODE:
338 RETVAL = sv_2uv(sv);
339 OUTPUT:
340 RETVAL
adfe19db
MHM
341
342UV
343SvUVx(sv)
b2049988
MHM
344 SV *sv
345 CODE:
346 sv--;
347 RETVAL = SvUVx(++sv);
348 OUTPUT:
349 RETVAL
adfe19db
MHM
350
351void
352XSRETURN_UV()
b2049988
MHM
353 PPCODE:
354 XSRETURN_UV(42);
adfe19db 355
96ad942f
MHM
356void
357PUSHu()
b2049988
MHM
358 PREINIT:
359 dTARG;
360 PPCODE:
361 TARG = sv_newmortal();
362 EXTEND(SP, 1);
363 PUSHu(42);
364 XSRETURN(1);
96ad942f
MHM
365
366void
367XPUSHu()
b2049988
MHM
368 PREINIT:
369 dTARG;
370 PPCODE:
371 TARG = sv_newmortal();
372 XPUSHu(43);
373 XSRETURN(1);
96ad942f 374
f2e3e4ce
KW
375STRLEN
376UTF8_SAFE_SKIP(s, adjustment)
377 unsigned char * s
378 int adjustment
379 CODE:
380 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
9aa6a863 381#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
f2e3e4ce 382 RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment);
9aa6a863
N
383#else
384 RETVAL = 0;
385#endif
f2e3e4ce
KW
386 OUTPUT:
387 RETVAL
388
e28192fd
KW
389STRLEN
390my_strnlen(s, max)
391 char * s
392 STRLEN max
393 CODE:
394 RETVAL= my_strnlen(s, max);
395 OUTPUT:
396 RETVAL
397
4c28bdc5 398AV *
aadf4f9e 399utf8_to_uvchr_buf(s, adjustment)
4c28bdc5 400 unsigned char *s
aadf4f9e 401 int adjustment
4c28bdc5
KW
402 PREINIT:
403 AV *av;
404 STRLEN len;
405 CODE:
406 av = newAV();
9aa6a863 407#ifdef utf8_to_uvchr_buf
aadf4f9e
N
408 av_push(av, newSVuv(utf8_to_uvchr_buf(s,
409 s + UTF8SKIP(s) + adjustment,
410 &len)));
9aa6a863
N
411#else
412 av_push(av, newSVuv(0));
413 len = (STRLEN) -1;
414#endif
721db7b0
N
415 if (len == (STRLEN) -1) {
416 av_push(av, newSViv(-1));
417 }
418 else {
419 av_push(av, newSVuv(len));
420 }
4c28bdc5
KW
421 RETVAL = av;
422 OUTPUT:
423 RETVAL
424
39d7245c
KW
425AV *
426utf8_to_uvchr(s)
427 unsigned char *s
428 PREINIT:
429 AV *av;
430 STRLEN len;
431 CODE:
432 av = newAV();
9aa6a863 433#ifdef utf8_to_uvchr
39d7245c 434 av_push(av, newSVuv(utf8_to_uvchr(s, &len)));
9aa6a863
N
435#else
436 av_push(av, newSVuv(0));
437 len = (STRLEN) -1;
438#endif
721db7b0
N
439 if (len == (STRLEN) -1) {
440 av_push(av, newSViv(-1));
441 }
442 else {
443 av_push(av, newSVuv(len));
444 }
39d7245c
KW
445 RETVAL = av;
446 OUTPUT:
447 RETVAL
448
5f93efa0 449=tests plan => 62
adfe19db
MHM
450
451ok(&Devel::PPPort::sv_setuv(42), 42);
452ok(&Devel::PPPort::newSVuv(123), 123);
453ok(&Devel::PPPort::sv_2uv("4711"), 4711);
454ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
455ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
456ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
457ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
458ok(&Devel::PPPort::XSRETURN_UV(), 42);
96ad942f
MHM
459ok(&Devel::PPPort::PUSHu(), 42);
460ok(&Devel::PPPort::XPUSHu(), 43);
f2e3e4ce
KW
461ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
462ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
e28192fd 463ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
aadf4f9e 464
89d8c7cd
KW
465# skip tests on 5.6.0 and earlier
466if ("$]" le '5.006') {
5f93efa0 467 skip 'skip: broken utf8 support', 0 for 1..49;
89d8c7cd
KW
468 exit;
469}
470
aadf4f9e 471my $ret = &Devel::PPPort::utf8_to_uvchr("A");
4c28bdc5
KW
472ok($ret->[0], ord("A"));
473ok($ret->[1], 1);
aadf4f9e
N
474
475$ret = &Devel::PPPort::utf8_to_uvchr("\0");
4c28bdc5
KW
476ok($ret->[0], 0);
477ok($ret->[1], 1);
aadf4f9e
N
478
479$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
39d7245c
KW
480ok($ret->[0], ord("A"));
481ok($ret->[1], 1);
aadf4f9e
N
482
483$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
39d7245c
KW
484ok($ret->[0], 0);
485ok($ret->[1], 1);
aadf4f9e
N
486
487if (ord("A") != 65) { # tests not valid for EBCDIC
5f93efa0 488 ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
aadf4f9e
N
489}
490else {
491 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
492 ok($ret->[0], 0x100);
493 ok($ret->[1], 2);
494
495 my @warnings;
496 local $SIG{__WARN__} = sub { push @warnings, @_; };
497
498 {
499 use warnings 'utf8';
500 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
501 ok($ret->[0], 0);
502 ok($ret->[1], -1);
503
504 no warnings;
505 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
506 ok($ret->[0], 0xFFFD);
507 ok($ret->[1], 1);
508 }
509
510 my @buf_tests = (
511 {
512 input => "A",
513 adjustment => -1,
514 warning => qr/empty/,
515 no_warnings_returned_length => 0,
516 },
517 {
518 input => "\xc4\xc5",
519 adjustment => 0,
520 warning => qr/non-continuation/,
521 no_warnings_returned_length => 1,
522 },
523 {
524 input => "\xc4\x80",
525 adjustment => -1,
526 warning => qr/short|1 byte, need 2/,
527 no_warnings_returned_length => 1,
528 },
529 {
530 input => "\xc0\x81",
531 adjustment => 0,
532 warning => qr/overlong|2 bytes, need 1/,
533 no_warnings_returned_length => 2,
534 },
5f93efa0
KW
535 {
536 input => "\xe0\x80\x81",
537 adjustment => 0,
538 warning => qr/overlong|3 bytes, need 1/,
539 no_warnings_returned_length => 3,
540 },
541 {
542 input => "\xf0\x80\x80\x81",
543 adjustment => 0,
544 warning => qr/overlong|4 bytes, need 1/,
545 no_warnings_returned_length => 4,
546 },
6858ac42 547 { # Old algorithm failed to detect this
aadf4f9e
N
548 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
549 adjustment => 0,
05fc829b 550 warning => qr/overflow/,
aadf4f9e
N
551 no_warnings_returned_length => 13,
552 },
553 );
554
555 # An empty input is an assertion failure on debugging builds. It is
556 # deliberately the first test.
557 require Config; import Config;
558 use vars '%Config';
559 if ($Config{ccflags} =~ /-DDEBUGGING/) {
560 shift @buf_tests;
561 ok(1, 1) for 1..5;
562 }
563
564 for my $test (@buf_tests) {
565 my $input = $test->{'input'};
566 my $adjustment = $test->{'adjustment'};
567 my $display = 'utf8_to_uvchr_buf("';
568 for (my $i = 0; $i < length($input) + $adjustment; $i++) {
569 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
570 }
571
572 $display .= '")';
573 my $warning = $test->{'warning'};
574
575 undef @warnings;
576 use warnings 'utf8';
577 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
578 ok($ret->[0], 0, "returned value $display; warnings enabled");
579 ok($ret->[1], -1, "returned length $display; warnings enabled");
580 my $all_warnings = join "; ", @warnings;
581 my $contains = grep { $_ =~ $warning } $all_warnings;
17c135f5
KW
582 ok($contains, 1, $display
583 . "; Got: '$all_warnings', which should contain '$warning'");
aadf4f9e
N
584
585 undef @warnings;
586 no warnings 'utf8';
587 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
588 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
589 ok($ret->[1], $test->{'no_warnings_returned_length'},
590 "returned length $display; warnings disabled");
591 }
592}