dist/Devel-PPPort/parts/inc/uv

   1 ################################################################################
   2 ##
   3 ##  Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
   4 ##  Version 2.x, Copyright (C) 2001, Paul Marquess.
   5 ##  Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
   6 ##
   7 ##  This program is free software; you can redistribute it and/or
   8 ##  modify it under the same terms as Perl itself.
   9 ##
  10 ################################################################################
  11
  12 =provides
  13
  14 __UNDEFINED__
  15 my_strnlen
  16 SvUOK
  17 utf8_to_uvchr_buf
  18
  19 =implementation
  20
  21 #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
  22
  23 __UNDEFINED__  sv_setuv(sv, uv)                     \
  24                STMT_START {                         \
  25                  UV TeMpUv = uv;                    \
  26                  if (TeMpUv <= IV_MAX)              \
  27                    sv_setiv(sv, TeMpUv);            \
  28                  else                               \
  29                    sv_setnv(sv, (double)TeMpUv);    \
  30                } STMT_END
  31
  32 __UNDEFINED__  newSVuv(uv)     ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
  33
  34 #if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
  35 __UNDEFINED__  sv_2uv(sv)      ({ SV *_sv = (sv); (UV) (SvNOK(_sv) ? SvNV(_sv) : sv_2nv(_sv)); })
  36 #else
  37 __UNDEFINED__  sv_2uv(sv)      ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
  38 #endif
  39
  40 __UNDEFINED__  SvUVX(sv)       ((UV)SvIVX(sv))
  41 __UNDEFINED__  SvUVXx(sv)      SvUVX(sv)
  42 __UNDEFINED__  SvUV(sv)        (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
  43
  44 #if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
  45 __UNDEFINED__  SvUVx(sv)       ({ SV *_sv = (sv)); SvUV(_sv); })
  46 #else
  47 __UNDEFINED__  SvUVx(sv)       ((PL_Sv = (sv)), SvUV(PL_Sv))
  48 #endif
  49
  50 /* Hint: sv_uv
  51  * Always use the SvUVx() macro instead of sv_uv().
  52  */
  53 __UNDEFINED__  sv_uv(sv)       SvUVx(sv)
  54
  55 #if !defined(SvUOK) && defined(SvIOK_UV)
  56 #  define SvUOK(sv) SvIOK_UV(sv)
  57 #endif
  58
  59 __UNDEFINED__  XST_mUV(i,v)    (ST(i) = sv_2mortal(newSVuv(v))  )
  60 __UNDEFINED__  XSRETURN_UV(v)  STMT_START { XST_mUV(0,v);  XSRETURN(1); } STMT_END
  61
  62 __UNDEFINED__  PUSHu(u)        STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG;  } STMT_END
  63 __UNDEFINED__  XPUSHu(u)       STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
  64
  65 #if defined UTF8SKIP
  66
  67 /* Don't use official version because it uses MIN, which may not be available */
  68 #undef UTF8_SAFE_SKIP
  69
  70 __UNDEFINED__  UTF8_SAFE_SKIP(s, e)  (                                          \
  71                                       ((((e) - (s)) <= 0)                       \
  72                                       ? 0                                       \
  73                                       : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
  74 #endif
  75
  76 #if !defined(my_strnlen)
  77 #if { NEED my_strnlen }
  78
  79 STRLEN
  80 my_strnlen(const char *str, Size_t maxlen)
  81 {
  82     const char *p = str;
  83
  84     while(maxlen-- && *p)
  85         p++;
  86
  87     return p - str;
  88 }
  89
  90 #endif
  91 #endif
  92
  93 #if { VERSION < 5.31.3 }
  94         /* Versions prior to this accepted things that are now considered
  95          * malformations, and didn't return -1 on error with warnings enabled
  96          * */
  97 #  undef utf8_to_uvchr_buf
  98 #endif
  99
 100 /* This implementation brings modern, generally more restricted standards to
 101  * utf8_to_uvchr_buf.  Some of these are security related, and clearly must
 102  * be done.  But its arguable that the others need not, and hence should not.
 103  * The reason they're here is that a module that intends to play with the
 104  * latest perls should be able to work the same in all releases.  An example is
 105  * that perl no longer accepts any UV for a code point, but limits them to
 106  * IV_MAX or below.  This is for future internal use of the larger code points.
 107  * If it turns out that some of these changes are breaking code that isn't
 108  * intended to work with modern perls, the tighter restrictions could be
 109  * relaxed.  khw thinks this is unlikely, but has been wrong in the past. */
 110
 111 /* 5.6.0 is the first release with UTF-8, and we don't implement this function
 112  * there due to its likely lack of still being in use, and the underlying
 113  * implementation is very different from later ones, without the later
 114  * safeguards, so would require extra work to deal with */
 115 #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
 116    /* Choose which underlying implementation to use.  At least one must be
 117     * present or the perl is too early to handle this function */
 118 #  if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
 119 #    if defined(utf8n_to_uvchr)   /* This is the preferred implementation */
 120 #      define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
 121 #    else     /* Must be at least 5.6.1 from #if above */
 122 #      define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags))
 123 #    endif
 124 #  endif
 125
 126 #  if { NEED utf8_to_uvchr_buf }
 127
 128 UV
 129 utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 130 {
 131     UV ret;
 132     STRLEN curlen;
 133     bool overflows = 0;
 134     const U8 *cur_s = s;
 135     const bool do_warnings = ckWARN_d(WARN_UTF8);
 136 #    if { VERSION < 5.26.0 } && ! defined(EBCDIC)
 137     STRLEN overflow_length = 0;
 138 #    endif
 139
 140     if (send > s) {
 141         curlen = send - s;
 142     }
 143     else {
 144         assert(0);  /* Modern perls die under this circumstance */
 145         curlen = 0;
 146         if (! do_warnings) {    /* Handle empty here if no warnings needed */
 147             if (retlen) *retlen = 0;
 148             return UNICODE_REPLACEMENT;
 149         }
 150     }
 151
 152 #    if { VERSION < 5.26.0 } && ! defined(EBCDIC)
 153
 154     /* Perl did not properly detect overflow for much of its history on
 155      * non-EBCDIC platforms, often returning an overlong value which may or may
 156      * not have been tolerated in the call.  Also, earlier versions, when they
 157      * did detect overflow, may have disallowed it completely.  Modern ones can
 158      * replace it with the REPLACEMENT CHARACTER, depending on calling
 159      * parameters.  Therefore detect it ourselves in  releases it was
 160      * problematic in. */
 161
 162     if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
 163
 164         /* First, on a 32-bit machine the first byte being at least \xFE
 165          * automatically is overflow, as it indicates something requiring more
 166          * than 31 bits */
 167         if (sizeof(ret) < 8) {
 168             overflows = 1;
 169             overflow_length = 7;
 170         }
 171         else {
 172             const U8 highest[] =    /* 2*63-1 */
 173                         "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
 174             const U8 *cur_h = highest;
 175
 176             for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
 177                 if (UNLIKELY(*cur_s == *cur_h)) {
 178                     continue;
 179                 }
 180
 181                 /* If this byte is larger than the corresponding highest UTF-8
 182                  * byte, the sequence overflows; otherwise the byte is less
 183                  * than (as we handled the equality case above), and so the
 184                  * sequence doesn't overflow */
 185                 overflows = *cur_s > *cur_h;
 186                 break;
 187
 188             }
 189
 190             /* Here, either we set the bool and broke out of the loop, or got
 191              * to the end and all bytes are the same which indicates it doesn't
 192              * overflow.  If it did overflow, it would be this number of bytes
 193              * */
 194             overflow_length = 13;
 195         }
 196     }
 197
 198     if (UNLIKELY(overflows)) {
 199         ret = 0;
 200
 201         if (! do_warnings && retlen) {
 202             *retlen = overflow_length;
 203         }
 204     }
 205     else
 206
 207 #    endif  /* < 5.26 */
 208
 209         /* Here, we are either in a release that properly detects overflow, or
 210          * we have checked for overflow and the next statement is executing as
 211          * part of the above conditional where we know we don't have overflow.
 212          *
 213          * The modern versions allow anything that evaluates to a legal UV, but
 214          * not overlongs nor an empty input */
 215         ret = D_PPP_utf8_to_uvchr_buf_callee(
 216                 s, curlen, retlen,   (UTF8_ALLOW_ANYUV
 217                                   & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
 218
 219 #    if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
 220
 221     /* But actually, more modern versions restrict the UV to being no more than
 222      * what * an IV can hold, so it could, so it could still have gotten it
 223      * wrong about overflowing. */
 224     if (UNLIKELY(ret > IV_MAX)) {
 225         overflows = 1;
 226     }
 227
 228 #    endif
 229
 230     if (UNLIKELY(overflows)) {
 231         if (! do_warnings) {
 232             if (retlen) {
 233                 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
 234                 *retlen = D_PPP_MIN(*retlen, curlen);
 235             }
 236             return UNICODE_REPLACEMENT;
 237         }
 238         else {
 239
 240             /* We use the error message in use from 5.8-5.26 */
 241             Perl_warner(aTHX_ packWARN(WARN_UTF8),
 242                 "Malformed UTF-8 character (overflow at 0x%" UVxf
 243                 ", byte 0x%02x, after start byte 0x%02x)",
 244                 ret, *cur_s, *s);
 245             if (retlen) {
 246                 *retlen = (STRLEN) -1;
 247             }
 248             return 0;
 249         }
 250     }
 251
 252     /* Here, did not overflow, but if it failed for some other reason, and
 253      * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
 254      * try again, allowing anything.  (Note a return of 0 is ok if the input
 255      * was '\0') */
 256     if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
 257
 258         /* If curlen is 0, we already handled the case where warnings are
 259          * disabled, so this 'if' will be true, and so later on, we know that
 260          * 's' is dereferencible */
 261         if (do_warnings) {
 262             *retlen = (STRLEN) -1;
 263         }
 264         else {
 265             ret = D_PPP_utf8_to_uvchr_buf_callee(
 266                                             s, curlen, retlen, UTF8_ALLOW_ANY);
 267             /* Override with the REPLACEMENT character, as that is what the
 268              * modern version of this function returns */
 269             ret = UNICODE_REPLACEMENT;
 270
 271 #    if { VERSION < 5.16.0 }
 272
 273             /* Versions earlier than this don't necessarily return the proper
 274              * length.  It should not extend past the end of string, nor past
 275              * what the first byte indicates the length is, nor past the
 276              * continuation characters */
 277             if (retlen && *retlen >= 0) {
 278                 unsigned int i = 1;
 279
 280                 *retlen = D_PPP_MIN(*retlen, curlen);
 281                 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
 282                 do {
 283                     if (s[i] < 0x80 || s[i] > 0xBF) {
 284                         *retlen = i;
 285                         break;
 286                     }
 287                 } while (++i < *retlen);
 288             }
 289
 290 #    endif
 291
 292         }
 293     }
 294
 295     return ret;
 296 }
 297
 298 #  endif
 299 #endif
 300
 301 #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
 302 #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
 303                         to read past a NUL, making it much less likely to read
 304                         off the end of the buffer.  A NUL indicates the start
 305                         of the next character anyway.  If the input isn't
 306                         NUL-terminated, the function remains unsafe, as it
 307                         always has been. */
 308
 309 __UNDEFINED__  utf8_to_uvchr(s, lp)                                             \
 310     ((*(s) == '\0')                                                             \
 311     ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */        \
 312     : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
 313
 314 #endif
 315
 316 =xsinit
 317
 318 #define NEED_my_strnlen
 319 #define NEED_utf8_to_uvchr_buf
 320
 321 =xsubs
 322
 323 SV *
 324 sv_setuv(uv)
 325         UV uv
 326         CODE:
 327                 RETVAL = newSViv(1);
 328                 sv_setuv(RETVAL, uv);
 329         OUTPUT:
 330                 RETVAL
 331
 332 SV *
 333 newSVuv(uv)
 334         UV uv
 335         CODE:
 336                 RETVAL = newSVuv(uv);
 337         OUTPUT:
 338                 RETVAL
 339
 340 UV
 341 sv_2uv(sv)
 342         SV *sv
 343         CODE:
 344                 RETVAL = sv_2uv(sv);
 345         OUTPUT:
 346                 RETVAL
 347
 348 UV
 349 SvUVx(sv)
 350         SV *sv
 351         CODE:
 352                 sv--;
 353                 RETVAL = SvUVx(++sv);
 354         OUTPUT:
 355                 RETVAL
 356
 357 void
 358 XSRETURN_UV()
 359         PPCODE:
 360                 XSRETURN_UV(42);
 361
 362 void
 363 PUSHu()
 364         PREINIT:
 365                 dTARG;
 366         PPCODE:
 367                 TARG = sv_newmortal();
 368                 EXTEND(SP, 1);
 369                 PUSHu(42);
 370                 XSRETURN(1);
 371
 372 void
 373 XPUSHu()
 374         PREINIT:
 375                 dTARG;
 376         PPCODE:
 377                 TARG = sv_newmortal();
 378                 XPUSHu(43);
 379                 XSRETURN(1);
 380
 381 #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
 382
 383 STRLEN
 384 UTF8_SAFE_SKIP(s, adjustment)
 385         char * s
 386         int adjustment
 387         PREINIT:
 388             const char *const_s;
 389         CODE:
 390             const_s = s;
 391             /* Instead of passing in an 'e' ptr, use the real end, adjusted */
 392             RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
 393         OUTPUT:
 394             RETVAL
 395
 396 #endif
 397
 398 STRLEN
 399 my_strnlen(s, max)
 400         char * s
 401         STRLEN max
 402         CODE:
 403             RETVAL= my_strnlen(s, max);
 404         OUTPUT:
 405             RETVAL
 406
 407 #ifdef utf8_to_uvchr_buf
 408
 409 AV *
 410 utf8_to_uvchr_buf(s, adjustment)
 411         unsigned char *s
 412         int adjustment
 413         PREINIT:
 414             AV *av;
 415             STRLEN len;
 416             const unsigned char *const_s;
 417         CODE:
 418             av = newAV();
 419             const_s = s;
 420             av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
 421                                                   s + UTF8SKIP(s) + adjustment,
 422                                                   &len)));
 423             if (len == (STRLEN) -1) {
 424                 av_push(av, newSViv(-1));
 425             }
 426             else {
 427                 av_push(av, newSVuv(len));
 428             }
 429             RETVAL = av;
 430         OUTPUT:
 431                 RETVAL
 432
 433 #endif
 434
 435 #ifdef utf8_to_uvchr
 436
 437 AV *
 438 utf8_to_uvchr(s)
 439         unsigned char *s
 440         PREINIT:
 441             AV *av;
 442             STRLEN len;
 443             const unsigned char *const_s;
 444         CODE:
 445             av = newAV();
 446             const_s = s;
 447             av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
 448             if (len == (STRLEN) -1) {
 449                 av_push(av, newSViv(-1));
 450             }
 451             else {
 452                 av_push(av, newSVuv(len));
 453             }
 454             RETVAL = av;
 455         OUTPUT:
 456                 RETVAL
 457
 458 #endif
 459
 460 =tests plan => 62
 461
 462 BEGIN { require warnings if "$]" gt '5.006' }
 463
 464 ok(&Devel::PPPort::sv_setuv(42), 42);
 465 ok(&Devel::PPPort::newSVuv(123), 123);
 466 ok(&Devel::PPPort::sv_2uv("4711"), 4711);
 467 ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
 468 ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
 469 ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
 470 ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
 471 ok(&Devel::PPPort::XSRETURN_UV(), 42);
 472 ok(&Devel::PPPort::PUSHu(), 42);
 473 ok(&Devel::PPPort::XPUSHu(), 43);
 474 ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
 475
 476 # skip tests on 5.6.0 and earlier
 477 if ("$]" le '5.006') {
 478     skip 'skip: broken utf8 support', 0 for 1..51;
 479     exit;
 480 }
 481
 482 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
 483 ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
 484
 485 my $ret = &Devel::PPPort::utf8_to_uvchr("A");
 486 ok($ret->[0], ord("A"));
 487 ok($ret->[1], 1);
 488
 489 $ret = &Devel::PPPort::utf8_to_uvchr("\0");
 490 ok($ret->[0], 0);
 491 ok($ret->[1], 1);
 492
 493 $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
 494 ok($ret->[0], ord("A"));
 495 ok($ret->[1], 1);
 496
 497 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
 498 ok($ret->[0], 0);
 499 ok($ret->[1], 1);
 500
 501 if (ord("A") != 65) {   # tests not valid for EBCDIC
 502     ok(1, 1) for 1 .. (2 + 4 + (7 * 5));
 503 }
 504 else {
 505     $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
 506     ok($ret->[0], 0x100);
 507     ok($ret->[1], 2);
 508
 509     my @warnings;
 510     local $SIG{__WARN__} = sub { push @warnings, @_; };
 511
 512     {
 513         BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
 514         $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
 515         ok($ret->[0], 0);
 516         ok($ret->[1], -1);
 517
 518         BEGIN { 'warnings'->unimport() if "$]" gt '5.006' }
 519         $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
 520         ok($ret->[0], 0xFFFD);
 521         ok($ret->[1], 1);
 522     }
 523
 524     my @buf_tests = (
 525         {
 526             input      => "A",
 527             adjustment => -1,
 528             warning    => qr/empty/,
 529             no_warnings_returned_length => 0,
 530         },
 531         {
 532             input      => "\xc4\xc5",
 533             adjustment => 0,
 534             warning    => qr/non-continuation/,
 535             no_warnings_returned_length => 1,
 536         },
 537         {
 538             input      => "\xc4\x80",
 539             adjustment => -1,
 540             warning    => qr/short|1 byte, need 2/,
 541             no_warnings_returned_length => 1,
 542         },
 543         {
 544             input      => "\xc0\x81",
 545             adjustment => 0,
 546             warning    => qr/overlong|2 bytes, need 1/,
 547             no_warnings_returned_length => 2,
 548         },
 549         {
 550             input      => "\xe0\x80\x81",
 551             adjustment => 0,
 552             warning    => qr/overlong|3 bytes, need 1/,
 553             no_warnings_returned_length => 3,
 554         },
 555         {
 556             input      => "\xf0\x80\x80\x81",
 557             adjustment => 0,
 558             warning    => qr/overlong|4 bytes, need 1/,
 559             no_warnings_returned_length => 4,
 560         },
 561         {                 # Old algorithm failed to detect this
 562             input      => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
 563             adjustment => 0,
 564             warning    => qr/overflow/,
 565             no_warnings_returned_length => 13,
 566         },
 567     );
 568
 569     # An empty input is an assertion failure on debugging builds.  It is
 570     # deliberately the first test.
 571     require Config; import Config;
 572     use vars '%Config';
 573     if ($Config{ccflags} =~ /-DDEBUGGING/) {
 574         shift @buf_tests;
 575         ok(1, 1) for 1..5;
 576     }
 577
 578     for my $test (@buf_tests) {
 579         my $input = $test->{'input'};
 580         my $adjustment = $test->{'adjustment'};
 581         my $display = 'utf8_to_uvchr_buf("';
 582         for (my $i = 0; $i < length($input) + $adjustment; $i++) {
 583             $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
 584         }
 585
 586         $display .= '")';
 587         my $warning = $test->{'warning'};
 588
 589         undef @warnings;
 590         BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' }
 591         $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
 592         ok($ret->[0], 0,  "returned value $display; warnings enabled");
 593         ok($ret->[1], -1, "returned length $display; warnings enabled");
 594         my $all_warnings = join "; ", @warnings;
 595         my $contains = grep { $_ =~ $warning } $all_warnings;
 596         ok($contains, 1, $display
 597                     . "; Got: '$all_warnings', which should contain '$warning'");
 598
 599         undef @warnings;
 600         BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' }
 601         $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
 602         ok($ret->[0], 0xFFFD,  "returned value $display; warnings disabled");
 603         ok($ret->[1], $test->{'no_warnings_returned_length'},
 604                       "returned length $display; warnings disabled");
 605     }
 606 }