This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Encode: synch with CPAN version 2.99
authorDan Kogai <dankogai@dan.co.jp>
Mon, 21 Jan 2019 14:25:18 +0000 (09:25 -0500)
committerJames E Keenan <jkeenan@cpan.org>
Mon, 21 Jan 2019 14:27:16 +0000 (09:27 -0500)
12 files changed:
MANIFEST
Porting/Maintainers.pl
cpan/Encode/Encode.pm
cpan/Encode/Encode.xs
cpan/Encode/Encode/encode.h
cpan/Encode/Unicode/Unicode.pm
cpan/Encode/Unicode/Unicode.xs
cpan/Encode/encengine.c
cpan/Encode/t/decode.t
cpan/Encode/t/enc_eucjp.t
cpan/Encode/t/utf8messages.t [deleted file]
cpan/Encode/t/utf8warnings.t

index 5d1d5cc..e282452 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -629,7 +629,6 @@ cpan/Encode/t/undef.t
 cpan/Encode/t/unibench.pl              benchmark script
 cpan/Encode/t/Unicode.t                        test script
 cpan/Encode/t/use-Encode-Alias.t
-cpan/Encode/t/utf8messages.t
 cpan/Encode/t/utf8ref.t                        test script
 cpan/Encode/t/utf8strict.t             test script
 cpan/Encode/t/utf8warnings.t
index 429ed4f..4b0551b 100755 (executable)
@@ -386,7 +386,7 @@ use File::Glob qw(:case);
     },
 
     'Encode' => {
-        'DISTRIBUTION' => 'DANKOGAI/Encode-2.97.tar.gz',
+        'DISTRIBUTION' => 'DANKOGAI/Encode-2.99.tar.gz',
         'FILES'        => q[cpan/Encode],
         'CUSTOMIZED'   => [
            # TODO test passes on blead
index f90f929..ec625b9 100644 (file)
@@ -1,5 +1,5 @@
 #
-# $Id: Encode.pm,v 2.97 2018/02/21 12:14:24 dankogai Exp $
+# $Id: Encode.pm,v 2.99 2019/01/21 03:11:41 dankogai Exp $
 #
 package Encode;
 use strict;
@@ -7,13 +7,14 @@ use warnings;
 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
 our $VERSION;
 BEGIN {
-    $VERSION = sprintf "%d.%02d", q$Revision: 2.97 $ =~ /(\d+)/g;
+    $VERSION = sprintf "%d.%02d", q$Revision: 2.99 $ =~ /(\d+)/g;
     require XSLoader;
     XSLoader::load( __PACKAGE__, $VERSION );
 }
 
 use Exporter 5.57 'import';
 
+use Carp ();
 our @CARP_NOT = qw(Encode::Encoder);
 
 # Public, encouraged API is exported by default
@@ -170,134 +171,6 @@ sub clone_encoding($) {
     return Storable::dclone($obj);
 }
 
-sub encode($$;$) {
-    my ( $name, $string, $check ) = @_;
-    return undef unless defined $string;
-    $string .= '';    # stringify;
-    $check ||= 0;
-    unless ( defined $name ) {
-        require Carp;
-        Carp::croak("Encoding name should not be undef");
-    }
-    my $enc = find_encoding($name);
-    unless ( defined $enc ) {
-        require Carp;
-        Carp::croak("Unknown encoding '$name'");
-    }
-    # For Unicode, warnings need to be caught and re-issued at this level
-    # so that callers can disable utf8 warnings lexically.
-    my $octets;
-    if ( ref($enc) eq 'Encode::Unicode' ) {
-        my $warn = '';
-        {
-            local $SIG{__WARN__} = sub { $warn = shift };
-            $octets = $enc->encode( $string, $check );
-        }
-        warnings::warnif('utf8', $warn) if length $warn;
-    }
-    else {
-        $octets = $enc->encode( $string, $check );
-    }
-    $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC );
-    return $octets;
-}
-*str2bytes = \&encode;
-
-sub decode($$;$) {
-    my ( $name, $octets, $check ) = @_;
-    return undef unless defined $octets;
-    $octets .= '';
-    $check ||= 0;
-    my $enc = find_encoding($name);
-    unless ( defined $enc ) {
-        require Carp;
-        Carp::croak("Unknown encoding '$name'");
-    }
-    # For Unicode, warnings need to be caught and re-issued at this level
-    # so that callers can disable utf8 warnings lexically.
-    my $string;
-    if ( ref($enc) eq 'Encode::Unicode' ) {
-        my $warn = '';
-        {
-            local $SIG{__WARN__} = sub { $warn = shift };
-            $string = $enc->decode( $octets, $check );
-        }
-        warnings::warnif('utf8', $warn) if length $warn;
-    }
-    else {
-        $string = $enc->decode( $octets, $check );
-    }
-    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC );
-    return $string;
-}
-*bytes2str = \&decode;
-
-sub from_to($$$;$) {
-    my ( $string, $from, $to, $check ) = @_;
-    return undef unless defined $string;
-    $check ||= 0;
-    my $f = find_encoding($from);
-    unless ( defined $f ) {
-        require Carp;
-        Carp::croak("Unknown encoding '$from'");
-    }
-    my $t = find_encoding($to);
-    unless ( defined $t ) {
-        require Carp;
-        Carp::croak("Unknown encoding '$to'");
-    }
-
-    # For Unicode, warnings need to be caught and re-issued at this level
-    # so that callers can disable utf8 warnings lexically.
-    my $uni;
-    if ( ref($f) eq 'Encode::Unicode' ) {
-        my $warn = '';
-        {
-            local $SIG{__WARN__} = sub { $warn = shift };
-            $uni = $f->decode($string);
-        }
-        warnings::warnif('utf8', $warn) if length $warn;
-    }
-    else {
-        $uni = $f->decode($string);
-    }
-
-    if ( ref($t) eq 'Encode::Unicode' ) {
-        my $warn = '';
-        {
-            local $SIG{__WARN__} = sub { $warn = shift };
-            $_[0] = $string = $t->encode( $uni, $check );
-        }
-        warnings::warnif('utf8', $warn) if length $warn;
-    }
-    else {
-        $_[0] = $string = $t->encode( $uni, $check );
-    }
-
-    return undef if ( $check && length($uni) );
-    return defined( $_[0] ) ? length($string) : undef;
-}
-
-sub encode_utf8($) {
-    my ($str) = @_;
-    return undef unless defined $str;
-    utf8::encode($str);
-    return $str;
-}
-
-my $utf8enc;
-
-sub decode_utf8($;$) {
-    my ( $octets, $check ) = @_;
-    return undef unless defined $octets;
-    $octets .= '';
-    $check   ||= 0;
-    $utf8enc ||= find_encoding('utf8');
-    my $string = $utf8enc->decode( $octets, $check );
-    $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC );
-    return $string;
-}
-
 onBOOT;
 
 if ($ON_EBCDIC) {
@@ -824,6 +697,12 @@ code to do exactly that:
 This is the same as C<FB_QUIET> above, except that instead of being silent
 on errors, it issues a warning.  This is handy for when you are debugging.
 
+B<CAVEAT>: All warnings from Encode module are reported, independently of
+L<pragma warnings|warnings> settings. If you want to follow settings of
+lexical warnings configured by L<pragma warnings|warnings> then append
+also check value C<ENCODE::ONLY_PRAGMA_WARNINGS>. This value is available
+since Encode version 2.99.
+
 =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
 
 =over 2
index 774c2b1..ddc1b1f 100644 (file)
@@ -1,8 +1,9 @@
 /*
- $Id: Encode.xs,v 2.43 2018/02/21 12:14:33 dankogai Exp dankogai $
+ $Id: Encode.xs,v 2.45 2019/01/21 03:13:35 dankogai Exp $
  */
 
 #define PERL_NO_GET_CONTEXT
+#define IN_ENCODE_XS
 #include "EXTERN.h"
 #include "perl.h"
 #include "XSUB.h"
    encode_method().  1 is recommended. 2 restores NI-S original */
 #define ENCODE_XS_USEFP   1
 
-#define UNIMPLEMENTED(x,y) static y x (SV *sv, char *encoding) {       \
-                       Perl_croak_nocontext("panic_unimplemented");    \
-                        PERL_UNUSED_VAR(sv); \
-                        PERL_UNUSED_VAR(encoding); \
-             return (y)0; /* fool picky compilers */ \
-                         }
-/**/
-
-UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
-UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
-
 #ifndef SvIV_nomg
 #define SvIV_nomg SvIV
 #endif
 
-#ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-#  define UTF8_DISALLOW_ILLEGAL_INTERCHANGE 0
-#  define UTF8_ALLOW_NON_STRICT (UTF8_ALLOW_FE_FF|UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
-#else
-#  define UTF8_ALLOW_NON_STRICT 0
+#ifndef SvTRUE_nomg
+#define SvTRUE_nomg SvTRUE
+#endif
+
+#ifndef SVfARG
+#define SVfARG(p) ((void*)(p))
 #endif
 
 static void
@@ -67,16 +58,6 @@ Encode_XSEncoding(pTHX_ encode_t * enc)
 }
 
 static void
-call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
-{
-    /* Exists for breakpointing */
-    PERL_UNUSED_VAR(routine);
-    PERL_UNUSED_VAR(done);
-    PERL_UNUSED_VAR(dest);
-    PERL_UNUSED_VAR(orig);
-}
-
-static void
 utf8_safe_downgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)
 {
     if (!modify) {
@@ -164,7 +145,7 @@ do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb)
 
 static SV *
 encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
-             int check, STRLEN * offset, SV * term, int * retcode, 
+             IV check, STRLEN * offset, SV * term, int * retcode, 
              SV *fallback_cb)
 {
     STRLEN tlen  = slen;
@@ -258,7 +239,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
                    (UV)ch, enc->name[0]);
             return &PL_sv_undef; /* never reaches but be safe */
         }
-        if (check & ENCODE_WARN_ON_ERR){
+        if (encode_ckWARN(check, WARN_UTF8)) {
             Perl_warner(aTHX_ packWARN(WARN_UTF8),
                 ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]);
         }
@@ -297,7 +278,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 *
                               enc->name[0], (UV)s[slen]);
             return &PL_sv_undef; /* never reaches but be safe */
         }
-        if (check & ENCODE_WARN_ON_ERR){
+        if (encode_ckWARN(check, WARN_UTF8)) {
             Perl_warner(
             aTHX_ packWARN(WARN_UTF8),
             ERR_DECODE_NOMAP,
@@ -386,70 +367,6 @@ strict_utf8(pTHX_ SV* sv)
     return SvTRUE(*svp);
 }
 
-/* Modern perls have the capability to do this more efficiently and portably */
-#ifdef utf8n_to_uvchr_msgs
-# define CAN_USE_BASE_PERL
-#endif
-
-#ifndef CAN_USE_BASE_PERL
-
-/*
- * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126
- */
-#ifndef UNICODE_IS_NONCHAR
-#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE)
-#endif
-
-#ifndef UNICODE_IS_SUPER
-#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX)
-#endif
-
-#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c))
-
-#ifndef UTF_ACCUMULATION_OVERFLOW_MASK
-#ifndef CHARBITS
-#define CHARBITS CHAR_BIT
-#endif
-#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT))
-#endif
-
-/*
- * Convert non strict utf8 sequence of len >= 2 to unicode codepoint
- */
-static UV
-convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
-{
-    UV uv;
-    U8 *ptr = s;
-    bool overflowed = 0;
-
-    uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));
-
-    len--;
-    s++;
-
-    while (len--) {
-        if (!UTF8_IS_CONTINUATION(*s)) {
-            *rlen = s-ptr;
-            return 0;
-        }
-        if (uv & UTF_ACCUMULATION_OVERFLOW_MASK)
-            overflowed = 1;
-        uv = UTF8_ACCUMULATE(uv, *s);
-        s++;
-    }
-
-    *rlen = s-ptr;
-
-    if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) {
-        return 0;
-    }
-
-    return uv;
-}
-
-#endif  /* CAN_USE_BASE_PERL */
-
 static U8*
 process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
              bool encode, bool strict, bool stop_at_partial)
@@ -472,16 +389,20 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
     UV uv;
     STRLEN ulen;
     SV *fallback_cb;
-    int check;
+    IV check;
     U8 *d;
     STRLEN dlen;
     char esc[UTF8_MAXLEN * 6 + 1];
     STRLEN i;
     const U32 flags = (strict)
                     ? UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-                    : UTF8_ALLOW_NON_STRICT;
+                    : 0;
 
-    if (SvROK(check_sv)) {
+    if (!SvOK(check_sv)) {
+       fallback_cb = &PL_sv_undef;
+       check = 0;
+    }
+    else if (SvROK(check_sv)) {
        /* croak("UTF-8 decoder doesn't support callback CHECK"); */
        fallback_cb = check_sv;
        check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as perlqq */
@@ -501,9 +422,6 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
 
     while (s < e) {
 
-#ifdef CAN_USE_BASE_PERL    /* Use the much faster, portable implementation if
-                               available */
-
         /* If there were no errors, this will be 'e'; otherwise it will point
          * to the first byte of the erroneous input */
         const U8* e_or_where_failed;
@@ -531,63 +449,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
          * point, or the best substitution for it */
         uv = utf8n_to_uvchr(s, e - s, &ulen, UTF8_ALLOW_ANY);
 
-#else   /* Use code for earlier perls */
-
-        ((void)sizeof(flags));  /* Avoid compiler warning */
-
-        if (UTF8_IS_INVARIANT(*s)) {
-            *d++ = *s++;
-            continue;
-        }
-
-        uv = 0;
-        ulen = 1;
-        if (! UTF8_IS_CONTINUATION(*s)) {
-            /* Not an invariant nor a continuation; must be a start byte.  (We
-             * can't test for UTF8_IS_START as that excludes things like \xC0
-             * which are start bytes, but always lead to overlongs */
-
-            U8 skip = UTF8SKIP(s);
-            if ((s + skip) > e) {
-                /* just calculate ulen, in pathological cases can be smaller then e-s */
-                if (e-s >= 2)
-                    convert_utf8_multi_seq(s, e-s, &ulen);
-                else
-                    ulen = 1;
-
-                if (stop_at_partial && ulen == (STRLEN)(e-s))
-                    break;
-
-                goto malformed_byte;
-            }
-
-            uv = convert_utf8_multi_seq(s, skip, &ulen);
-            if (uv == 0)
-                goto malformed_byte;
-            else if (strict && !UNICODE_IS_STRICT(uv))
-                goto malformed;
-
-
-             /* Whole char is good */
-             memcpy(d, s, skip);
-             d += skip;
-             s += skip;
-             continue;
-        }
-
-        /* If we get here there is something wrong with alleged UTF-8 */
-        /* uv is used only when encoding */
-    malformed_byte:
-        if (uv == 0)
-            uv = (UV)*s;
-        if (encode || ulen == 0)
-            ulen = 1;
-
-    malformed:
-
-#endif  /* The two versions for processing come back together here, for the
-         * error handling code.
-         *
+        /*
          * Here, we are looping through the input and found an error.
          * 'uv' is the code point in error if calculable, or the REPLACEMENT
          *      CHARACTER if not.
@@ -602,7 +464,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
             else
                 Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
         }
-        if (check & ENCODE_WARN_ON_ERR){
+        if (encode_ckWARN(check, WARN_UTF8)) {
             if (encode)
                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
                             ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
@@ -667,6 +529,88 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
     return s;
 }
 
+static SV *
+find_encoding(pTHX_ SV *enc)
+{
+    dSP;
+    I32 count;
+    SV *m_enc;
+    SV *obj = &PL_sv_undef;
+#ifndef SV_NOSTEAL
+    U32 tmp;
+#endif
+
+    ENTER;
+    SAVETMPS;
+    PUSHMARK(sp);
+
+    m_enc = sv_newmortal();
+#ifndef SV_NOSTEAL
+    tmp = SvFLAGS(enc) & SVs_TEMP;
+    SvTEMP_off(enc);
+    sv_setsv_flags(m_enc, enc, 0);
+    SvFLAGS(enc) |= tmp;
+#else
+#if SV_NOSTEAL == 0
+    #error You have broken SV_NOSTEAL which cause memory corruption in sv_setsv_flags()
+    #error Most probably broken SV_NOSTEAL was defined by buggy version of ppport.h
+#else
+    sv_setsv_flags(m_enc, enc, SV_NOSTEAL);
+#endif
+#endif
+    XPUSHs(m_enc);
+
+    PUTBACK;
+
+    count = call_pv("Encode::find_encoding", G_SCALAR);
+
+    SPAGAIN;
+
+    if (count > 0) {
+        obj = POPs;
+        SvREFCNT_inc(obj);
+    }
+
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+    return sv_2mortal(obj);
+}
+
+static SV *
+call_encoding(pTHX_ const char *method, SV *obj, SV *src, SV *check)
+{
+    dSP;
+    I32 count;
+    SV *dst = &PL_sv_undef;
+
+    PUSHMARK(sp);
+
+    if (check)
+        check = sv_2mortal(newSVsv(check));
+
+    if (!check || SvROK(check) || !SvTRUE_nomg(check) || (SvIV_nomg(check) & ENCODE_LEAVE_SRC))
+        src = sv_2mortal(newSVsv(src));
+
+    XPUSHs(obj);
+    XPUSHs(src);
+    XPUSHs(check ? check : &PL_sv_no);
+
+    PUTBACK;
+
+    count = call_method(method, G_SCALAR);
+
+    SPAGAIN;
+
+    if (count > 0) {
+        dst = POPs;
+        SvREFCNT_inc(dst);
+    }
+
+    PUTBACK;
+    return dst;
+}
+
 
 MODULE = Encode                PACKAGE = Encode::utf8  PREFIX = Method_
 
@@ -683,7 +627,7 @@ PREINIT:
     U8 *e;
     SV *dst;
     bool renewed = 0;
-    int check;
+    IV check;
     bool modify;
     dSP;
 INIT:
@@ -744,7 +688,7 @@ PREINIT:
     U8 *s;
     U8 *e;
     SV *dst;
-    int check;
+    IV check;
     bool modify;
 INIT:
     SvGETMAGIC(src);
@@ -848,7 +792,7 @@ SV *        off
 SV *   term
 SV *    check_sv
 PREINIT:
-    int check;
+    IV check;
     SV *fallback_cb;
     bool modify;
     encode_t *enc;
@@ -886,7 +830,7 @@ SV *        obj
 SV *   src
 SV *   check_sv
 PREINIT:
-    int check;
+    IV check;
     SV *fallback_cb;
     bool modify;
     encode_t *enc;
@@ -917,7 +861,7 @@ SV *        obj
 SV *   src
 SV *   check_sv
 PREINIT:
-    int check;
+    IV check;
     SV *fallback_cb;
     bool modify;
     encode_t *enc;
@@ -988,102 +932,6 @@ MODULE = Encode         PACKAGE = Encode
 
 PROTOTYPES: ENABLE
 
-I32
-_bytes_to_utf8(sv, ...)
-SV *    sv
-PREINIT:
-    SV * encoding;
-INIT:
-    encoding = items == 2 ? ST(1) : Nullsv;
-CODE:
-    if (encoding)
-    RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
-    else {
-    STRLEN len;
-    U8*    s = (U8*)SvPV(sv, len);
-    U8*    converted;
-
-    converted = bytes_to_utf8(s, &len); /* This allocs */
-    sv_setpvn(sv, (char *)converted, len);
-    SvUTF8_on(sv); /* XXX Should we? */
-    Safefree(converted);                /* ... so free it */
-    RETVAL = len;
-    }
-OUTPUT:
-    RETVAL
-
-I32
-_utf8_to_bytes(sv, ...)
-SV *    sv
-PREINIT:
-    SV * to;
-    SV * check;
-INIT:
-    to    = items > 1 ? ST(1) : Nullsv;
-    check = items > 2 ? ST(2) : Nullsv;
-CODE:
-    if (to) {
-    RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
-    } else {
-    STRLEN len;
-    U8 *s = (U8*)SvPV(sv, len);
-
-    RETVAL = 0;
-    if (SvTRUE(check)) {
-        /* Must do things the slow way */
-        U8 *dest;
-            /* We need a copy to pass to check() */
-        U8 *src  = s;
-        U8 *send = s + len;
-        U8 *d0;
-
-        New(83, dest, len, U8); /* I think */
-        d0 = dest;
-
-        while (s < send) {
-                if (*s < 0x80){
-            *dest++ = *s++;
-                } else {
-            STRLEN ulen;
-            UV uv = *s++;
-
-            /* Have to do it all ourselves because of error routine,
-               aargh. */
-            if (!(uv & 0x40)){ goto failure; }
-            if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
-            else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
-            else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
-            else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
-            else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
-            else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
-            else                   { ulen = 13; uv = 0; }
-        
-            /* Note change to utf8.c variable naming, for variety */
-            while (ulen--) {
-            if ((*s & 0xc0) != 0x80){
-                goto failure;
-            } else {
-                uv = (uv << 6) | (*s++ & 0x3f);
-            }
-          }
-          if (uv > 256) {
-          failure:
-              call_failure(check, s, dest, src);
-              /* Now what happens? */
-          }
-          *dest++ = (U8)uv;
-        }
-        }
-        RETVAL = dest - d0;
-        sv_usepvn(sv, (char *)dest, RETVAL);
-        SvUTF8_off(sv);
-    } else {
-        RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
-    }
-    }
-OUTPUT:
-    RETVAL
-
 bool
 is_utf8(sv, check = 0)
 SV *   sv
@@ -1132,6 +980,117 @@ CODE:
 OUTPUT:
     RETVAL
 
+SV *
+decode(encoding, octets, check = NULL)
+SV *   encoding
+SV *   octets
+SV *   check
+ALIAS:
+    bytes2str = 0
+PREINIT:
+    SV *obj;
+INIT:
+    SvGETMAGIC(encoding);
+CODE:
+    if (!SvOK(encoding))
+        croak("Encoding name should not be undef");
+    obj = find_encoding(aTHX_ encoding);
+    if (!SvOK(obj))
+        croak("Unknown encoding '%" SVf "'", SVfARG(encoding));
+    RETVAL = call_encoding(aTHX_ "decode", obj, octets, check);
+OUTPUT:
+    RETVAL
+
+SV *
+encode(encoding, string, check = NULL)
+SV *   encoding
+SV *   string
+SV *   check
+ALIAS:
+    str2bytes = 0
+PREINIT:
+    SV *obj;
+INIT:
+    SvGETMAGIC(encoding);
+CODE:
+    if (!SvOK(encoding))
+        croak("Encoding name should not be undef");
+    obj = find_encoding(aTHX_ encoding);
+    if (!SvOK(obj))
+        croak("Unknown encoding '%" SVf "'", SVfARG(encoding));
+    RETVAL = call_encoding(aTHX_ "encode", obj, string, check);
+OUTPUT:
+    RETVAL
+
+SV *
+decode_utf8(octets, check = NULL)
+SV *   octets
+SV *   check
+PREINIT:
+    HV *hv;
+    SV **sv;
+CODE:
+    hv = get_hv("Encode::Encoding", 0);
+    if (!hv)
+        croak("utf8 encoding was not found");
+    sv = hv_fetch(hv, "utf8", 4, 0);
+    if (!sv || !*sv || !SvOK(*sv))
+        croak("utf8 encoding was not found");
+    RETVAL = call_encoding(aTHX_ "decode", *sv, octets, check);
+OUTPUT:
+    RETVAL
+
+SV *
+encode_utf8(string)
+SV *   string
+CODE:
+    RETVAL = newSVsv(string);
+    if (SvOK(RETVAL))
+        sv_utf8_encode(RETVAL);
+OUTPUT:
+    RETVAL
+
+SV *
+from_to(octets, from, to, check = NULL)
+SV *   octets
+SV *   from
+SV *   to
+SV *   check
+PREINIT:
+    SV *from_obj;
+    SV *to_obj;
+    SV *string;
+    SV *new_octets;
+    U8 *ptr;
+    STRLEN len;
+INIT:
+    SvGETMAGIC(from);
+    SvGETMAGIC(to);
+CODE:
+    if (!SvOK(from) || !SvOK(to))
+        croak("Encoding name should not be undef");
+    from_obj = find_encoding(aTHX_ from);
+    if (!SvOK(from_obj))
+        croak("Unknown encoding '%" SVf "'", SVfARG(from));
+    to_obj = find_encoding(aTHX_ to);
+    if (!SvOK(to_obj))
+        croak("Unknown encoding '%" SVf "'", SVfARG(to));
+    string = sv_2mortal(call_encoding(aTHX_ "decode", from_obj, octets, NULL));
+    new_octets = sv_2mortal(call_encoding(aTHX_ "encode", to_obj, string, check));
+    SvGETMAGIC(new_octets);
+    if (SvOK(new_octets) && (!check || SvROK(check) || !SvTRUE_nomg(check) || sv_len(string) == 0)) {
+        ptr = (U8 *)SvPV_nomg(new_octets, len);
+        if (SvUTF8(new_octets))
+            len = utf8_length(ptr, ptr+len);
+        RETVAL = newSVuv(len);
+    } else {
+        RETVAL = &PL_sv_undef;
+    }
+    sv_setsv_nomg(octets, new_octets);
+    SvSETMAGIC(octets);
+OUTPUT:
+    RETVAL
+
 void
 onBOOT()
 CODE:
@@ -1141,11 +1100,12 @@ CODE:
 
 BOOT:
 {
-    HV *stash = gv_stashpvn("Encode", strlen("Encode"), GV_ADD);
+    HV *stash = gv_stashpvn("Encode", (U32)strlen("Encode"), GV_ADD);
     newCONSTSUB(stash, "DIE_ON_ERR", newSViv(ENCODE_DIE_ON_ERR));
     newCONSTSUB(stash, "WARN_ON_ERR", newSViv(ENCODE_WARN_ON_ERR));
     newCONSTSUB(stash, "RETURN_ON_ERR", newSViv(ENCODE_RETURN_ON_ERR));
     newCONSTSUB(stash, "LEAVE_SRC", newSViv(ENCODE_LEAVE_SRC));
+    newCONSTSUB(stash, "ONLY_PRAGMA_WARNINGS", newSViv(ENCODE_ONLY_PRAGMA_WARNINGS));
     newCONSTSUB(stash, "PERLQQ", newSViv(ENCODE_PERLQQ));
     newCONSTSUB(stash, "HTMLCREF", newSViv(ENCODE_HTMLCREF));
     newCONSTSUB(stash, "XMLCREF", newSViv(ENCODE_XMLCREF));
index df5554f..8de56eb 100644 (file)
@@ -99,6 +99,7 @@ extern void Encode_DefineEncoding(encode_t *enc);
 #define  ENCODE_WARN_ON_ERR    0x0002 /* warn on error; may proceed */
 #define  ENCODE_RETURN_ON_ERR  0x0004 /* immediately returns on NOREP */
 #define  ENCODE_LEAVE_SRC      0x0008 /* $src updated unless set */
+#define  ENCODE_ONLY_PRAGMA_WARNINGS 0x0010 /* when enabled report only warnings configured by pragma warnings, otherwise report all warnings; no effect without ENCODE_WARN_ON_ERR */
 #define  ENCODE_PERLQQ         0x0100 /* perlqq fallback string */
 #define  ENCODE_HTMLCREF       0x0200 /* HTML character ref. fb mode */
 #define  ENCODE_XMLCREF        0x0400 /* XML  character ref. fb mode */
@@ -112,4 +113,1233 @@ extern void Encode_DefineEncoding(encode_t *enc);
 #define  ENCODE_FB_HTMLCREF    (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
 #define  ENCODE_FB_XMLCREF     (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
 
+#define encode_ckWARN(c, w) ((c & ENCODE_WARN_ON_ERR)                         \
+                        && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || ckWARN(w)))
+
+#ifdef UTF8SKIP
+#  ifdef EBCDIC   /* The value on early perls is wrong */
+#    undef UTF8_MAXBYTES 
+#    define UTF8_MAXBYTES 14
+#  endif
+#  ifndef UNLIKELY
+#    define UNLIKELY(x) (x)
+#  endif
+#  ifndef LIKELY
+#    define LIKELY(x) (x)
+#  endif
+
+/* EBCDIC requires a later perl to work, so the next two definitions are for
+ * ASCII machines only */
+#  ifndef NATIVE_UTF8_TO_I8
+#    define NATIVE_UTF8_TO_I8(x) (x)
+#  endif
+#  ifndef I8_TO_NATIVE_UTF8
+#    define I8_TO_NATIVE_UTF8(x)  (x)
+#  endif
+#  ifndef OFFUNISKIP
+#    define OFFUNISKIP(x)  UNISKIP(x)
+#  endif
+#  ifndef uvoffuni_to_utf8_flags
+#    define uvoffuni_to_utf8_flags(a,b,c) uvuni_to_utf8_flags(a,b,c)
+#  endif
+#  ifndef WARN_SURROGATE    /* Use the overarching category if these
+                               subcategories are missing */
+#    define WARN_SURROGATE WARN_UTF8
+#    define WARN_NONCHAR WARN_UTF8
+#    define WARN_NON_UNICODE WARN_UTF8
+     /* If there's only one possible category, then packing is a no-op */
+#    define encode_ckWARN_packed(c, w) encode_ckWARN(c, w)
+#  else
+#    define encode_ckWARN_packed(c, w)                                      \
+            ((c & ENCODE_WARN_ON_ERR)                                       \
+        && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || Perl_ckwarn(aTHX_ w)))
+#  endif
+
+/* All these formats take a single UV code point argument */
+static const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
+static const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
+                                   " is not recommended for open interchange";
+static const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
+                                   " may not be portable";
+
+/* If the perl doesn't have the 5.28 functions, this file includes
+ * stripped-down versions of them but containing enough functionality to be
+ * suitable for Encode's needs.  Many of the comments have been removed.  But
+ * you can inspect the 5.28 source if you get stuck.
+ *
+ * These could be put in Devel::PPPort, but Encode is likely the only user */
+
+#if    (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))                     \
+  && (! defined(utf8n_to_uvchr_msgs) && ! defined(uvchr_to_utf8_flags_msgs))
+
+#  ifndef hv_stores
+#    define hv_stores(hv, key, val) hv_store((hv), ("" key ""), (sizeof(key)-1), (val), 0)
+#  endif
+
+static HV *
+S_new_msg_hv(const char * const message, /* The message text */
+                   U32 categories)  /* Packed warning categories */
+{
+    /* Creates, populates, and returns an HV* that describes an error message
+     * for the translators between UTF8 and code point */
+
+    dTHX;
+    SV* msg_sv = newSVpv(message, 0);
+    SV* category_sv = newSVuv(categories);
+
+    HV* msg_hv = newHV();
+
+    (void) hv_stores(msg_hv, "text", msg_sv);
+    (void) hv_stores(msg_hv, "warn_categories",  category_sv);
+
+    return msg_hv;
+}
+
+#endif
+
+#if ! defined(utf8n_to_uvchr_msgs)                      \
+  && (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))
+
+#  undef utf8n_to_uvchr     /* Don't use an earlier version: use the version
+                               defined in this file */
+#  define utf8n_to_uvchr(a,b,c,d) utf8n_to_uvchr_msgs(a, b, c, d, 0, NULL)
+
+#  undef UTF8_IS_START      /* Early perls wrongly accepted C0 and C1 */
+#  define UTF8_IS_START(c)  (((U8)(c)) >= 0xc2)
+#  ifndef isUTF8_POSSIBLY_PROBLEMATIC
+#    ifdef EBCDIC
+#      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c > ' ')
+#    else
+#      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+#    endif
+#  endif
+#  ifndef UTF8_ALLOW_OVERFLOW
+#    define UTF8_ALLOW_OVERFLOW (1U<<31)    /* Choose highest bit to avoid
+                                               potential conflicts */
+#    define UTF8_GOT_OVERFLOW           UTF8_ALLOW_OVERFLOW
+#  endif
+#  undef UTF8_ALLOW_ANY     /* Early perl definitions don't work properly with
+                               the code in this file */
+#  define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                              \
+                          |UTF8_ALLOW_NON_CONTINUATION                          \
+                          |UTF8_ALLOW_SHORT                                     \
+                          |UTF8_ALLOW_LONG                                      \
+                          |UTF8_ALLOW_OVERFLOW)
+
+/* The meanings of these were complemented at some point, but the functions
+ * bundled in this file use the complemented meanings */
+#  ifndef UTF8_DISALLOW_SURROGATE
+#    define UTF8_DISALLOW_SURROGATE     UTF8_ALLOW_SURROGATE
+#    define UTF8_DISALLOW_NONCHAR       UTF8_ALLOW_FFFF
+#    define UTF8_DISALLOW_SUPER         UTF8_ALLOW_FE_FF
+
+     /* In the stripped-down implementation in this file, disallowing is not
+      * independent of warning */
+#    define UTF8_WARN_SURROGATE     UTF8_DISALLOW_SURROGATE
+#    define UTF8_WARN_NONCHAR       UTF8_DISALLOW_NONCHAR
+#    define UTF8_WARN_SUPER         UTF8_DISALLOW_SUPER
+#  endif
+#  ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+#    define UTF8_DISALLOW_ILLEGAL_INTERCHANGE                                   \
+     (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_NONCHAR)
+#  endif
+#  ifndef UTF8_WARN_ILLEGAL_INTERCHANGE
+#    define UTF8_WARN_ILLEGAL_INTERCHANGE                                       \
+         (UTF8_WARN_SUPER|UTF8_WARN_SURROGATE|UTF8_WARN_NONCHAR)
+#  endif
+#  ifndef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
+#    ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
+#      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
+#      define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+
+#      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)     ((s0) == 0xF1            \
+                                              && ((s1) & 0xFE ) == 0xB6)
+#    else
+#      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
+#      define IS_UTF8_2_BYTE_SUPER(s0, s1)       ((s0) == 0xF4 && (s1) >= 0x90)
+#      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)   ((s0) == 0xED && (s1) >= 0xA0)
+#    endif
+#    if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
+#      ifdef EBCDIC     /* Actually is I8 */
+#       define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#      else
+#       define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#      endif
+#    endif
+#  endif
+
+#  ifndef Newx
+#    define Newx(v,n,t) New(0,v,n,t)
+#  endif
+
+#  ifndef PERL_UNUSED_ARG
+#    define PERL_UNUSED_ARG(x) ((void)x)
+#  endif
+
+static const char malformed_text[] = "Malformed UTF-8 character";
+
+static char *
+_byte_dump_string(const U8 * const start, const STRLEN len)
+{
+    /* Returns a mortalized C string that is a displayable copy of the 'len' */
+
+    const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
+                                               trailing NUL */
+    const U8 * s = start;
+    const U8 * const e = start + len;
+    char * output;
+    char * d;
+    dTHX;
+
+    Newx(output, output_len, char);
+    SAVEFREEPV(output);
+
+    d = output;
+    for (s = start; s < e; s++) {
+        const unsigned high_nibble = (*s & 0xF0) >> 4;
+        const unsigned low_nibble =  (*s & 0x0F);
+
+        *d++ = '\\';
+        *d++ = 'x';
+
+        if (high_nibble < 10) {
+            *d++ = high_nibble + '0';
+        }
+        else {
+            *d++ = high_nibble - 10 + 'a';
+        }
+
+        if (low_nibble < 10) {
+            *d++ = low_nibble + '0';
+        }
+        else {
+            *d++ = low_nibble - 10 + 'a';
+        }
+    }
+
+    *d = '\0';
+    return output;
+}
+
+static char *
+S_unexpected_non_continuation_text(const U8 * const s,
+
+                                         /* Max number of bytes to print */
+                                         STRLEN print_len,
+
+                                         /* Which one is the non-continuation */
+                                         const STRLEN non_cont_byte_pos,
+
+                                         /* How many bytes should there be? */
+                                         const STRLEN expect_len)
+{
+    /* Return the malformation warning text for an unexpected continuation
+     * byte. */
+
+    dTHX;
+    const char * const where = (non_cont_byte_pos == 1)
+                               ? "immediately"
+                               : Perl_form(aTHX_ "%d bytes",
+                                                 (int) non_cont_byte_pos);
+    const U8 * x = s + non_cont_byte_pos;
+    const U8 * e = s + print_len;
+
+    /* We don't need to pass this parameter, but since it has already been
+     * calculated, it's likely faster to pass it; verify under DEBUGGING */
+    assert(expect_len == UTF8SKIP(s));
+
+    /* As a defensive coding measure, don't output anything past a NUL.  Such
+     * bytes shouldn't be in the middle of a malformation, and could mark the
+     * end of the allocated string, and what comes after is undefined */
+    for (; x < e; x++) {
+        if (*x == '\0') {
+            x++;            /* Output this particular NUL */
+            break;
+        }
+    }
+
+    return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+                           " %s after start byte 0x%02x; need %d bytes, got %d)",
+                           malformed_text,
+                           _byte_dump_string(s, x - s),
+                           *(s + non_cont_byte_pos),
+                           where,
+                           *s,
+                           (int) expect_len,
+                           (int) non_cont_byte_pos);
+}
+
+static int
+S_does_utf8_overflow(const U8 * const s,
+                       const U8 * e,
+                       const bool consider_overlongs)
+{
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 'e' - 1 would overflow an IV on this platform. */
+
+#  if ! defined(UV_IS_QUAD)
+
+    const STRLEN len = e - s;
+    int is_overlong;
+
+    assert(s <= e && s + UTF8SKIP(s) >= e);
+    assert(! UTF8_IS_INVARIANT(*s) && e > s);
+
+#    ifdef EBCDIC
+
+    PERL_UNUSED_ARG(consider_overlongs);
+
+    if (*s != 0xFE) {
+        return 0;
+    }
+
+    if (len == 1) {
+        return -1;
+    }
+
+#    else
+
+    if (LIKELY(*s < 0xFE)) {
+        return 0;
+    }
+
+    if (! consider_overlongs) {
+        return 1;
+    }
+
+    if (len == 1) {
+        return -1;
+    }
+
+    is_overlong = S_is_utf8_overlong_given_start_byte_ok(s, len);
+
+    if (is_overlong == 0) {
+        return 1;
+    }
+
+    if (is_overlong < 0) {
+        return -1;
+    }
+
+    if (*s == 0xFE) {
+        return 0;
+    }
+
+#    endif
+
+    /* Here, ASCII and EBCDIC rejoin:
+    *  On ASCII:   We have an overlong sequence starting with FF
+    *  On EBCDIC:  We have a sequence starting with FE. */
+
+    {   /* For C89, use a block so the declaration can be close to its use */
+
+#    ifdef EBCDIC
+        const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
+#    else
+        const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
+#    endif
+        const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
+        const STRLEN cmp_len = MIN(conts_len, len - 1);
+
+        if (cmp_len >= conts_len || memNE(s + 1,
+                                          conts_for_highest_30_bit,
+                                          cmp_len))
+        {
+            return memGT(s + 1, conts_for_highest_30_bit, cmp_len);
+        }
+
+        return -1;
+    }
+
+#  else /* Below is 64-bit word */
+
+    PERL_UNUSED_ARG(consider_overlongs);
+
+    {
+        const STRLEN len = e - s;
+        const U8 *x;
+        const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+        for (x = s; x < e; x++, y++) {
+
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+                continue;
+            }
+            return NATIVE_UTF8_TO_I8(*x) > *y;
+        }
+
+        if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+#  endif
+
+}
+
+static int
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len);
+
+static int
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+    const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+    const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+    assert(len > 1 && UTF8_IS_START(*s));
+
+#         ifdef EBCDIC
+#             define F0_ABOVE_OVERLONG 0xB0
+#             define F8_ABOVE_OVERLONG 0xA8
+#             define FC_ABOVE_OVERLONG 0xA4
+#             define FE_ABOVE_OVERLONG 0xA2
+#             define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+#         else
+
+    if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+        return 1;
+    }
+
+#             define F0_ABOVE_OVERLONG 0x90
+#             define F8_ABOVE_OVERLONG 0x88
+#             define FC_ABOVE_OVERLONG 0x84
+#             define FE_ABOVE_OVERLONG 0x82
+#             define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+#         endif
+
+    if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+        || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+        || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+        || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+    {
+        return 1;
+    }
+
+    /* Check for the FF overlong */
+    return S_isFF_OVERLONG(s, len);
+}
+
+int
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+{
+    if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
+                     MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
+    {
+        return 0;
+    }
+
+    if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
+        return 1;
+    }
+
+    return -1;
+}
+
+#  ifndef UTF8_GOT_CONTINUATION
+#    define UTF8_GOT_CONTINUATION       UTF8_ALLOW_CONTINUATION
+#    define UTF8_GOT_EMPTY              UTF8_ALLOW_EMPTY
+#    define UTF8_GOT_LONG               UTF8_ALLOW_LONG
+#    define UTF8_GOT_NON_CONTINUATION   UTF8_ALLOW_NON_CONTINUATION
+#    define UTF8_GOT_SHORT              UTF8_ALLOW_SHORT
+#    define UTF8_GOT_SURROGATE          UTF8_DISALLOW_SURROGATE
+#    define UTF8_GOT_NONCHAR            UTF8_DISALLOW_NONCHAR
+#    define UTF8_GOT_SUPER              UTF8_DISALLOW_SUPER
+#  endif
+
+#  ifndef UNICODE_IS_SUPER
+#    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
+#  endif
+#  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
+#    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)      ((UV) (uv) >= 0xFDD0   \
+                                                   && (UV) (uv) <= 0xFDEF)
+#  endif
+#  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
+#    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                  \
+                                          (((UV) (uv) & 0xFFFE) == 0xFFFE)
+#  endif
+#  ifndef is_NONCHAR_utf8_safe
+#    define is_NONCHAR_utf8_safe(s,e)     /*** GENERATED CODE ***/            \
+( ( ( LIKELY((e) > (s)) ) && ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ) ? ( ( 0xEF == ((const U8*)s)[0] ) ?\
+           ( ( 0xB7 == ((const U8*)s)[1] ) ?                               \
+               ( ( 0x90 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xAF ) ? 3 : 0 )\
+           : ( ( 0xBF == ((const U8*)s)[1] ) && ( ( ((const U8*)s)[2] & 0xFE ) == 0xBE ) ) ? 3 : 0 )\
+       : ( 0xF0 == ((const U8*)s)[0] ) ?                                   \
+           ( ( ( ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
+       : ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ?      \
+           ( ( ( ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
+       : ( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( 0x8F == ((const U8*)s)[1] ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 ) : 0 )
+#  endif
+
+#  ifndef UTF8_IS_NONCHAR
+#    define UTF8_IS_NONCHAR(s, e) (is_NONCHAR_utf8_safe(s,e) > 0)
+#  endif
+#  ifndef UNICODE_IS_NONCHAR
+#    define UNICODE_IS_NONCHAR(uv)                                    \
+    (   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)                       \
+     || (   LIKELY( ! UNICODE_IS_SUPER(uv))                         \
+         && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
+#  endif
+
+#  ifndef UTF8_MAXBYTES
+#    define UTF8_MAXBYTES UTF8_MAXLEN
+#  endif
+
+static UV
+utf8n_to_uvchr_msgs(const U8 *s,
+                    STRLEN curlen,
+                    STRLEN *retlen,
+                    const U32 flags,
+                    U32 * errors,
+                    AV ** msgs)
+{
+    const U8 * const s0 = s;
+    const U8 * send = NULL;
+    U32 possible_problems = 0;
+    UV uv = *s;
+    STRLEN expectlen   = 0;
+    U8 * adjusted_s0 = (U8 *) s0;
+    U8 temp_char_buf[UTF8_MAXBYTES + 1];
+    UV uv_so_far = 0;
+    dTHX;
+
+    assert(errors == NULL); /* This functionality has been stripped */
+
+    if (UNLIKELY(curlen == 0)) {
+        possible_problems |= UTF8_GOT_EMPTY;
+        curlen = 0;
+        uv = UNICODE_REPLACEMENT;
+       goto ready_to_handle_errors;
+    }
+
+    expectlen = UTF8SKIP(s);
+
+    if (retlen) {
+       *retlen = expectlen;
+    }
+
+    if (UTF8_IS_INVARIANT(uv)) {
+       return uv;
+    }
+
+    if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
+       possible_problems |= UTF8_GOT_CONTINUATION;
+        curlen = 1;
+        uv = UNICODE_REPLACEMENT;
+       goto ready_to_handle_errors;
+    }
+
+    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+    send = (U8*) s0;
+    if (UNLIKELY(curlen < expectlen)) {
+        possible_problems |= UTF8_GOT_SHORT;
+        send += curlen;
+    }
+    else {
+        send += expectlen;
+    }
+
+    for (s = s0 + 1; s < send; s++) {
+       if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+           uv = UTF8_ACCUMULATE(uv, *s);
+            continue;
+        }
+
+        possible_problems |= UTF8_GOT_NON_CONTINUATION;
+        break;
+    } /* End of loop through the character's bytes */
+
+    curlen = s - s0;
+
+#     define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+    if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+        uv_so_far = uv;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    if (UNLIKELY(0 < S_does_utf8_overflow(s0, s, 1))) {
+        possible_problems |= UTF8_GOT_OVERFLOW;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    if (     (   LIKELY(! possible_problems)
+              && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+        || (       UNLIKELY(possible_problems)
+            && (   UNLIKELY(! UTF8_IS_START(*s0))
+                || (   curlen > 1
+                    && UNLIKELY(0 < S_is_utf8_overlong_given_start_byte_ok(s0,
+                                                                s - s0))))))
+    {
+        possible_problems |= UTF8_GOT_LONG;
+
+        if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
+            &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
+        {
+            UV min_uv = uv_so_far;
+            STRLEN i;
+
+            for (i = curlen; i < expectlen; i++) {
+                min_uv = UTF8_ACCUMULATE(min_uv,
+                                     I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+            }
+
+            adjusted_s0 = temp_char_buf;
+            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+        }
+    }
+
+    /* Here, we have found all the possible problems, except for when the input
+     * is for a problematic code point not allowed by the input parameters. */
+
+                                /* uv is valid for overlongs */
+    if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
+                   && uv >= UNICODE_SURROGATE_FIRST)
+            || (   UNLIKELY(possible_problems)
+                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+       && ((flags & ( UTF8_DISALLOW_NONCHAR
+                      |UTF8_DISALLOW_SURROGATE
+                      |UTF8_DISALLOW_SUPER))))
+    {
+        if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+            if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                possible_problems |= UTF8_GOT_SURROGATE;
+            }
+            else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+                possible_problems |= UTF8_GOT_NONCHAR;
+            }
+        }
+        else {
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+                                >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+            {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (curlen > 1) {
+                if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SUPER;
+                }
+                else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SURROGATE;
+                }
+            }
+        }
+    }
+
+  ready_to_handle_errors:
+
+    if (UNLIKELY(possible_problems)) {
+        bool disallowed = FALSE;
+        const U32 orig_problems = possible_problems;
+
+        if (msgs) {
+            *msgs = NULL;
+        }
+
+        while (possible_problems) { /* Handle each possible problem */
+            UV pack_warn = 0;
+            char * message = NULL;
+            U32 this_flag_bit = 0;
+
+            /* Each 'if' clause handles one problem.  They are ordered so that
+             * the first ones' messages will be displayed before the later
+             * ones; this is kinda in decreasing severity order.  But the
+             * overlong must come last, as it changes 'uv' looked at by the
+             * others */
+            if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+                /* Overflow means also got a super; we handle both here */
+                possible_problems
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER);
+
+                /* Disallow if any of the categories say to */
+                if ( ! (flags &  UTF8_ALLOW_OVERFLOW)
+                    || (flags &  UTF8_DISALLOW_SUPER))
+                {
+                    disallowed = TRUE;
+                }
+
+                /* Likewise, warn if any say to */
+                if (  ! (flags & UTF8_ALLOW_OVERFLOW)) {
+
+                    /* The warnings code explicitly says it doesn't handle the
+                     * case of packWARN2 and two categories which have
+                     * parent-child relationship.  Even if it works now to
+                     * raise the warning if either is enabled, it wouldn't
+                     * necessarily do so in the future.  We output (only) the
+                     * most dire warning */
+                    if (! (flags & UTF8_CHECK_ONLY)) {
+                        if (msgs || ckWARN_d(WARN_UTF8)) {
+                            pack_warn = packWARN(WARN_UTF8);
+                        }
+                        else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
+                            pack_warn = packWARN(WARN_NON_UNICODE);
+                        }
+                        if (pack_warn) {
+                            message = Perl_form(aTHX_ "%s: %s (overflows)",
+                                            malformed_text,
+                                            _byte_dump_string(s0, curlen));
+                            this_flag_bit = UTF8_GOT_OVERFLOW;
+                        }
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_EMPTY) {
+                possible_problems &= ~UTF8_GOT_EMPTY;
+
+                if (! (flags & UTF8_ALLOW_EMPTY)) {
+                    disallowed = TRUE;
+                    if (  (msgs
+                        || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s (empty string)",
+                                                   malformed_text);
+                        this_flag_bit = UTF8_GOT_EMPTY;
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (unexpected continuation byte 0x%02x,"
+                                " with no preceding start byte)",
+                                malformed_text,
+                                _byte_dump_string(s0, 1), *s0);
+                        this_flag_bit = UTF8_GOT_CONTINUATION;
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SHORT) {
+                possible_problems &= ~UTF8_GOT_SHORT;
+
+                if (! (flags & UTF8_ALLOW_SHORT)) {
+                    disallowed = TRUE;
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                             "%s: %s (too short; %d byte%s available, need %d)",
+                             malformed_text,
+                             _byte_dump_string(s0, send - s0),
+                             (int)curlen,
+                             curlen == 1 ? "" : "s",
+                             (int)expectlen);
+                        this_flag_bit = UTF8_GOT_SHORT;
+                    }
+                }
+
+            }
+            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        int printlen = s - s0;
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s",
+                            S_unexpected_non_continuation_text(s0,
+                                                            printlen,
+                                                            s - s0,
+                                                            (int) expectlen));
+                        this_flag_bit = UTF8_GOT_NON_CONTINUATION;
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SURROGATE) {
+                possible_problems &= ~UTF8_GOT_SURROGATE;
+
+                if (flags & UTF8_WARN_SURROGATE) {
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && (msgs || ckWARN_d(WARN_SURROGATE)))
+                    {
+                        pack_warn = packWARN(WARN_SURROGATE);
+
+                        /* These are the only errors that can occur with a
+                        * surrogate when the 'uv' isn't valid */
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "UTF-16 surrogate (any UTF-8 sequence that"
+                                    " starts with \"%s\" is for a surrogate)",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_ surrogate_cp_format, uv);
+                        }
+                        this_flag_bit = UTF8_GOT_SURROGATE;
+                    }
+                }
+
+                if (flags & UTF8_DISALLOW_SURROGATE) {
+                    disallowed = TRUE;
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SUPER) {
+                possible_problems &= ~UTF8_GOT_SUPER;
+
+                if (flags & UTF8_WARN_SUPER) {
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && (msgs || ckWARN_d(WARN_NON_UNICODE)))
+                    {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "Any UTF-8 sequence that starts with"
+                                    " \"%s\" is for a non-Unicode code point,"
+                                    " may not be portable",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_ super_cp_format, uv);
+                        }
+                        this_flag_bit = UTF8_GOT_SUPER;
+                    }
+                }
+
+                if (flags & UTF8_DISALLOW_SUPER) {
+                    disallowed = TRUE;
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NONCHAR) {
+                possible_problems &= ~UTF8_GOT_NONCHAR;
+
+                if (flags & UTF8_WARN_NONCHAR) {
+
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        && (msgs || ckWARN_d(WARN_NONCHAR)))
+                    {
+                        /* The code above should have guaranteed that we don't
+                         * get here with errors other than overlong */
+                        assert (! (orig_problems
+                                        & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+                        pack_warn = packWARN(WARN_NONCHAR);
+                        message = Perl_form(aTHX_ nonchar_cp_format, uv);
+                        this_flag_bit = UTF8_GOT_NONCHAR;
+                    }
+                }
+
+                if (flags & UTF8_DISALLOW_NONCHAR) {
+                    disallowed = TRUE;
+                }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+
+                if (flags & UTF8_ALLOW_LONG) {
+                    uv = UNICODE_REPLACEMENT;
+                }
+                else {
+                    disallowed = TRUE;
+
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0),
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            /* Don't use U+ for non-Unicode code points, which
+                             * includes those in the Latin1 range */
+                            const char * preface = (    uv > PERL_UNICODE_MAX
+#  ifdef EBCDIC
+                                                     || uv <= 0xFF
+#  endif
+                                                    )
+                                                   ? "0x"
+                                                   : "U+";
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " %s%0*" UVXf ")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf),
+                                preface,
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                UNI_TO_NATIVE(uv));
+                        }
+                        this_flag_bit = UTF8_GOT_LONG;
+                    }
+                }
+            } /* End of looking through the possible flags */
+
+            /* Display the message (if any) for the problem being handled in
+             * this iteration of the loop */
+            if (message) {
+                if (msgs) {
+                    assert(this_flag_bit);
+
+                    if (*msgs == NULL) {
+                        *msgs = newAV();
+                    }
+
+                    av_push(*msgs, newRV_noinc((SV*) S_new_msg_hv(message,
+                                                                pack_warn)));
+                }
+                else if (PL_op)
+                    Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+                                                 OP_DESC(PL_op));
+                else
+                    Perl_warner(aTHX_ pack_warn, "%s", message);
+            }
+        }   /* End of 'while (possible_problems)' */
+
+        if (retlen) {
+            *retlen = curlen;
+        }
+
+        if (disallowed) {
+            if (flags & UTF8_CHECK_ONLY && retlen) {
+                *retlen = ((STRLEN) -1);
+            }
+            return 0;
+        }
+    }
+
+    return UNI_TO_NATIVE(uv);
+}
+
+static STRLEN
+S_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+{
+    STRLEN len;
+    const U8 *x;
+
+    assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
+    assert(! UTF8_IS_INVARIANT(*s));
+
+    if (UNLIKELY(! UTF8_IS_START(*s))) {
+        return 0;
+    }
+
+    /* Examine a maximum of a single whole code point */
+    if (e - s > UTF8SKIP(s)) {
+        e = s + UTF8SKIP(s);
+    }
+
+    len = e - s;
+
+    if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+        const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+
+        if (  (flags & UTF8_DISALLOW_SUPER)
+            && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+        {
+            return 0;           /* Above Unicode */
+        }
+
+        if (len > 1) {
+            const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+            if (   (flags & UTF8_DISALLOW_SUPER)
+                &&  UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
+            {
+                return 0;       /* Above Unicode */
+            }
+
+            if (   (flags & UTF8_DISALLOW_SURROGATE)
+                &&  UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
+            {
+                return 0;       /* Surrogate */
+            }
+
+            if (  (flags & UTF8_DISALLOW_NONCHAR)
+                && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+            {
+                return 0;       /* Noncharacter code point */
+            }
+        }
+    }
+
+    for (x = s + 1; x < e; x++) {
+        if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+            return 0;
+        }
+    }
+
+    if (len > 1 && S_is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
+        return 0;
+    }
+
+    if (0 < S_does_utf8_overflow(s, e, 0)) {
+        return 0;
+    }
+
+    return UTF8SKIP(s);
+}
+
+#  undef is_utf8_valid_partial_char_flags
+
+static bool
+is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+{
+
+    return S_is_utf8_char_helper(s, e, flags) > 0;
+}
+
+#  undef is_utf8_string_loc_flags
+
+static bool
+is_utf8_string_loc_flags(const U8 *s, STRLEN len, const U8 **ep, const U32 flags)
+{
+    const U8* send = s + len;
+
+    assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
+
+    while (s < send) {
+        if (UTF8_IS_INVARIANT(*s)) {
+            s++;
+        }
+        else if (     UNLIKELY(send - s < UTF8SKIP(s))
+                 || ! S_is_utf8_char_helper(s, send, flags))
+        {
+            *ep = s;
+            return 0;
+        }
+        else {
+            s += UTF8SKIP(s);
+        }
+    }
+
+    *ep = send;
+
+    return 1;
+}
+
+#endif
+
+#if defined(IN_UNICODE_XS) && ! defined(uvchr_to_utf8_flags_msgs)
+
+#  define MY_SHIFT   UTF_ACCUMULATION_SHIFT
+#  define MY_MARK    UTF_CONTINUATION_MARK
+#  define MY_MASK    UTF_CONTINUATION_MASK
+
+static const char cp_above_legal_max[] =
+                        "Use of code point 0x%" UVXf " is not allowed; the"
+                        " permissible max is 0x%" UVXf;
+
+/* These two can be dummys, as they are not looked at by the function, which
+ * has hard-coded into it what flags it is expecting are */
+#  ifndef UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
+#    define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 0
+#  endif
+#  ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
+#    define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
+#  endif
+
+#  ifndef OFFUNI_IS_INVARIANT
+#    define OFFUNI_IS_INVARIANT(cp) UNI_IS_INVARIANT(cp)
+#  endif
+#  ifndef MAX_EXTERNALLY_LEGAL_CP
+#    define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
+#  endif
+#  ifndef LATIN1_TO_NATIVE
+#    define LATIN1_TO_NATIVE(a) ASCII_TO_NATIVE(a)
+#  endif
+#  ifndef I8_TO_NATIVE_UTF8
+#    define I8_TO_NATIVE_UTF8(a) NATIVE_TO_UTF(a)
+#  endif
+#  ifndef MAX_UTF8_TWO_BYTE
+#    define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
+#  endif
+#  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
+#    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)    ((UV) (uv) >= 0xFDD0   \
+                                                 && (UV) (uv) <= 0xFDEF)
+#  endif
+#  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
+#    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                \
+                                          (((UV) (uv) & 0xFFFE) == 0xFFFE)
+#  endif
+#  ifndef UNICODE_IS_SUPER
+#    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
+#  endif
+#  ifndef OFFUNISKIP
+#    define OFFUNISKIP(cp)    UNISKIP(NATIVE_TO_UNI(cp))
+#  endif
+
+#  define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                 \
+    STMT_START {                                                    \
+        U32 category = packWARN(WARN_SURROGATE);                    \
+        const char * format = surrogate_cp_format;                  \
+        *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
+                                 category);                         \
+        return NULL;                                                \
+    } STMT_END;
+
+#  define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                   \
+    STMT_START {                                                    \
+        U32 category = packWARN(WARN_NONCHAR);                      \
+        const char * format = nonchar_cp_format;                    \
+        *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
+                                 category);                         \
+        return NULL;                                                \
+    } STMT_END;
+
+static U8 *
+uvchr_to_utf8_flags_msgs(U8 *d, UV uv, const UV flags, HV** msgs)
+{
+    dTHX;
+
+    assert(msgs);
+
+    PERL_UNUSED_ARG(flags);
+
+    uv = NATIVE_TO_UNI(uv);
+
+    *msgs = NULL;
+
+    if (OFFUNI_IS_INVARIANT(uv)) {
+       *d++ = LATIN1_TO_NATIVE(uv);
+       return d;
+    }
+
+    if (uv <= MAX_UTF8_TWO_BYTE) {
+        *d++ = I8_TO_NATIVE_UTF8(( uv >> MY_SHIFT) | UTF_START_MARK(2));
+        *d++ = I8_TO_NATIVE_UTF8(( uv   & MY_MASK) | MY_MARK);
+        return d;
+    }
+
+    /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
+     * below, the 16 is for start bytes E0-EF (which are all the possible ones
+     * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
+     * contribute MY_SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
+     * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
+     * 0x800-0xFFFF on ASCII */
+    if (uv < (16 * (1U << (2 * MY_SHIFT)))) {
+       *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * MY_SHIFT)) | UTF_START_MARK(3));
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+       *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
+
+#ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
+                   aren't tested here */
+        /* The most likely code points in this range are below the surrogates.
+         * Do an extra test to quickly exclude those. */
+        if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
+            if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
+                         || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
+            {
+                HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+            }
+            else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+            }
+        }
+#endif
+       return d;
+    }
+
+    /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
+     * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
+     * happen starting with 4-byte characters on ASCII platforms.  We unify the
+     * code for these with EBCDIC, even though some of them require 5-bytes on
+     * those, because khw believes the code saving is worth the very slight
+     * performance hit on these high EBCDIC code points. */
+
+    if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
+        const char * format = super_cp_format;
+        U32 category = packWARN(WARN_NON_UNICODE);
+        if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
+            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+        }
+        *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), category);
+        return NULL;
+    }
+    else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
+        HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+    }
+
+    /* Test for and handle 4-byte result.   In the test immediately below, the
+     * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
+     * characters).  The 3 is for 3 continuation bytes; these each contribute
+     * MY_SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
+     * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
+     * 0x1_0000-0x1F_FFFF on ASCII */
+    if (uv < (8 * (1U << (3 * MY_SHIFT)))) {
+       *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * MY_SHIFT)) | UTF_START_MARK(4));
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+       *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
+
+#ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
+                   characters.  The end-plane non-characters for EBCDIC were
+                   handled just above */
+        if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
+            HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+        }
+        else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+            HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+        }
+#endif
+
+       return d;
+    }
+
+    /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
+     * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
+     * format.  The unrolled version above turns out to not save all that much
+     * time, and at these high code points (well above the legal Unicode range
+     * on ASCII platforms, and well above anything in common use in EBCDIC),
+     * khw believes that less code outweighs slight performance gains. */
+
+    {
+       STRLEN len  = OFFUNISKIP(uv);
+       U8 *p = d+len-1;
+       while (p > d) {
+           *p-- = I8_TO_NATIVE_UTF8((uv & MY_MASK) | MY_MARK);
+           uv >>= MY_SHIFT;
+       }
+       *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
+       return d+len;
+    }
+}
+
+#endif  /* End of defining our own uvchr_to_utf8_flags_msgs() */
+#endif  /* End of UTF8SKIP */
+
 #endif /* ENCODE_H */
index 2a8b477..540337e 100644 (file)
@@ -3,7 +3,7 @@ package Encode::Unicode;
 use strict;
 use warnings;
 
-our $VERSION = do { my @r = ( q$Revision: 2.17 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.18 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
 
 use XSLoader;
 XSLoader::load( __PACKAGE__, $VERSION );
index b459786..4e111e2 100644 (file)
@@ -1,7 +1,9 @@
 /*
- $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
+ $Id: Unicode.xs,v 2.19 2019/01/21 03:09:59 dankogai Exp $
  */
 
+#define IN_UNICODE_XS
+
 #define PERL_NO_GET_CONTEXT
 #include "EXTERN.h"
 #include "perl.h"
 #define isLoSurrogate(x)       (0xDC00 <= (x)  && (x) <= 0xDFFF )
 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
 
-/* For pre-5.14 source compatibility */
-#ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
-#   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
-#   define UTF8_DISALLOW_SURROGATE 0
-#   define UTF8_WARN_SURROGATE 0
-#   define UTF8_DISALLOW_FE_FF 0
-#   define UTF8_WARN_FE_FF 0
-#   define UTF8_WARN_NONCHAR 0
+#ifndef SVfARG
+#define SVfARG(p) ((void*)(p))
 #endif
 
 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
@@ -68,6 +64,7 @@ enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
     case 'N':
        v = *s++;
        v = (v << 8) | *s++;
+        /* FALLTHROUGH */
     case 'n':
        v = (v << 8) | *s++;
        v = (v << 8) | *s++;
@@ -123,8 +120,8 @@ MODULE = Encode::Unicode PACKAGE = Encode::Unicode
 
 PROTOTYPES: DISABLE
 
-#define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
-    *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
+#define attr(k)  (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
+    *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
 
 void
 decode(obj, str, check = 0)
@@ -133,9 +130,10 @@ SV *       str
 IV     check
 CODE:
 {
-    SV *sve      = attr("endian", 6);
+    SV *name     = attr("Name");
+    SV *sve      = attr("endian");
     U8 endian    = *((U8 *)SvPV_nolen(sve));
-    SV *svs      = attr("size", 4);
+    SV *svs      = attr("size");
     int size     = SvIV(svs);
     int ucs2     = -1; /* only needed in the event of surrogate pairs */
     SV *result   = newSVpvn("",0);
@@ -209,7 +207,7 @@ CODE:
        }
 #if 1
        /* Update endian for next sequence */
-       sv = attr("renewed", 7);
+       sv = attr("renewed");
        if (SvTRUE(sv)) {
            (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
        }
@@ -227,59 +225,68 @@ CODE:
     while (s < e && s+size <= e) {
        UV ord = enc_unpack(aTHX_ &s,e,size,endian);
        U8 *d;
+       HV *hv = NULL;
        if (issurrogate(ord)) {
            if (ucs2 == -1) {
-               SV *sv = attr("ucs2", 4);
+               SV *sv = attr("ucs2");
                ucs2 = SvTRUE(sv);
            }
            if (ucs2 || size == 4) {
-               if (check) {
+               if (check & ENCODE_DIE_ON_ERR) {
                    croak("%" SVf ":no surrogates allowed %" UVxf,
-                         *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                         ord);
+                         SVfARG(name), ord);
+               }
+               if (encode_ckWARN(check, WARN_SURROGATE)) {
+                   warner(packWARN(WARN_SURROGATE),
+                         "%" SVf ":no surrogates allowed %" UVxf,
+                         SVfARG(name), ord);
                }
                ord = FBCHAR;
            }
            else {
                UV lo;
                if (!isHiSurrogate(ord)) {
-                   if (check) {
+                   if (check & ENCODE_DIE_ON_ERR) {
                        croak("%" SVf ":Malformed HI surrogate %" UVxf,
-                             *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                             ord);
+                             SVfARG(name), ord);
                    }
-                   else {
-                       ord = FBCHAR;
+                   if (encode_ckWARN(check, WARN_SURROGATE)) {
+                       warner(packWARN(WARN_SURROGATE),
+                             "%" SVf ":Malformed HI surrogate %" UVxf,
+                             SVfARG(name), ord);
                    }
+                   ord = FBCHAR;
                }
                else if (s+size > e) {
-                   if (check) {
-                       if (check & ENCODE_STOP_AT_PARTIAL) {
-                            s -= size;
-                            break;
-                       }
-                       else {
-                            croak("%" SVf ":Malformed HI surrogate %" UVxf,
-                                  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                                  ord);
-                       }
+                   if (check & ENCODE_STOP_AT_PARTIAL) {
+                       s -= size;
+                       break;
                    }
-                   else {
-                       ord = FBCHAR;
+                   if (check & ENCODE_DIE_ON_ERR) {
+                       croak("%" SVf ":Malformed HI surrogate %" UVxf,
+                             SVfARG(name), ord);
+                   }
+                   if (encode_ckWARN(check, WARN_SURROGATE)) {
+                       warner(packWARN(WARN_SURROGATE),
+                             "%" SVf ":Malformed HI surrogate %" UVxf,
+                             SVfARG(name), ord);
                    }
+                   ord = FBCHAR;
                }
                else {
                    lo = enc_unpack(aTHX_ &s,e,size,endian);
                    if (!isLoSurrogate(lo)) {
-                       if (check) {
+                       if (check & ENCODE_DIE_ON_ERR) {
                            croak("%" SVf ":Malformed LO surrogate %" UVxf,
-                                 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                                 ord);
+                                 SVfARG(name), ord);
                        }
-                       else {
-                           s -= size;
-                           ord = FBCHAR;
+                       if (encode_ckWARN(check, WARN_SURROGATE)) {
+                           warner(packWARN(WARN_SURROGATE),
+                                 "%" SVf ":Malformed LO surrogate %" UVxf,
+                                 SVfARG(name), ord);
                        }
+                       s -= size;
+                       ord = FBCHAR;
                    }
                    else {
                        ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
@@ -289,13 +296,16 @@ CODE:
        }
 
        if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
-           if (check) {
+           if (check & ENCODE_DIE_ON_ERR) {
                croak("%" SVf ":Unicode character %" UVxf " is illegal",
-                     *hv_fetch((HV *)SvRV(obj),"Name",4,0),
-                     ord);
-           } else {
-               ord = FBCHAR;
+                     SVfARG(name), ord);
            }
+           if (encode_ckWARN(check, WARN_NONCHAR)) {
+               warner(packWARN(WARN_NONCHAR),
+                     "%" SVf ":Unicode character %" UVxf " is illegal",
+                     SVfARG(name), ord);
+           }
+           ord = FBCHAR;
        }
 
        if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
@@ -315,16 +325,27 @@ CODE:
            resultbuflen = SvLEN(result);
        }
 
-       d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
-                                            UNICODE_WARN_ILLEGAL_INTERCHANGE);
+        d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
+        if (hv) {
+            SV *message = *hv_fetch(hv, "text", 4, 0);
+            U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
+            sv_2mortal((SV *)hv);
+            if (check & ENCODE_DIE_ON_ERR)
+                croak("%" SVf, SVfARG(message));
+            if (encode_ckWARN_packed(check, categories))
+                warner(categories, "%" SVf, SVfARG(message));
+            d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
+        }
+
        SvCUR_set(result, d - (U8 *)SvPVX(result));
     }
 
     if (s < e) {
        /* unlikely to happen because it's fixed-length -- dankogai */
-       if (check & ENCODE_WARN_ON_ERR) {
-           Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character",
-                       *hv_fetch((HV *)SvRV(obj),"Name",4,0));
+        if (check & ENCODE_DIE_ON_ERR)
+            croak("%" SVf ":Partial character", SVfARG(name));
+        if (encode_ckWARN(check, WARN_UTF8)) {
+            warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
        }
     }
     if (check && !(check & ENCODE_LEAVE_SRC)) {
@@ -351,9 +372,10 @@ SV *       utf8
 IV     check
 CODE:
 {
-    SV *sve = attr("endian", 6);
+    SV *name = attr("Name");
+    SV *sve = attr("endian");
     U8 endian = *((U8 *)SvPV_nolen(sve));
-    SV *svs = attr("size", 4);
+    SV *svs = attr("size");
     const int size = SvIV(svs);
     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
     const STRLEN usize = (size > 0 ? size : 1);
@@ -399,32 +421,50 @@ CODE:
        enc_pack(aTHX_ result,size,endian,BOM_BE);
 #if 1
        /* Update endian for next sequence */
-       sv = attr("renewed", 7);
+       sv = attr("renewed");
        if (SvTRUE(sv)) {
            (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
        }
 #endif
     }
     while (s < e && s+UTF8SKIP(s) <= e) {
-       STRLEN len;
-       UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
-                                               |UTF8_WARN_SURROGATE
-                                               |UTF8_DISALLOW_FE_FF
-                                               |UTF8_WARN_FE_FF
-                                               |UTF8_WARN_NONCHAR));
-       s += len;
-       if (size != 4 && invalid_ucs2(ord)) {
+        STRLEN len;
+        AV *msgs = NULL;
+        UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
+        if (msgs) {
+            SSize_t i;
+            SSize_t len = av_len(msgs)+1;
+            sv_2mortal((SV *)msgs);
+            for (i = 0; i < len; ++i) {
+                SV *sv = *av_fetch(msgs, i, 0);
+                HV *hv = (HV *)SvRV(sv);
+                SV *message = *hv_fetch(hv, "text", 4, 0);
+                U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
+                if (check & ENCODE_DIE_ON_ERR)
+                    croak("%" SVf, SVfARG(message));
+                if (encode_ckWARN_packed(check, categories))
+                    warner(categories, "%" SVf, SVfARG(message));
+            }
+        }
+       if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
            if (!issurrogate(ord)) {
                if (ucs2 == -1) {
-                   SV *sv = attr("ucs2", 4);
+                   SV *sv = attr("ucs2");
                    ucs2 = SvTRUE(sv);
                }
                if (ucs2 || ord > 0x10FFFF) {
-                   if (check) {
+                   if (check & ENCODE_DIE_ON_ERR) {
                        croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
-                                 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
+                                 SVfARG(name),ord);
+                   }
+                   if (encode_ckWARN(check, WARN_NON_UNICODE)) {
+                       warner(packWARN(WARN_NON_UNICODE),
+                                 "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
+                                 SVfARG(name),ord);
                    }
                    enc_pack(aTHX_ result,size,endian,FBCHAR);
+               } else if (ord == 0) {
+                   enc_pack(aTHX_ result,size,endian,FBCHAR);
                } else {
                    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
                    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
@@ -440,6 +480,7 @@ CODE:
        else {
            enc_pack(aTHX_ result,size,endian,ord);
        }
+       s += len;
     }
     if (s < e) {
        /* UTF-8 partial char happens often on PerlIO.
@@ -449,7 +490,7 @@ CODE:
        if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
            Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
                       "when CHECK = 0x%" UVuf,
-                      *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
+                      SVfARG(name), check);
        }
     }
     if (check && !(check & ENCODE_LEAVE_SRC)) {
index 67613a8..65ab383 100644 (file)
@@ -24,7 +24,7 @@ The process can be considered as pseudo perl:
 my $dst = '';
 while (length($src))
  {
-  my $size    = $count($src);
+  my $size    = src_count($src);
   my $in_seq  = substr($src,0,$size,'');
   my $out_seq = $s2d_hash{$in_seq};
   if (defined $out_seq)
@@ -101,6 +101,8 @@ do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
     U8 *d = dst;
     U8 *dend = d + dlen, *dlast = d;
     int code = 0;
+    if (!dst)
+      return ENCODE_NOSPACE;
     while (s < send) {
         const encpage_t *e = enc;
         U8 byte = *s;
index 93c992c..0c3b669 100644 (file)
@@ -51,9 +51,12 @@ $orig = "\xC3\x80";
 $orig =~ /(..)/;
 is(Encode::decode_utf8($1), "\N{U+C0}", 'passing magic regex to Encode::decode_utf8');
 
-$orig = "\xC3\x80";
-*a = $orig;
-is(Encode::decode_utf8(*a), "*main::\N{U+C0}", 'passing typeglob to Encode::decode_utf8');
+SKIP: {
+    skip "Perl Version ($]) is older than v5.27.1", 1 if $] < 5.027001;
+    $orig = "\xC3\x80";
+    *a = $orig;
+    is(Encode::decode_utf8(*a), "*main::\N{U+C0}", 'passing typeglob to Encode::decode_utf8');
+}
 
 $orig = "\N{U+C0}";
 $orig =~ /(.)/;
index fc0af3c..8f933b0 100644 (file)
@@ -25,6 +25,10 @@ BEGIN {
     }
 }
 
+use Encode qw();
+$PerlIO::encoding::fallback |= Encode::ONLY_PRAGMA_WARNINGS;
+use warnings "utf8";
+
 no warnings "deprecated";
 use encoding 'euc-jp';
 
diff --git a/cpan/Encode/t/utf8messages.t b/cpan/Encode/t/utf8messages.t
deleted file mode 100644 (file)
index 8b6b379..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-use strict;
-use warnings;
-BEGIN { 'warnings'->unimport('utf8') if $] < 5.014 }; # turn off 'UTF-16 surrogate 0xd800' warnings
-
-use Test::More;
-use Encode qw(encode decode FB_CROAK LEAVE_SRC);
-
-plan tests => 12;
-
-my @invalid;
-
-ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8';
-like $@, qr/^"\\x\{d800\}" does not map to UTF-8 /, 'Error message contains strict UTF-8 name';
-@invalid = ();
-encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; });
-is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800';
-
-ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder';
-like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
-@invalid = ();
-decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; });
-is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80';
-
-ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder';
-like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
-@invalid = ();
-decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; });
-is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';
-
-ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder';
-like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence';
-decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; });
-is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';
index 0d1ac6d..e4e4304 100644 (file)
@@ -1,94 +1,35 @@
 use strict;
 use warnings;
-BEGIN {
-    if ($] < 5.014){
-        print "1..0 # Skip: Perl 5.14.0 or later required\n";
-        exit 0;
-    }
-}
+BEGIN { 'warnings'->unimport('utf8') if $] < 5.014 }; # turn off 'UTF-16 surrogate 0xd800' warnings
 
-use Encode;
-use Test::More tests => 10;
+use Test::More;
+use Encode qw(encode decode FB_CROAK LEAVE_SRC);
 
-my $valid   = "\x61\x00\x00\x00";
-my $invalid = "\x78\x56\x34\x12";
+my $script = quotemeta $0;
 
-my @warnings;
-$SIG{__WARN__} = sub {push @warnings, "@_"};
+plan tests => 12;
 
-my $enc = find_encoding("UTF32-LE");
+my @invalid;
 
-{
-    @warnings = ();
-    my $ret = Encode::Unicode::decode( $enc, $valid );
-    is("@warnings", "", "Calling decode in Encode::Unicode on valid string produces no warnings");
-}
+ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8';
+like $@, qr/^"\\x\{d800\}" does not map to UTF-8 at $script line /, 'Error message contains strict UTF-8 name';
+@invalid = ();
+encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800';
 
+ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder';
+like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
+@invalid = ();
+decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80';
 
+ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder';
+like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
+@invalid = ();
+decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';
 
-{
-    @warnings = ();
-    my $ret = Encode::Unicode::decode( $enc, $invalid );
-    like("@warnings", qr/is not Unicode/, "Calling decode in Encode::Unicode on invalid string warns");
-}
-
-{
-    no warnings 'utf8';
-    @warnings = ();
-    my $ret = Encode::Unicode::decode( $enc, $invalid );
-    is("@warnings", "", "Warning from decode in Encode::Unicode can be silenced via no warnings 'utf8'");
-}
-
-{
-    no warnings;
-    @warnings = ();
-    my $ret = Encode::Unicode::decode( $enc, $invalid );
-    is("@warnings", "", "Warning from decode in Encode::Unicode can be silenced via no warnings");
-}
-
-
-
-{
-    @warnings = ();
-    my $ret = Encode::decode( $enc, $invalid );
-    like("@warnings", qr/is not Unicode/, "Calling decode in Encode on invalid string warns");
-}
-
-{
-    no warnings 'utf8';
-    @warnings = ();
-    my $ret = Encode::decode( $enc, $invalid );
-    is("@warnings", "", "Warning from decode in Encode can be silenced via no warnings 'utf8'");
-};
-
-{
-    no warnings;
-    @warnings = ();
-    my $ret = Encode::decode( $enc, $invalid );
-    is("@warnings", "", "Warning from decode in Encode can be silenced via no warnings");
-};
-
-
-
-{
-    @warnings = ();
-    my $inplace = $invalid;
-    Encode::from_to( $inplace, "UTF32-LE", "UTF-8" );
-    like("@warnings", qr/is not Unicode/, "Calling from_to in Encode on invalid string warns");
-}
-
-{
-    no warnings 'utf8';
-    @warnings = ();
-    my $inplace = $invalid;
-    Encode::from_to( $inplace, "UTF32-LE", "UTF-8" );
-    is("@warnings", "", "Warning from from_to in Encode can be silenced via no warnings 'utf8'");
-};
-
-{
-    no warnings;
-    @warnings = ();
-    my $inplace = $invalid;
-    Encode::from_to( $inplace, "UTF32-LE", "UTF-8" );
-    is("@warnings", "", "Warning from from_to in Encode can be silenced via no warnings");
-};
+ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder';
+like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode at $script line /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence';
+decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';