/*
- $Id: Encode.xs,v 2.43 2018/02/21 12:14:33 dankogai Exp dankogai $
+ $Id: Encode.xs,v 2.45 2019/01/21 03:13:35 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
+#define IN_ENCODE_XS
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
encode_method(). 1 is recommended. 2 restores NI-S original */
#define ENCODE_XS_USEFP 1
-#define UNIMPLEMENTED(x,y) static y x (SV *sv, char *encoding) { \
- Perl_croak_nocontext("panic_unimplemented"); \
- PERL_UNUSED_VAR(sv); \
- PERL_UNUSED_VAR(encoding); \
- return (y)0; /* fool picky compilers */ \
- }
-/**/
-
-UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
-UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
-
#ifndef SvIV_nomg
#define SvIV_nomg SvIV
#endif
-#ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-# define UTF8_DISALLOW_ILLEGAL_INTERCHANGE 0
-# define UTF8_ALLOW_NON_STRICT (UTF8_ALLOW_FE_FF|UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
-#else
-# define UTF8_ALLOW_NON_STRICT 0
+#ifndef SvTRUE_nomg
+#define SvTRUE_nomg SvTRUE
+#endif
+
+#ifndef SVfARG
+#define SVfARG(p) ((void*)(p))
#endif
static void
}
static void
-call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
-{
- /* Exists for breakpointing */
- PERL_UNUSED_VAR(routine);
- PERL_UNUSED_VAR(done);
- PERL_UNUSED_VAR(dest);
- PERL_UNUSED_VAR(orig);
-}
-
-static void
utf8_safe_downgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)
{
if (!modify) {
static SV *
encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
- int check, STRLEN * offset, SV * term, int * retcode,
+ IV check, STRLEN * offset, SV * term, int * retcode,
SV *fallback_cb)
{
STRLEN tlen = slen;
(UV)ch, enc->name[0]);
return &PL_sv_undef; /* never reaches but be safe */
}
- if (check & ENCODE_WARN_ON_ERR){
+ if (encode_ckWARN(check, WARN_UTF8)) {
Perl_warner(aTHX_ packWARN(WARN_UTF8),
ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]);
}
enc->name[0], (UV)s[slen]);
return &PL_sv_undef; /* never reaches but be safe */
}
- if (check & ENCODE_WARN_ON_ERR){
+ if (encode_ckWARN(check, WARN_UTF8)) {
Perl_warner(
aTHX_ packWARN(WARN_UTF8),
ERR_DECODE_NOMAP,
return SvTRUE(*svp);
}
-/* Modern perls have the capability to do this more efficiently and portably */
-#ifdef utf8n_to_uvchr_msgs
-# define CAN_USE_BASE_PERL
-#endif
-
-#ifndef CAN_USE_BASE_PERL
-
-/*
- * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126
- */
-#ifndef UNICODE_IS_NONCHAR
-#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE)
-#endif
-
-#ifndef UNICODE_IS_SUPER
-#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX)
-#endif
-
-#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c))
-
-#ifndef UTF_ACCUMULATION_OVERFLOW_MASK
-#ifndef CHARBITS
-#define CHARBITS CHAR_BIT
-#endif
-#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT))
-#endif
-
-/*
- * Convert non strict utf8 sequence of len >= 2 to unicode codepoint
- */
-static UV
-convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
-{
- UV uv;
- U8 *ptr = s;
- bool overflowed = 0;
-
- uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));
-
- len--;
- s++;
-
- while (len--) {
- if (!UTF8_IS_CONTINUATION(*s)) {
- *rlen = s-ptr;
- return 0;
- }
- if (uv & UTF_ACCUMULATION_OVERFLOW_MASK)
- overflowed = 1;
- uv = UTF8_ACCUMULATE(uv, *s);
- s++;
- }
-
- *rlen = s-ptr;
-
- if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) {
- return 0;
- }
-
- return uv;
-}
-
-#endif /* CAN_USE_BASE_PERL */
-
static U8*
process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
bool encode, bool strict, bool stop_at_partial)
UV uv;
STRLEN ulen;
SV *fallback_cb;
- int check;
+ IV check;
U8 *d;
STRLEN dlen;
char esc[UTF8_MAXLEN * 6 + 1];
STRLEN i;
const U32 flags = (strict)
? UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- : UTF8_ALLOW_NON_STRICT;
+ : 0;
- if (SvROK(check_sv)) {
+ if (!SvOK(check_sv)) {
+ fallback_cb = &PL_sv_undef;
+ check = 0;
+ }
+ else if (SvROK(check_sv)) {
/* croak("UTF-8 decoder doesn't support callback CHECK"); */
fallback_cb = check_sv;
check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as perlqq */
while (s < e) {
-#ifdef CAN_USE_BASE_PERL /* Use the much faster, portable implementation if
- available */
-
/* If there were no errors, this will be 'e'; otherwise it will point
* to the first byte of the erroneous input */
const U8* e_or_where_failed;
* point, or the best substitution for it */
uv = utf8n_to_uvchr(s, e - s, &ulen, UTF8_ALLOW_ANY);
-#else /* Use code for earlier perls */
-
- ((void)sizeof(flags)); /* Avoid compiler warning */
-
- if (UTF8_IS_INVARIANT(*s)) {
- *d++ = *s++;
- continue;
- }
-
- uv = 0;
- ulen = 1;
- if (! UTF8_IS_CONTINUATION(*s)) {
- /* Not an invariant nor a continuation; must be a start byte. (We
- * can't test for UTF8_IS_START as that excludes things like \xC0
- * which are start bytes, but always lead to overlongs */
-
- U8 skip = UTF8SKIP(s);
- if ((s + skip) > e) {
- /* just calculate ulen, in pathological cases can be smaller then e-s */
- if (e-s >= 2)
- convert_utf8_multi_seq(s, e-s, &ulen);
- else
- ulen = 1;
-
- if (stop_at_partial && ulen == (STRLEN)(e-s))
- break;
-
- goto malformed_byte;
- }
-
- uv = convert_utf8_multi_seq(s, skip, &ulen);
- if (uv == 0)
- goto malformed_byte;
- else if (strict && !UNICODE_IS_STRICT(uv))
- goto malformed;
-
-
- /* Whole char is good */
- memcpy(d, s, skip);
- d += skip;
- s += skip;
- continue;
- }
-
- /* If we get here there is something wrong with alleged UTF-8 */
- /* uv is used only when encoding */
- malformed_byte:
- if (uv == 0)
- uv = (UV)*s;
- if (encode || ulen == 0)
- ulen = 1;
-
- malformed:
-
-#endif /* The two versions for processing come back together here, for the
- * error handling code.
- *
+ /*
* Here, we are looping through the input and found an error.
* 'uv' is the code point in error if calculable, or the REPLACEMENT
* CHARACTER if not.
else
Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
}
- if (check & ENCODE_WARN_ON_ERR){
+ if (encode_ckWARN(check, WARN_UTF8)) {
if (encode)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
return s;
}
+static SV *
+find_encoding(pTHX_ SV *enc)
+{
+ dSP;
+ I32 count;
+ SV *m_enc;
+ SV *obj = &PL_sv_undef;
+#ifndef SV_NOSTEAL
+ U32 tmp;
+#endif
+
+ ENTER;
+ SAVETMPS;
+ PUSHMARK(sp);
+
+ m_enc = sv_newmortal();
+#ifndef SV_NOSTEAL
+ tmp = SvFLAGS(enc) & SVs_TEMP;
+ SvTEMP_off(enc);
+ sv_setsv_flags(m_enc, enc, 0);
+ SvFLAGS(enc) |= tmp;
+#else
+#if SV_NOSTEAL == 0
+ #error You have broken SV_NOSTEAL which cause memory corruption in sv_setsv_flags()
+ #error Most probably broken SV_NOSTEAL was defined by buggy version of ppport.h
+#else
+ sv_setsv_flags(m_enc, enc, SV_NOSTEAL);
+#endif
+#endif
+ XPUSHs(m_enc);
+
+ PUTBACK;
+
+ count = call_pv("Encode::find_encoding", G_SCALAR);
+
+ SPAGAIN;
+
+ if (count > 0) {
+ obj = POPs;
+ SvREFCNT_inc(obj);
+ }
+
+ PUTBACK;
+ FREETMPS;
+ LEAVE;
+ return sv_2mortal(obj);
+}
+
+static SV *
+call_encoding(pTHX_ const char *method, SV *obj, SV *src, SV *check)
+{
+ dSP;
+ I32 count;
+ SV *dst = &PL_sv_undef;
+
+ PUSHMARK(sp);
+
+ if (check)
+ check = sv_2mortal(newSVsv(check));
+
+ if (!check || SvROK(check) || !SvTRUE_nomg(check) || (SvIV_nomg(check) & ENCODE_LEAVE_SRC))
+ src = sv_2mortal(newSVsv(src));
+
+ XPUSHs(obj);
+ XPUSHs(src);
+ XPUSHs(check ? check : &PL_sv_no);
+
+ PUTBACK;
+
+ count = call_method(method, G_SCALAR);
+
+ SPAGAIN;
+
+ if (count > 0) {
+ dst = POPs;
+ SvREFCNT_inc(dst);
+ }
+
+ PUTBACK;
+ return dst;
+}
+
MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
U8 *e;
SV *dst;
bool renewed = 0;
- int check;
+ IV check;
bool modify;
dSP;
INIT:
U8 *s;
U8 *e;
SV *dst;
- int check;
+ IV check;
bool modify;
INIT:
SvGETMAGIC(src);
SV * term
SV * check_sv
PREINIT:
- int check;
+ IV check;
SV *fallback_cb;
bool modify;
encode_t *enc;
SV * src
SV * check_sv
PREINIT:
- int check;
+ IV check;
SV *fallback_cb;
bool modify;
encode_t *enc;
SV * src
SV * check_sv
PREINIT:
- int check;
+ IV check;
SV *fallback_cb;
bool modify;
encode_t *enc;
PROTOTYPES: ENABLE
-I32
-_bytes_to_utf8(sv, ...)
-SV * sv
-PREINIT:
- SV * encoding;
-INIT:
- encoding = items == 2 ? ST(1) : Nullsv;
-CODE:
- if (encoding)
- RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
- else {
- STRLEN len;
- U8* s = (U8*)SvPV(sv, len);
- U8* converted;
-
- converted = bytes_to_utf8(s, &len); /* This allocs */
- sv_setpvn(sv, (char *)converted, len);
- SvUTF8_on(sv); /* XXX Should we? */
- Safefree(converted); /* ... so free it */
- RETVAL = len;
- }
-OUTPUT:
- RETVAL
-
-I32
-_utf8_to_bytes(sv, ...)
-SV * sv
-PREINIT:
- SV * to;
- SV * check;
-INIT:
- to = items > 1 ? ST(1) : Nullsv;
- check = items > 2 ? ST(2) : Nullsv;
-CODE:
- if (to) {
- RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
- } else {
- STRLEN len;
- U8 *s = (U8*)SvPV(sv, len);
-
- RETVAL = 0;
- if (SvTRUE(check)) {
- /* Must do things the slow way */
- U8 *dest;
- /* We need a copy to pass to check() */
- U8 *src = s;
- U8 *send = s + len;
- U8 *d0;
-
- New(83, dest, len, U8); /* I think */
- d0 = dest;
-
- while (s < send) {
- if (*s < 0x80){
- *dest++ = *s++;
- } else {
- STRLEN ulen;
- UV uv = *s++;
-
- /* Have to do it all ourselves because of error routine,
- aargh. */
- if (!(uv & 0x40)){ goto failure; }
- if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
- else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
- else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
- else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
- else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
- else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
- else { ulen = 13; uv = 0; }
-
- /* Note change to utf8.c variable naming, for variety */
- while (ulen--) {
- if ((*s & 0xc0) != 0x80){
- goto failure;
- } else {
- uv = (uv << 6) | (*s++ & 0x3f);
- }
- }
- if (uv > 256) {
- failure:
- call_failure(check, s, dest, src);
- /* Now what happens? */
- }
- *dest++ = (U8)uv;
- }
- }
- RETVAL = dest - d0;
- sv_usepvn(sv, (char *)dest, RETVAL);
- SvUTF8_off(sv);
- } else {
- RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
- }
- }
-OUTPUT:
- RETVAL
-
bool
is_utf8(sv, check = 0)
SV * sv
OUTPUT:
RETVAL
+SV *
+decode(encoding, octets, check = NULL)
+SV * encoding
+SV * octets
+SV * check
+ALIAS:
+ bytes2str = 0
+PREINIT:
+ SV *obj;
+INIT:
+ SvGETMAGIC(encoding);
+CODE:
+ if (!SvOK(encoding))
+ croak("Encoding name should not be undef");
+ obj = find_encoding(aTHX_ encoding);
+ if (!SvOK(obj))
+ croak("Unknown encoding '%" SVf "'", SVfARG(encoding));
+ RETVAL = call_encoding(aTHX_ "decode", obj, octets, check);
+OUTPUT:
+ RETVAL
+
+SV *
+encode(encoding, string, check = NULL)
+SV * encoding
+SV * string
+SV * check
+ALIAS:
+ str2bytes = 0
+PREINIT:
+ SV *obj;
+INIT:
+ SvGETMAGIC(encoding);
+CODE:
+ if (!SvOK(encoding))
+ croak("Encoding name should not be undef");
+ obj = find_encoding(aTHX_ encoding);
+ if (!SvOK(obj))
+ croak("Unknown encoding '%" SVf "'", SVfARG(encoding));
+ RETVAL = call_encoding(aTHX_ "encode", obj, string, check);
+OUTPUT:
+ RETVAL
+
+SV *
+decode_utf8(octets, check = NULL)
+SV * octets
+SV * check
+PREINIT:
+ HV *hv;
+ SV **sv;
+CODE:
+ hv = get_hv("Encode::Encoding", 0);
+ if (!hv)
+ croak("utf8 encoding was not found");
+ sv = hv_fetch(hv, "utf8", 4, 0);
+ if (!sv || !*sv || !SvOK(*sv))
+ croak("utf8 encoding was not found");
+ RETVAL = call_encoding(aTHX_ "decode", *sv, octets, check);
+OUTPUT:
+ RETVAL
+
+SV *
+encode_utf8(string)
+SV * string
+CODE:
+ RETVAL = newSVsv(string);
+ if (SvOK(RETVAL))
+ sv_utf8_encode(RETVAL);
+OUTPUT:
+ RETVAL
+
+SV *
+from_to(octets, from, to, check = NULL)
+SV * octets
+SV * from
+SV * to
+SV * check
+PREINIT:
+ SV *from_obj;
+ SV *to_obj;
+ SV *string;
+ SV *new_octets;
+ U8 *ptr;
+ STRLEN len;
+INIT:
+ SvGETMAGIC(from);
+ SvGETMAGIC(to);
+CODE:
+ if (!SvOK(from) || !SvOK(to))
+ croak("Encoding name should not be undef");
+ from_obj = find_encoding(aTHX_ from);
+ if (!SvOK(from_obj))
+ croak("Unknown encoding '%" SVf "'", SVfARG(from));
+ to_obj = find_encoding(aTHX_ to);
+ if (!SvOK(to_obj))
+ croak("Unknown encoding '%" SVf "'", SVfARG(to));
+ string = sv_2mortal(call_encoding(aTHX_ "decode", from_obj, octets, NULL));
+ new_octets = sv_2mortal(call_encoding(aTHX_ "encode", to_obj, string, check));
+ SvGETMAGIC(new_octets);
+ if (SvOK(new_octets) && (!check || SvROK(check) || !SvTRUE_nomg(check) || sv_len(string) == 0)) {
+ ptr = (U8 *)SvPV_nomg(new_octets, len);
+ if (SvUTF8(new_octets))
+ len = utf8_length(ptr, ptr+len);
+ RETVAL = newSVuv(len);
+ } else {
+ RETVAL = &PL_sv_undef;
+ }
+ sv_setsv_nomg(octets, new_octets);
+ SvSETMAGIC(octets);
+OUTPUT:
+ RETVAL
+
void
onBOOT()
CODE:
BOOT:
{
- HV *stash = gv_stashpvn("Encode", strlen("Encode"), GV_ADD);
+ HV *stash = gv_stashpvn("Encode", (U32)strlen("Encode"), GV_ADD);
newCONSTSUB(stash, "DIE_ON_ERR", newSViv(ENCODE_DIE_ON_ERR));
newCONSTSUB(stash, "WARN_ON_ERR", newSViv(ENCODE_WARN_ON_ERR));
newCONSTSUB(stash, "RETURN_ON_ERR", newSViv(ENCODE_RETURN_ON_ERR));
newCONSTSUB(stash, "LEAVE_SRC", newSViv(ENCODE_LEAVE_SRC));
+ newCONSTSUB(stash, "ONLY_PRAGMA_WARNINGS", newSViv(ENCODE_ONLY_PRAGMA_WARNINGS));
newCONSTSUB(stash, "PERLQQ", newSViv(ENCODE_PERLQQ));
newCONSTSUB(stash, "HTMLCREF", newSViv(ENCODE_HTMLCREF));
newCONSTSUB(stash, "XMLCREF", newSViv(ENCODE_XMLCREF));
#define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */
#define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */
#define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */
+#define ENCODE_ONLY_PRAGMA_WARNINGS 0x0010 /* when enabled report only warnings configured by pragma warnings, otherwise report all warnings; no effect without ENCODE_WARN_ON_ERR */
#define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */
#define ENCODE_HTMLCREF 0x0200 /* HTML character ref. fb mode */
#define ENCODE_XMLCREF 0x0400 /* XML character ref. fb mode */
#define ENCODE_FB_HTMLCREF (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
#define ENCODE_FB_XMLCREF (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
+#define encode_ckWARN(c, w) ((c & ENCODE_WARN_ON_ERR) \
+ && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || ckWARN(w)))
+
+#ifdef UTF8SKIP
+# ifdef EBCDIC /* The value on early perls is wrong */
+# undef UTF8_MAXBYTES
+# define UTF8_MAXBYTES 14
+# endif
+# ifndef UNLIKELY
+# define UNLIKELY(x) (x)
+# endif
+# ifndef LIKELY
+# define LIKELY(x) (x)
+# endif
+
+/* EBCDIC requires a later perl to work, so the next two definitions are for
+ * ASCII machines only */
+# ifndef NATIVE_UTF8_TO_I8
+# define NATIVE_UTF8_TO_I8(x) (x)
+# endif
+# ifndef I8_TO_NATIVE_UTF8
+# define I8_TO_NATIVE_UTF8(x) (x)
+# endif
+# ifndef OFFUNISKIP
+# define OFFUNISKIP(x) UNISKIP(x)
+# endif
+# ifndef uvoffuni_to_utf8_flags
+# define uvoffuni_to_utf8_flags(a,b,c) uvuni_to_utf8_flags(a,b,c)
+# endif
+# ifndef WARN_SURROGATE /* Use the overarching category if these
+ subcategories are missing */
+# define WARN_SURROGATE WARN_UTF8
+# define WARN_NONCHAR WARN_UTF8
+# define WARN_NON_UNICODE WARN_UTF8
+ /* If there's only one possible category, then packing is a no-op */
+# define encode_ckWARN_packed(c, w) encode_ckWARN(c, w)
+# else
+# define encode_ckWARN_packed(c, w) \
+ ((c & ENCODE_WARN_ON_ERR) \
+ && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || Perl_ckwarn(aTHX_ w)))
+# endif
+
+/* All these formats take a single UV code point argument */
+static const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
+static const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
+ " is not recommended for open interchange";
+static const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
+ " may not be portable";
+
+/* If the perl doesn't have the 5.28 functions, this file includes
+ * stripped-down versions of them but containing enough functionality to be
+ * suitable for Encode's needs. Many of the comments have been removed. But
+ * you can inspect the 5.28 source if you get stuck.
+ *
+ * These could be put in Devel::PPPort, but Encode is likely the only user */
+
+#if (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS)) \
+ && (! defined(utf8n_to_uvchr_msgs) && ! defined(uvchr_to_utf8_flags_msgs))
+
+# ifndef hv_stores
+# define hv_stores(hv, key, val) hv_store((hv), ("" key ""), (sizeof(key)-1), (val), 0)
+# endif
+
+static HV *
+S_new_msg_hv(const char * const message, /* The message text */
+ U32 categories) /* Packed warning categories */
+{
+ /* Creates, populates, and returns an HV* that describes an error message
+ * for the translators between UTF8 and code point */
+
+ dTHX;
+ SV* msg_sv = newSVpv(message, 0);
+ SV* category_sv = newSVuv(categories);
+
+ HV* msg_hv = newHV();
+
+ (void) hv_stores(msg_hv, "text", msg_sv);
+ (void) hv_stores(msg_hv, "warn_categories", category_sv);
+
+ return msg_hv;
+}
+
+#endif
+
+#if ! defined(utf8n_to_uvchr_msgs) \
+ && (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))
+
+# undef utf8n_to_uvchr /* Don't use an earlier version: use the version
+ defined in this file */
+# define utf8n_to_uvchr(a,b,c,d) utf8n_to_uvchr_msgs(a, b, c, d, 0, NULL)
+
+# undef UTF8_IS_START /* Early perls wrongly accepted C0 and C1 */
+# define UTF8_IS_START(c) (((U8)(c)) >= 0xc2)
+# ifndef isUTF8_POSSIBLY_PROBLEMATIC
+# ifdef EBCDIC
+# define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c > ' ')
+# else
+# define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+# endif
+# endif
+# ifndef UTF8_ALLOW_OVERFLOW
+# define UTF8_ALLOW_OVERFLOW (1U<<31) /* Choose highest bit to avoid
+ potential conflicts */
+# define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
+# endif
+# undef UTF8_ALLOW_ANY /* Early perl definitions don't work properly with
+ the code in this file */
+# define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
+ |UTF8_ALLOW_NON_CONTINUATION \
+ |UTF8_ALLOW_SHORT \
+ |UTF8_ALLOW_LONG \
+ |UTF8_ALLOW_OVERFLOW)
+
+/* The meanings of these were complemented at some point, but the functions
+ * bundled in this file use the complemented meanings */
+# ifndef UTF8_DISALLOW_SURROGATE
+# define UTF8_DISALLOW_SURROGATE UTF8_ALLOW_SURROGATE
+# define UTF8_DISALLOW_NONCHAR UTF8_ALLOW_FFFF
+# define UTF8_DISALLOW_SUPER UTF8_ALLOW_FE_FF
+
+ /* In the stripped-down implementation in this file, disallowing is not
+ * independent of warning */
+# define UTF8_WARN_SURROGATE UTF8_DISALLOW_SURROGATE
+# define UTF8_WARN_NONCHAR UTF8_DISALLOW_NONCHAR
+# define UTF8_WARN_SUPER UTF8_DISALLOW_SUPER
+# endif
+# ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+# define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \
+ (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_NONCHAR)
+# endif
+# ifndef UTF8_WARN_ILLEGAL_INTERCHANGE
+# define UTF8_WARN_ILLEGAL_INTERCHANGE \
+ (UTF8_WARN_SUPER|UTF8_WARN_SURROGATE|UTF8_WARN_NONCHAR)
+# endif
+# ifndef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
+# ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
+ && ((s1) & 0xFE ) == 0xB6)
+# else
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
+# endif
+# if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
+# ifdef EBCDIC /* Actually is I8 */
+# define HIGHEST_REPRESENTABLE_UTF8 \
+ "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+# else
+# define HIGHEST_REPRESENTABLE_UTF8 \
+ "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+# endif
+# endif
+# endif
+
+# ifndef Newx
+# define Newx(v,n,t) New(0,v,n,t)
+# endif
+
+# ifndef PERL_UNUSED_ARG
+# define PERL_UNUSED_ARG(x) ((void)x)
+# endif
+
+static const char malformed_text[] = "Malformed UTF-8 character";
+
+static char *
+_byte_dump_string(const U8 * const start, const STRLEN len)
+{
+ /* Returns a mortalized C string that is a displayable copy of the 'len' */
+
+ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
+ trailing NUL */
+ const U8 * s = start;
+ const U8 * const e = start + len;
+ char * output;
+ char * d;
+ dTHX;
+
+ Newx(output, output_len, char);
+ SAVEFREEPV(output);
+
+ d = output;
+ for (s = start; s < e; s++) {
+ const unsigned high_nibble = (*s & 0xF0) >> 4;
+ const unsigned low_nibble = (*s & 0x0F);
+
+ *d++ = '\\';
+ *d++ = 'x';
+
+ if (high_nibble < 10) {
+ *d++ = high_nibble + '0';
+ }
+ else {
+ *d++ = high_nibble - 10 + 'a';
+ }
+
+ if (low_nibble < 10) {
+ *d++ = low_nibble + '0';
+ }
+ else {
+ *d++ = low_nibble - 10 + 'a';
+ }
+ }
+
+ *d = '\0';
+ return output;
+}
+
+static char *
+S_unexpected_non_continuation_text(const U8 * const s,
+
+ /* Max number of bytes to print */
+ STRLEN print_len,
+
+ /* Which one is the non-continuation */
+ const STRLEN non_cont_byte_pos,
+
+ /* How many bytes should there be? */
+ const STRLEN expect_len)
+{
+ /* Return the malformation warning text for an unexpected continuation
+ * byte. */
+
+ dTHX;
+ const char * const where = (non_cont_byte_pos == 1)
+ ? "immediately"
+ : Perl_form(aTHX_ "%d bytes",
+ (int) non_cont_byte_pos);
+ const U8 * x = s + non_cont_byte_pos;
+ const U8 * e = s + print_len;
+
+ /* We don't need to pass this parameter, but since it has already been
+ * calculated, it's likely faster to pass it; verify under DEBUGGING */
+ assert(expect_len == UTF8SKIP(s));
+
+ /* As a defensive coding measure, don't output anything past a NUL. Such
+ * bytes shouldn't be in the middle of a malformation, and could mark the
+ * end of the allocated string, and what comes after is undefined */
+ for (; x < e; x++) {
+ if (*x == '\0') {
+ x++; /* Output this particular NUL */
+ break;
+ }
+ }
+
+ return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+ " %s after start byte 0x%02x; need %d bytes, got %d)",
+ malformed_text,
+ _byte_dump_string(s, x - s),
+ *(s + non_cont_byte_pos),
+ where,
+ *s,
+ (int) expect_len,
+ (int) non_cont_byte_pos);
+}
+
+static int
+S_does_utf8_overflow(const U8 * const s,
+ const U8 * e,
+ const bool consider_overlongs)
+{
+ /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+ * 'e' - 1 would overflow an IV on this platform. */
+
+# if ! defined(UV_IS_QUAD)
+
+ const STRLEN len = e - s;
+ int is_overlong;
+
+ assert(s <= e && s + UTF8SKIP(s) >= e);
+ assert(! UTF8_IS_INVARIANT(*s) && e > s);
+
+# ifdef EBCDIC
+
+ PERL_UNUSED_ARG(consider_overlongs);
+
+ if (*s != 0xFE) {
+ return 0;
+ }
+
+ if (len == 1) {
+ return -1;
+ }
+
+# else
+
+ if (LIKELY(*s < 0xFE)) {
+ return 0;
+ }
+
+ if (! consider_overlongs) {
+ return 1;
+ }
+
+ if (len == 1) {
+ return -1;
+ }
+
+ is_overlong = S_is_utf8_overlong_given_start_byte_ok(s, len);
+
+ if (is_overlong == 0) {
+ return 1;
+ }
+
+ if (is_overlong < 0) {
+ return -1;
+ }
+
+ if (*s == 0xFE) {
+ return 0;
+ }
+
+# endif
+
+ /* Here, ASCII and EBCDIC rejoin:
+ * On ASCII: We have an overlong sequence starting with FF
+ * On EBCDIC: We have a sequence starting with FE. */
+
+ { /* For C89, use a block so the declaration can be close to its use */
+
+# ifdef EBCDIC
+ const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
+# else
+ const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
+# endif
+ const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
+ const STRLEN cmp_len = MIN(conts_len, len - 1);
+
+ if (cmp_len >= conts_len || memNE(s + 1,
+ conts_for_highest_30_bit,
+ cmp_len))
+ {
+ return memGT(s + 1, conts_for_highest_30_bit, cmp_len);
+ }
+
+ return -1;
+ }
+
+# else /* Below is 64-bit word */
+
+ PERL_UNUSED_ARG(consider_overlongs);
+
+ {
+ const STRLEN len = e - s;
+ const U8 *x;
+ const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+ for (x = s; x < e; x++, y++) {
+
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+ continue;
+ }
+ return NATIVE_UTF8_TO_I8(*x) > *y;
+ }
+
+ if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+# endif
+
+}
+
+static int
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len);
+
+static int
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ assert(len > 1 && UTF8_IS_START(*s));
+
+# ifdef EBCDIC
+# define F0_ABOVE_OVERLONG 0xB0
+# define F8_ABOVE_OVERLONG 0xA8
+# define FC_ABOVE_OVERLONG 0xA4
+# define FE_ABOVE_OVERLONG 0xA2
+# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+# else
+
+ if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+ return 1;
+ }
+
+# define F0_ABOVE_OVERLONG 0x90
+# define F8_ABOVE_OVERLONG 0x88
+# define FC_ABOVE_OVERLONG 0x84
+# define FE_ABOVE_OVERLONG 0x82
+# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+# endif
+
+ if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+ || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+ || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+ || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+ {
+ return 1;
+ }
+
+ /* Check for the FF overlong */
+ return S_isFF_OVERLONG(s, len);
+}
+
+int
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+{
+ if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
+ MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
+ {
+ return 0;
+ }
+
+ if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
+ return 1;
+ }
+
+ return -1;
+}
+
+# ifndef UTF8_GOT_CONTINUATION
+# define UTF8_GOT_CONTINUATION UTF8_ALLOW_CONTINUATION
+# define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY
+# define UTF8_GOT_LONG UTF8_ALLOW_LONG
+# define UTF8_GOT_NON_CONTINUATION UTF8_ALLOW_NON_CONTINUATION
+# define UTF8_GOT_SHORT UTF8_ALLOW_SHORT
+# define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
+# define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR
+# define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER
+# endif
+
+# ifndef UNICODE_IS_SUPER
+# define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
+# endif
+# ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
+# define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
+ && (UV) (uv) <= 0xFDEF)
+# endif
+# ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
+# define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
+ (((UV) (uv) & 0xFFFE) == 0xFFFE)
+# endif
+# ifndef is_NONCHAR_utf8_safe
+# define is_NONCHAR_utf8_safe(s,e) /*** GENERATED CODE ***/ \
+( ( ( LIKELY((e) > (s)) ) && ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ) ? ( ( 0xEF == ((const U8*)s)[0] ) ?\
+ ( ( 0xB7 == ((const U8*)s)[1] ) ? \
+ ( ( 0x90 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xAF ) ? 3 : 0 )\
+ : ( ( 0xBF == ((const U8*)s)[1] ) && ( ( ((const U8*)s)[2] & 0xFE ) == 0xBE ) ) ? 3 : 0 )\
+ : ( 0xF0 == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
+ : ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \
+ ( ( ( ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
+ : ( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( 0x8F == ((const U8*)s)[1] ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 ) : 0 )
+# endif
+
+# ifndef UTF8_IS_NONCHAR
+# define UTF8_IS_NONCHAR(s, e) (is_NONCHAR_utf8_safe(s,e) > 0)
+# endif
+# ifndef UNICODE_IS_NONCHAR
+# define UNICODE_IS_NONCHAR(uv) \
+ ( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
+ || ( LIKELY( ! UNICODE_IS_SUPER(uv)) \
+ && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
+# endif
+
+# ifndef UTF8_MAXBYTES
+# define UTF8_MAXBYTES UTF8_MAXLEN
+# endif
+
+static UV
+utf8n_to_uvchr_msgs(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags,
+ U32 * errors,
+ AV ** msgs)
+{
+ const U8 * const s0 = s;
+ const U8 * send = NULL;
+ U32 possible_problems = 0;
+ UV uv = *s;
+ STRLEN expectlen = 0;
+ U8 * adjusted_s0 = (U8 *) s0;
+ U8 temp_char_buf[UTF8_MAXBYTES + 1];
+ UV uv_so_far = 0;
+ dTHX;
+
+ assert(errors == NULL); /* This functionality has been stripped */
+
+ if (UNLIKELY(curlen == 0)) {
+ possible_problems |= UTF8_GOT_EMPTY;
+ curlen = 0;
+ uv = UNICODE_REPLACEMENT;
+ goto ready_to_handle_errors;
+ }
+
+ expectlen = UTF8SKIP(s);
+
+ if (retlen) {
+ *retlen = expectlen;
+ }
+
+ if (UTF8_IS_INVARIANT(uv)) {
+ return uv;
+ }
+
+ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
+ possible_problems |= UTF8_GOT_CONTINUATION;
+ curlen = 1;
+ uv = UNICODE_REPLACEMENT;
+ goto ready_to_handle_errors;
+ }
+
+ uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+ send = (U8*) s0;
+ if (UNLIKELY(curlen < expectlen)) {
+ possible_problems |= UTF8_GOT_SHORT;
+ send += curlen;
+ }
+ else {
+ send += expectlen;
+ }
+
+ for (s = s0 + 1; s < send; s++) {
+ if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+ uv = UTF8_ACCUMULATE(uv, *s);
+ continue;
+ }
+
+ possible_problems |= UTF8_GOT_NON_CONTINUATION;
+ break;
+ } /* End of loop through the character's bytes */
+
+ curlen = s - s0;
+
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ if (UNLIKELY(0 < S_does_utf8_overflow(s0, s, 1))) {
+ possible_problems |= UTF8_GOT_OVERFLOW;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ if ( ( LIKELY(! possible_problems)
+ && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+ || ( UNLIKELY(possible_problems)
+ && ( UNLIKELY(! UTF8_IS_START(*s0))
+ || ( curlen > 1
+ && UNLIKELY(0 < S_is_utf8_overlong_given_start_byte_ok(s0,
+ s - s0))))))
+ {
+ possible_problems |= UTF8_GOT_LONG;
+
+ if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
+ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
+ {
+ UV min_uv = uv_so_far;
+ STRLEN i;
+
+ for (i = curlen; i < expectlen; i++) {
+ min_uv = UTF8_ACCUMULATE(min_uv,
+ I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+ }
+
+ adjusted_s0 = temp_char_buf;
+ (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+ }
+ }
+
+ /* Here, we have found all the possible problems, except for when the input
+ * is for a problematic code point not allowed by the input parameters. */
+
+ /* uv is valid for overlongs */
+ if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
+ && uv >= UNICODE_SURROGATE_FIRST)
+ || ( UNLIKELY(possible_problems)
+ && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+ && ((flags & ( UTF8_DISALLOW_NONCHAR
+ |UTF8_DISALLOW_SURROGATE
+ |UTF8_DISALLOW_SUPER))))
+ {
+ if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+ possible_problems |= UTF8_GOT_NONCHAR;
+ }
+ }
+ else {
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+ >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (curlen > 1) {
+ if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ }
+ }
+ }
+
+ ready_to_handle_errors:
+
+ if (UNLIKELY(possible_problems)) {
+ bool disallowed = FALSE;
+ const U32 orig_problems = possible_problems;
+
+ if (msgs) {
+ *msgs = NULL;
+ }
+
+ while (possible_problems) { /* Handle each possible problem */
+ UV pack_warn = 0;
+ char * message = NULL;
+ U32 this_flag_bit = 0;
+
+ /* Each 'if' clause handles one problem. They are ordered so that
+ * the first ones' messages will be displayed before the later
+ * ones; this is kinda in decreasing severity order. But the
+ * overlong must come last, as it changes 'uv' looked at by the
+ * others */
+ if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+ /* Overflow means also got a super; we handle both here */
+ possible_problems
+ &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER);
+
+ /* Disallow if any of the categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & UTF8_DISALLOW_SUPER))
+ {
+ disallowed = TRUE;
+ }
+
+ /* Likewise, warn if any say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)) {
+
+ /* The warnings code explicitly says it doesn't handle the
+ * case of packWARN2 and two categories which have
+ * parent-child relationship. Even if it works now to
+ * raise the warning if either is enabled, it wouldn't
+ * necessarily do so in the future. We output (only) the
+ * most dire warning */
+ if (! (flags & UTF8_CHECK_ONLY)) {
+ if (msgs || ckWARN_d(WARN_UTF8)) {
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+ }
+ if (pack_warn) {
+ message = Perl_form(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, curlen));
+ this_flag_bit = UTF8_GOT_OVERFLOW;
+ }
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_EMPTY) {
+ possible_problems &= ~UTF8_GOT_EMPTY;
+
+ if (! (flags & UTF8_ALLOW_EMPTY)) {
+ disallowed = TRUE;
+ if ( (msgs
+ || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+ {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s (empty string)",
+ malformed_text);
+ this_flag_bit = UTF8_GOT_EMPTY;
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+ disallowed = TRUE;
+ if (( msgs
+ || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+ {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0);
+ this_flag_bit = UTF8_GOT_CONTINUATION;
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SHORT) {
+ possible_problems &= ~UTF8_GOT_SHORT;
+
+ if (! (flags & UTF8_ALLOW_SHORT)) {
+ disallowed = TRUE;
+ if (( msgs
+ || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+ {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (too short; %d byte%s available, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen);
+ this_flag_bit = UTF8_GOT_SHORT;
+ }
+ }
+
+ }
+ else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+ disallowed = TRUE;
+ if (( msgs
+ || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+ {
+ int printlen = s - s0;
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s",
+ S_unexpected_non_continuation_text(s0,
+ printlen,
+ s - s0,
+ (int) expectlen));
+ this_flag_bit = UTF8_GOT_NON_CONTINUATION;
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SURROGATE) {
+ possible_problems &= ~UTF8_GOT_SURROGATE;
+
+ if (flags & UTF8_WARN_SURROGATE) {
+
+ if ( ! (flags & UTF8_CHECK_ONLY)
+ && (msgs || ckWARN_d(WARN_SURROGATE)))
+ {
+ pack_warn = packWARN(WARN_SURROGATE);
+
+ /* These are the only errors that can occur with a
+ * surrogate when the 'uv' isn't valid */
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "UTF-16 surrogate (any UTF-8 sequence that"
+ " starts with \"%s\" is for a surrogate)",
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ message = Perl_form(aTHX_ surrogate_cp_format, uv);
+ }
+ this_flag_bit = UTF8_GOT_SURROGATE;
+ }
+ }
+
+ if (flags & UTF8_DISALLOW_SURROGATE) {
+ disallowed = TRUE;
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SUPER) {
+ possible_problems &= ~UTF8_GOT_SUPER;
+
+ if (flags & UTF8_WARN_SUPER) {
+
+ if ( ! (flags & UTF8_CHECK_ONLY)
+ && (msgs || ckWARN_d(WARN_NON_UNICODE)))
+ {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "Any UTF-8 sequence that starts with"
+ " \"%s\" is for a non-Unicode code point,"
+ " may not be portable",
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ message = Perl_form(aTHX_ super_cp_format, uv);
+ }
+ this_flag_bit = UTF8_GOT_SUPER;
+ }
+ }
+
+ if (flags & UTF8_DISALLOW_SUPER) {
+ disallowed = TRUE;
+ }
+ }
+ else if (possible_problems & UTF8_GOT_NONCHAR) {
+ possible_problems &= ~UTF8_GOT_NONCHAR;
+
+ if (flags & UTF8_WARN_NONCHAR) {
+
+ if ( ! (flags & UTF8_CHECK_ONLY)
+ && (msgs || ckWARN_d(WARN_NONCHAR)))
+ {
+ /* The code above should have guaranteed that we don't
+ * get here with errors other than overlong */
+ assert (! (orig_problems
+ & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+ pack_warn = packWARN(WARN_NONCHAR);
+ message = Perl_form(aTHX_ nonchar_cp_format, uv);
+ this_flag_bit = UTF8_GOT_NONCHAR;
+ }
+ }
+
+ if (flags & UTF8_DISALLOW_NONCHAR) {
+ disallowed = TRUE;
+ }
+ }
+ else if (possible_problems & UTF8_GOT_LONG) {
+ possible_problems &= ~UTF8_GOT_LONG;
+
+ if (flags & UTF8_ALLOW_LONG) {
+ uv = UNICODE_REPLACEMENT;
+ }
+ else {
+ disallowed = TRUE;
+
+ if (( msgs
+ || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+ {
+ pack_warn = packWARN(WARN_UTF8);
+
+ /* These error types cause 'uv' to be something that
+ * isn't what was intended, so can't use it in the
+ * message. The other error types either can't
+ * generate an overlong, or else the 'uv' is valid */
+ if (orig_problems &
+ (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+ {
+ message = Perl_form(aTHX_
+ "%s: %s (any UTF-8 sequence that starts"
+ " with \"%s\" is overlong which can and"
+ " should be represented with a"
+ " different, shorter sequence)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+ uv, 0);
+ /* Don't use U+ for non-Unicode code points, which
+ * includes those in the Latin1 range */
+ const char * preface = ( uv > PERL_UNICODE_MAX
+# ifdef EBCDIC
+ || uv <= 0xFF
+# endif
+ )
+ ? "0x"
+ : "U+";
+ message = Perl_form(aTHX_
+ "%s: %s (overlong; instead use %s to represent"
+ " %s%0*" UVXf ")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ preface,
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for
+ small code points */
+ UNI_TO_NATIVE(uv));
+ }
+ this_flag_bit = UTF8_GOT_LONG;
+ }
+ }
+ } /* End of looking through the possible flags */
+
+ /* Display the message (if any) for the problem being handled in
+ * this iteration of the loop */
+ if (message) {
+ if (msgs) {
+ assert(this_flag_bit);
+
+ if (*msgs == NULL) {
+ *msgs = newAV();
+ }
+
+ av_push(*msgs, newRV_noinc((SV*) S_new_msg_hv(message,
+ pack_warn)));
+ }
+ else if (PL_op)
+ Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+ OP_DESC(PL_op));
+ else
+ Perl_warner(aTHX_ pack_warn, "%s", message);
+ }
+ } /* End of 'while (possible_problems)' */
+
+ if (retlen) {
+ *retlen = curlen;
+ }
+
+ if (disallowed) {
+ if (flags & UTF8_CHECK_ONLY && retlen) {
+ *retlen = ((STRLEN) -1);
+ }
+ return 0;
+ }
+ }
+
+ return UNI_TO_NATIVE(uv);
+}
+
+static STRLEN
+S_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+{
+ STRLEN len;
+ const U8 *x;
+
+ assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
+ assert(! UTF8_IS_INVARIANT(*s));
+
+ if (UNLIKELY(! UTF8_IS_START(*s))) {
+ return 0;
+ }
+
+ /* Examine a maximum of a single whole code point */
+ if (e - s > UTF8SKIP(s)) {
+ e = s + UTF8SKIP(s);
+ }
+
+ len = e - s;
+
+ if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ {
+ return 0; /* Above Unicode */
+ }
+
+ if (len > 1) {
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
+ {
+ return 0; /* Above Unicode */
+ }
+
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
+ {
+ return 0; /* Surrogate */
+ }
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+ {
+ return 0; /* Noncharacter code point */
+ }
+ }
+ }
+
+ for (x = s + 1; x < e; x++) {
+ if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ return 0;
+ }
+ }
+
+ if (len > 1 && S_is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
+ return 0;
+ }
+
+ if (0 < S_does_utf8_overflow(s, e, 0)) {
+ return 0;
+ }
+
+ return UTF8SKIP(s);
+}
+
+# undef is_utf8_valid_partial_char_flags
+
+static bool
+is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+{
+
+ return S_is_utf8_char_helper(s, e, flags) > 0;
+}
+
+# undef is_utf8_string_loc_flags
+
+static bool
+is_utf8_string_loc_flags(const U8 *s, STRLEN len, const U8 **ep, const U32 flags)
+{
+ const U8* send = s + len;
+
+ assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
+
+ while (s < send) {
+ if (UTF8_IS_INVARIANT(*s)) {
+ s++;
+ }
+ else if ( UNLIKELY(send - s < UTF8SKIP(s))
+ || ! S_is_utf8_char_helper(s, send, flags))
+ {
+ *ep = s;
+ return 0;
+ }
+ else {
+ s += UTF8SKIP(s);
+ }
+ }
+
+ *ep = send;
+
+ return 1;
+}
+
+#endif
+
+#if defined(IN_UNICODE_XS) && ! defined(uvchr_to_utf8_flags_msgs)
+
+# define MY_SHIFT UTF_ACCUMULATION_SHIFT
+# define MY_MARK UTF_CONTINUATION_MARK
+# define MY_MASK UTF_CONTINUATION_MASK
+
+static const char cp_above_legal_max[] =
+ "Use of code point 0x%" UVXf " is not allowed; the"
+ " permissible max is 0x%" UVXf;
+
+/* These two can be dummys, as they are not looked at by the function, which
+ * has hard-coded into it what flags it is expecting are */
+# ifndef UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
+# define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 0
+# endif
+# ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
+# define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
+# endif
+
+# ifndef OFFUNI_IS_INVARIANT
+# define OFFUNI_IS_INVARIANT(cp) UNI_IS_INVARIANT(cp)
+# endif
+# ifndef MAX_EXTERNALLY_LEGAL_CP
+# define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
+# endif
+# ifndef LATIN1_TO_NATIVE
+# define LATIN1_TO_NATIVE(a) ASCII_TO_NATIVE(a)
+# endif
+# ifndef I8_TO_NATIVE_UTF8
+# define I8_TO_NATIVE_UTF8(a) NATIVE_TO_UTF(a)
+# endif
+# ifndef MAX_UTF8_TWO_BYTE
+# define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
+# endif
+# ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
+# define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
+ && (UV) (uv) <= 0xFDEF)
+# endif
+# ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
+# define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
+ (((UV) (uv) & 0xFFFE) == 0xFFFE)
+# endif
+# ifndef UNICODE_IS_SUPER
+# define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
+# endif
+# ifndef OFFUNISKIP
+# define OFFUNISKIP(cp) UNISKIP(NATIVE_TO_UNI(cp))
+# endif
+
+# define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
+ STMT_START { \
+ U32 category = packWARN(WARN_SURROGATE); \
+ const char * format = surrogate_cp_format; \
+ *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), \
+ category); \
+ return NULL; \
+ } STMT_END;
+
+# define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
+ STMT_START { \
+ U32 category = packWARN(WARN_NONCHAR); \
+ const char * format = nonchar_cp_format; \
+ *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), \
+ category); \
+ return NULL; \
+ } STMT_END;
+
+static U8 *
+uvchr_to_utf8_flags_msgs(U8 *d, UV uv, const UV flags, HV** msgs)
+{
+ dTHX;
+
+ assert(msgs);
+
+ PERL_UNUSED_ARG(flags);
+
+ uv = NATIVE_TO_UNI(uv);
+
+ *msgs = NULL;
+
+ if (OFFUNI_IS_INVARIANT(uv)) {
+ *d++ = LATIN1_TO_NATIVE(uv);
+ return d;
+ }
+
+ if (uv <= MAX_UTF8_TWO_BYTE) {
+ *d++ = I8_TO_NATIVE_UTF8(( uv >> MY_SHIFT) | UTF_START_MARK(2));
+ *d++ = I8_TO_NATIVE_UTF8(( uv & MY_MASK) | MY_MARK);
+ return d;
+ }
+
+ /* Not 2-byte; test for and handle 3-byte result. In the test immediately
+ * below, the 16 is for start bytes E0-EF (which are all the possible ones
+ * for 3 byte characters). The 2 is for 2 continuation bytes; these each
+ * contribute MY_SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
+ * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
+ * 0x800-0xFFFF on ASCII */
+ if (uv < (16 * (1U << (2 * MY_SHIFT)))) {
+ *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * MY_SHIFT)) | UTF_START_MARK(3));
+ *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+ *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MY_MASK) | MY_MARK);
+
+#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
+ aren't tested here */
+ /* The most likely code points in this range are below the surrogates.
+ * Do an extra test to quickly exclude those. */
+ if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
+ if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
+ || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
+ {
+ HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+ }
+ else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+ }
+ }
+#endif
+ return d;
+ }
+
+ /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
+ * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
+ * happen starting with 4-byte characters on ASCII platforms. We unify the
+ * code for these with EBCDIC, even though some of them require 5-bytes on
+ * those, because khw believes the code saving is worth the very slight
+ * performance hit on these high EBCDIC code points. */
+
+ if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
+ const char * format = super_cp_format;
+ U32 category = packWARN(WARN_NON_UNICODE);
+ if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
+ Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+ }
+ *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), category);
+ return NULL;
+ }
+ else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
+ HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+ }
+
+ /* Test for and handle 4-byte result. In the test immediately below, the
+ * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
+ * characters). The 3 is for 3 continuation bytes; these each contribute
+ * MY_SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
+ * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
+ * 0x1_0000-0x1F_FFFF on ASCII */
+ if (uv < (8 * (1U << (3 * MY_SHIFT)))) {
+ *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * MY_SHIFT)) | UTF_START_MARK(4));
+ *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+ *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
+ *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MY_MASK) | MY_MARK);
+
+#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
+ characters. The end-plane non-characters for EBCDIC were
+ handled just above */
+ if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
+ HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+ }
+ else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+ }
+#endif
+
+ return d;
+ }
+
+ /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
+ * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
+ * format. The unrolled version above turns out to not save all that much
+ * time, and at these high code points (well above the legal Unicode range
+ * on ASCII platforms, and well above anything in common use in EBCDIC),
+ * khw believes that less code outweighs slight performance gains. */
+
+ {
+ STRLEN len = OFFUNISKIP(uv);
+ U8 *p = d+len-1;
+ while (p > d) {
+ *p-- = I8_TO_NATIVE_UTF8((uv & MY_MASK) | MY_MARK);
+ uv >>= MY_SHIFT;
+ }
+ *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
+ return d+len;
+ }
+}
+
+#endif /* End of defining our own uvchr_to_utf8_flags_msgs() */
+#endif /* End of UTF8SKIP */
+
#endif /* ENCODE_H */
/*
- $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
+ $Id: Unicode.xs,v 2.19 2019/01/21 03:09:59 dankogai Exp $
*/
+#define IN_UNICODE_XS
+
#define PERL_NO_GET_CONTEXT
#include "EXTERN.h"
#include "perl.h"
#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
-/* For pre-5.14 source compatibility */
-#ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
-# define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
-# define UTF8_DISALLOW_SURROGATE 0
-# define UTF8_WARN_SURROGATE 0
-# define UTF8_DISALLOW_FE_FF 0
-# define UTF8_WARN_FE_FF 0
-# define UTF8_WARN_NONCHAR 0
+#ifndef SVfARG
+#define SVfARG(p) ((void*)(p))
#endif
#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
case 'N':
v = *s++;
v = (v << 8) | *s++;
+ /* FALLTHROUGH */
case 'n':
v = (v << 8) | *s++;
v = (v << 8) | *s++;
PROTOTYPES: DISABLE
-#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
- *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
+#define attr(k) (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
+ *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
void
decode(obj, str, check = 0)
IV check
CODE:
{
- SV *sve = attr("endian", 6);
+ SV *name = attr("Name");
+ SV *sve = attr("endian");
U8 endian = *((U8 *)SvPV_nolen(sve));
- SV *svs = attr("size", 4);
+ SV *svs = attr("size");
int size = SvIV(svs);
int ucs2 = -1; /* only needed in the event of surrogate pairs */
SV *result = newSVpvn("",0);
}
#if 1
/* Update endian for next sequence */
- sv = attr("renewed", 7);
+ sv = attr("renewed");
if (SvTRUE(sv)) {
(void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
}
while (s < e && s+size <= e) {
UV ord = enc_unpack(aTHX_ &s,e,size,endian);
U8 *d;
+ HV *hv = NULL;
if (issurrogate(ord)) {
if (ucs2 == -1) {
- SV *sv = attr("ucs2", 4);
+ SV *sv = attr("ucs2");
ucs2 = SvTRUE(sv);
}
if (ucs2 || size == 4) {
- if (check) {
+ if (check & ENCODE_DIE_ON_ERR) {
croak("%" SVf ":no surrogates allowed %" UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
+ SVfARG(name), ord);
+ }
+ if (encode_ckWARN(check, WARN_SURROGATE)) {
+ warner(packWARN(WARN_SURROGATE),
+ "%" SVf ":no surrogates allowed %" UVxf,
+ SVfARG(name), ord);
}
ord = FBCHAR;
}
else {
UV lo;
if (!isHiSurrogate(ord)) {
- if (check) {
+ if (check & ENCODE_DIE_ON_ERR) {
croak("%" SVf ":Malformed HI surrogate %" UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
+ SVfARG(name), ord);
}
- else {
- ord = FBCHAR;
+ if (encode_ckWARN(check, WARN_SURROGATE)) {
+ warner(packWARN(WARN_SURROGATE),
+ "%" SVf ":Malformed HI surrogate %" UVxf,
+ SVfARG(name), ord);
}
+ ord = FBCHAR;
}
else if (s+size > e) {
- if (check) {
- if (check & ENCODE_STOP_AT_PARTIAL) {
- s -= size;
- break;
- }
- else {
- croak("%" SVf ":Malformed HI surrogate %" UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
- }
+ if (check & ENCODE_STOP_AT_PARTIAL) {
+ s -= size;
+ break;
}
- else {
- ord = FBCHAR;
+ if (check & ENCODE_DIE_ON_ERR) {
+ croak("%" SVf ":Malformed HI surrogate %" UVxf,
+ SVfARG(name), ord);
+ }
+ if (encode_ckWARN(check, WARN_SURROGATE)) {
+ warner(packWARN(WARN_SURROGATE),
+ "%" SVf ":Malformed HI surrogate %" UVxf,
+ SVfARG(name), ord);
}
+ ord = FBCHAR;
}
else {
lo = enc_unpack(aTHX_ &s,e,size,endian);
if (!isLoSurrogate(lo)) {
- if (check) {
+ if (check & ENCODE_DIE_ON_ERR) {
croak("%" SVf ":Malformed LO surrogate %" UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
+ SVfARG(name), ord);
}
- else {
- s -= size;
- ord = FBCHAR;
+ if (encode_ckWARN(check, WARN_SURROGATE)) {
+ warner(packWARN(WARN_SURROGATE),
+ "%" SVf ":Malformed LO surrogate %" UVxf,
+ SVfARG(name), ord);
}
+ s -= size;
+ ord = FBCHAR;
}
else {
ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
}
if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
- if (check) {
+ if (check & ENCODE_DIE_ON_ERR) {
croak("%" SVf ":Unicode character %" UVxf " is illegal",
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
- } else {
- ord = FBCHAR;
+ SVfARG(name), ord);
}
+ if (encode_ckWARN(check, WARN_NONCHAR)) {
+ warner(packWARN(WARN_NONCHAR),
+ "%" SVf ":Unicode character %" UVxf " is illegal",
+ SVfARG(name), ord);
+ }
+ ord = FBCHAR;
}
if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
resultbuflen = SvLEN(result);
}
- d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
- UNICODE_WARN_ILLEGAL_INTERCHANGE);
+ d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
+ if (hv) {
+ SV *message = *hv_fetch(hv, "text", 4, 0);
+ U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
+ sv_2mortal((SV *)hv);
+ if (check & ENCODE_DIE_ON_ERR)
+ croak("%" SVf, SVfARG(message));
+ if (encode_ckWARN_packed(check, categories))
+ warner(categories, "%" SVf, SVfARG(message));
+ d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
+ }
+
SvCUR_set(result, d - (U8 *)SvPVX(result));
}
if (s < e) {
/* unlikely to happen because it's fixed-length -- dankogai */
- if (check & ENCODE_WARN_ON_ERR) {
- Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character",
- *hv_fetch((HV *)SvRV(obj),"Name",4,0));
+ if (check & ENCODE_DIE_ON_ERR)
+ croak("%" SVf ":Partial character", SVfARG(name));
+ if (encode_ckWARN(check, WARN_UTF8)) {
+ warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
}
}
if (check && !(check & ENCODE_LEAVE_SRC)) {
IV check
CODE:
{
- SV *sve = attr("endian", 6);
+ SV *name = attr("Name");
+ SV *sve = attr("endian");
U8 endian = *((U8 *)SvPV_nolen(sve));
- SV *svs = attr("size", 4);
+ SV *svs = attr("size");
const int size = SvIV(svs);
int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
const STRLEN usize = (size > 0 ? size : 1);
enc_pack(aTHX_ result,size,endian,BOM_BE);
#if 1
/* Update endian for next sequence */
- sv = attr("renewed", 7);
+ sv = attr("renewed");
if (SvTRUE(sv)) {
(void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
}
#endif
}
while (s < e && s+UTF8SKIP(s) <= e) {
- STRLEN len;
- UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
- |UTF8_WARN_SURROGATE
- |UTF8_DISALLOW_FE_FF
- |UTF8_WARN_FE_FF
- |UTF8_WARN_NONCHAR));
- s += len;
- if (size != 4 && invalid_ucs2(ord)) {
+ STRLEN len;
+ AV *msgs = NULL;
+ UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
+ if (msgs) {
+ SSize_t i;
+ SSize_t len = av_len(msgs)+1;
+ sv_2mortal((SV *)msgs);
+ for (i = 0; i < len; ++i) {
+ SV *sv = *av_fetch(msgs, i, 0);
+ HV *hv = (HV *)SvRV(sv);
+ SV *message = *hv_fetch(hv, "text", 4, 0);
+ U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
+ if (check & ENCODE_DIE_ON_ERR)
+ croak("%" SVf, SVfARG(message));
+ if (encode_ckWARN_packed(check, categories))
+ warner(categories, "%" SVf, SVfARG(message));
+ }
+ }
+ if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
if (!issurrogate(ord)) {
if (ucs2 == -1) {
- SV *sv = attr("ucs2", 4);
+ SV *sv = attr("ucs2");
ucs2 = SvTRUE(sv);
}
if (ucs2 || ord > 0x10FFFF) {
- if (check) {
+ if (check & ENCODE_DIE_ON_ERR) {
croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
+ SVfARG(name),ord);
+ }
+ if (encode_ckWARN(check, WARN_NON_UNICODE)) {
+ warner(packWARN(WARN_NON_UNICODE),
+ "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
+ SVfARG(name),ord);
}
enc_pack(aTHX_ result,size,endian,FBCHAR);
+ } else if (ord == 0) {
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
} else {
UV hi = ((ord - 0x10000) >> 10) + 0xD800;
UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
else {
enc_pack(aTHX_ result,size,endian,ord);
}
+ s += len;
}
if (s < e) {
/* UTF-8 partial char happens often on PerlIO.
if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
"when CHECK = 0x%" UVuf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
+ SVfARG(name), check);
}
}
if (check && !(check & ENCODE_LEAVE_SRC)) {