From 4b88fb76efce8c436e63b907c9842345d4fa77c7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 19 Mar 2012 15:38:06 -0600 Subject: [PATCH] Use the new utf8 to code point functions These functions should be used in preference to the old ones which can read beyond the end of the input string. --- cygwin/cygwin.c | 6 +++--- dist/Data-Dumper/Dumper.pm | 4 ++-- dist/Data-Dumper/Dumper.xs | 12 ++++++------ dump.c | 4 ++-- ext/XS-APItest/APItest.pm | 2 +- ext/XS-APItest/APItest.xs | 2 +- handy.h | 27 ++++++++++++++------------- pod/perlguts.pod | 7 ++++--- pod/perlunicode.pod | 3 ++- pp.c | 6 +++--- regcomp.c | 11 +++++++---- symbian/PerlBase.cpp | 6 ++++-- t/lib/warnings/utf8 | 4 ++-- toke.c | 2 +- utf8.c | 46 +++++++++++++++++++++++++--------------------- 15 files changed, 77 insertions(+), 65 deletions(-) diff --git a/cygwin/cygwin.c b/cygwin/cygwin.c index 9419e83..29ee22e 100644 --- a/cygwin/cygwin.c +++ b/cygwin/cygwin.c @@ -176,7 +176,7 @@ utf8_to_wide(const char *buf) setlocale(LC_CTYPE, "utf-8"); wbuf = (wchar_t *) safemalloc(wlen); - /* utf8_to_uvuni(pathname, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ + /* utf8_to_uvuni_buf(pathname, pathname + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ wlen = mbsrtowcs(wbuf, (const char**)&buf, wlen, &mbs); if (oldlocale) setlocale(LC_CTYPE, oldlocale); @@ -283,7 +283,7 @@ XS(XS_Cygwin_win_to_posix_path) mbstate_t mbs; char *oldlocale = setlocale(LC_CTYPE, NULL); setlocale(LC_CTYPE, "utf-8"); - /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ + /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs); if (wlen > 0) err = cygwin_conv_path(what, wpath, wbuf, wlen); @@ -370,7 +370,7 @@ XS(XS_Cygwin_posix_to_win_path) setlocale(LC_CTYPE, "utf-8"); if (!IN_BYTES) { mbstate_t mbs; - /* utf8_to_uvuni(src_path, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ + /* utf8_to_uvuni_buf(src_path, src_path + wlen, wpath) or Encoding::_utf8_to_bytes(sv, "UCS-2BE"); */ wlen = mbsrtowcs(wpath, (const char**)&src_path, wlen, &mbs); if (wlen > 0) err = cygwin_conv_path(what, wpath, wbuf, wlen); diff --git a/dist/Data-Dumper/Dumper.pm b/dist/Data-Dumper/Dumper.pm index 5cff100..a099277 100644 --- a/dist/Data-Dumper/Dumper.pm +++ b/dist/Data-Dumper/Dumper.pm @@ -10,7 +10,7 @@ package Data::Dumper; BEGIN { - $VERSION = '2.135_05'; # Don't forget to set version and release + $VERSION = '2.135_06'; # Don't forget to set version and release } # date in POD! #$| = 1; @@ -1332,7 +1332,7 @@ modify it under the same terms as Perl itself. =head1 VERSION -Version 2.135_05 (February 18 2012) +Version 2.135_06 (March 20 2012) =head1 SEE ALSO diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index 4b7af7c..91e4c6c 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -37,17 +37,17 @@ static I32 DD_dump (pTHX_ SV *val, const char *name, STRLEN namelen, SV *retval, # endif UV -Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) +Perl_utf8_to_uvchr_buf(pTHX_ U8 *s, U8 *send, STRLEN *retlen) { - const UV uv = utf8_to_uv(s, UTF8_MAXLEN, retlen, + const UV uv = utf8_to_uv(s, send - s, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); return UNI_TO_NATIVE(uv); } # if !defined(PERL_IMPLICIT_CONTEXT) -# define utf8_to_uvchr Perl_utf8_to_uvchr +# define utf8_to_uvchr_buf Perl_utf8_to_uvchr_buf # else -# define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b) +# define utf8_to_uvchr_buf(a,b) Perl_utf8_to_uvchr_buf(aTHX_ a,b) # endif #endif /* PERL_VERSION <= 6 */ @@ -147,7 +147,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen) /* this will need EBCDICification */ for (s = src; s < send; s += increment) { - const UV k = utf8_to_uvchr((U8*)s, NULL); + const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL); /* check for invalid utf8 */ increment = (k == 0 && *s != '\0') ? 1 : UTF8SKIP(s); @@ -184,7 +184,7 @@ esc_q_utf8(pTHX_ SV* sv, register const char *src, register STRLEN slen) *r++ = '"'; for (s = src; s < send; s += UTF8SKIP(s)) { - const UV k = utf8_to_uvchr((U8*)s, NULL); + const UV k = utf8_to_uvchr_buf((U8*)s, (U8*) send, NULL); if (k == '"' || k == '\\' || k == '$' || k == '@') { *r++ = '\\'; diff --git a/dump.c b/dump.c index 2c635de..b238ee0 100644 --- a/dump.c +++ b/dump.c @@ -281,7 +281,7 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str, isuni = 1; for ( ; (pv < end && (!max || (wrote < max))) ; pv += readsize ) { - const UV u= (isuni) ? utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv; + const UV u= (isuni) ? utf8_to_uvchr_buf((U8*)pv, (U8*) end, &readsize) : (U8)*pv; const U8 c = (U8)u & 0xFF; if ( ( u > 255 ) @@ -2420,7 +2420,7 @@ Perl_sv_catxmlpvn(pTHX_ SV *dsv, const char *pv, STRLEN len, int utf8) retry: while (pv < e) { if (utf8) { - c = utf8_to_uvchr((U8*)pv, &cl); + c = utf8_to_uvchr_buf((U8*)pv, (U8*)e, &cl); if (cl == 0) { SvCUR(dsv) = dsvcur; pv = start; diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm index 7e7e8de..78d77f1 100644 --- a/ext/XS-APItest/APItest.pm +++ b/ext/XS-APItest/APItest.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.36'; +our $VERSION = '0.37'; require XSLoader; diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 6e8689c..5105960 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -148,7 +148,7 @@ bitflip_key(pTHX_ IV action, SV *field) { const char *const end = p + len; while (p < end) { STRLEN len; - UV chr = utf8_to_uvuni((U8 *)p, &len); + UV chr = utf8_to_uvuni_buf((U8 *)p, (U8 *) end, &len); new_p = (char *)uvuni_to_utf8((U8 *)new_p, chr ^ 32); p += len; } diff --git a/handy.h b/handy.h index c437447..c90a876 100644 --- a/handy.h +++ b/handy.h @@ -949,7 +949,8 @@ EXTCONST U32 PL_charclass[]; *((p)+1))) \ : function(p)) -/* Note that all ignore 'use bytes' */ +/* Note that all assume that the utf8 has been validated, and ignore 'use + * bytes' */ #define isALNUM_utf8(p) generic_utf8(isWORDCHAR, is_utf8_alnum, p) /* To prevent S_scan_word in toke.c from hanging, we have to make sure that @@ -992,18 +993,18 @@ EXTCONST U32 PL_charclass[]; : isSPACE_utf8(p))) #define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ -#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(utf8_to_uvchr(p, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(utf8_to_uvchr(p, 0)) +#define isALNUM_LC_utf8(p) isALNUM_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isALPHA_LC_utf8(p) isALPHA_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isSPACE_LC_utf8(p) isSPACE_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isDIGIT_LC_utf8(p) isDIGIT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isUPPER_LC_utf8(p) isUPPER_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isLOWER_LC_utf8(p) isLOWER_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isPRINT_LC_utf8(p) isPRINT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uvchr(valid_utf8_to_uvchr(p, 0)) #define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') #define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ diff --git a/pod/perlguts.pod b/pod/perlguts.pod index ee938ea..908fa1f 100644 --- a/pod/perlguts.pod +++ b/pod/perlguts.pod @@ -2670,17 +2670,18 @@ character like this (the UTF8_IS_INVARIANT() is a macro that tests whether the byte can be encoded as a single byte even in UTF-8): U8 *utf; + U8 *utf_end; /* 1 beyond buffer pointed to by utf */ UV uv; /* Note: a UV, not a U8, not a char */ STRLEN len; /* length of character in bytes */ if (!UTF8_IS_INVARIANT(*utf)) /* Must treat this as UTF-8 */ - uv = utf8_to_uvchr(utf, &len); + uv = utf8_to_uvchr_buf(utf, utf_end, &len); else /* OK to treat this character as a byte */ uv = *utf; -You can also see in that example that we use C to get the +You can also see in that example that we use C to get the value of the character; the inverse function C is available for putting a UV into UTF-8: @@ -2792,7 +2793,7 @@ it's not - if you pass on the PV to somewhere, pass on the flag too. =item * -If a string is UTF-8, B use C to get at the value, +If a string is UTF-8, B use C to get at the value, unless C in which case you can use C<*s>. =item * diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index b96efbf..74c1666 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -1514,7 +1514,8 @@ pointing after the UTF-8 bytes. It works appropriately on EBCDIC machines. =item * -C reads UTF-8 encoded bytes from a buffer and +C reads UTF-8 encoded bytes from a +buffer and returns the Unicode character code point and, optionally, the length of the UTF-8 byte sequence. It works appropriately on EBCDIC machines. diff --git a/pp.c b/pp.c index f3c4ebb..ba3ac1f 100644 --- a/pp.c +++ b/pp.c @@ -3383,7 +3383,7 @@ PP(pp_chr) sv_recode_to_utf8(TARG, PL_encoding); tmps = SvPVX(TARG); if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) || - UNICODE_IS_REPLACEMENT(utf8_to_uvchr((U8*)tmps, NULL))) { + UNICODE_IS_REPLACEMENT(utf8_to_uvchr_buf((U8*)tmps, (U8*) tmps + SvCUR(TARG), NULL))) { SvGROW(TARG, 2); tmps = SvPVX(TARG); SvCUR_set(TARG, 1); @@ -3795,7 +3795,7 @@ PP(pp_uc) uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, cBOOL(IN_LOCALE_RUNTIME), &tainted); if (uv == GREEK_CAPITAL_LETTER_IOTA - && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) + && utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI) { in_iota_subscript = TRUE; } @@ -5344,7 +5344,7 @@ PP(pp_reverse) continue; } else { - if (!utf8_to_uvchr(s, 0)) + if (!utf8_to_uvchr_buf(s, send, 0)) break; up = (char*)s; s += UTF8SKIP(s); diff --git a/regcomp.c b/regcomp.c index e3da6e9..8c287bf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3480,8 +3480,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } else { uc = *((U8*)STRING(scan)); } @@ -3575,8 +3575,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (UTF) { const U8 * const s = (U8 *)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } else if (has_exactf_sharp_s) { RExC_seen |= REG_SEEN_EXACTF_SHARP_S; @@ -9822,7 +9822,10 @@ tryagain: for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + + /* tmpbuf has been constructed by us, so we + * know it is valid utf8 */ + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); s += unilen; @@ -9858,7 +9861,7 @@ tryagain: for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); len += unilen; diff --git a/symbian/PerlBase.cpp b/symbian/PerlBase.cpp index 4162e57..9312abe 100644 --- a/symbian/PerlBase.cpp +++ b/symbian/PerlBase.cpp @@ -364,7 +364,9 @@ int CPerlBase::ConsoleRead(const int fd, char* buf, int n) #else dTHX; for (i = 0; i < nUtf8; i+= UTF8SKIP(pUtf8 + i)) { - unsigned long u = utf8_to_uvchr((U8*)(pUtf8 + i), 0); + unsigned long u = utf8_to_uvchr_buf((U8*)(pUtf8 + i), + (U8*)(pUtf8 + nUtf8), + 0); if (u > 0xFF) { iConsole->Printf(_L("(keycode > 0xFF)\n")); buf[i] = 0; @@ -401,7 +403,7 @@ int CPerlBase::ConsoleWrite(const int fd, const char* buf, int n) dTHX; if (is_utf8_string((U8*)buf, n)) { for (int i = 0; i < n; i += UTF8SKIP(buf + i)) { - TChar u = utf8_to_uvchr((U8*)(buf + i), 0); + TChar u = valid_utf8_to_uvchr((U8*)(buf + i), 0); iConsole->Printf(_L("%c"), u); wrote++; } diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index 608f198..f6fa8f2 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -1,7 +1,7 @@ utf8.c AOK - [utf8_to_uvchr] + [utf8_to_uvchr_buf] Malformed UTF-8 character my $a = ord "\x80" ; @@ -14,7 +14,7 @@ <<<<<< Add a test when something actually calls utf16_to_utf8 __END__ -# utf8.c [utf8_to_uvchr] -W +# utf8.c [utf8_to_uvchr_buf] -W BEGIN { if (ord('A') == 193) { print "SKIPPED\n# ebcdic platforms do not generate Malformed UTF-8 warnings."; diff --git a/toke.c b/toke.c index 829ff86..58142ab 100644 --- a/toke.c +++ b/toke.c @@ -9883,7 +9883,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) termlen = 1; } else { - termcode = utf8_to_uvchr((U8*)s, &termlen); + termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen); Copy(s, termstr, termlen, U8); if (!UTF8_IS_INVARIANT(term)) has_utf8 = TRUE; diff --git a/utf8.c b/utf8.c index c9bc63a..85bf2f0 100644 --- a/utf8.c +++ b/utf8.c @@ -839,7 +839,11 @@ Returns the native code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -If C does not point to a well-formed UTF-8 character, zero is +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer. +Use L instead. + +If C points to one of the detected malformations, zero is returned and C is set, if possible, to -1. =cut @@ -850,8 +854,7 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { PERL_ARGS_ASSERT_UTF8_TO_UVCHR; - return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + return valid_utf8_to_uvchr(s, retlen); } /* @@ -902,10 +905,11 @@ Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -This function should only be used when the returned UV is considered -an index into the Unicode semantic tables (e.g. swashes). +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer. +Use L instead. -If C does not point to a well-formed UTF-8 character, zero is +If C points to one of the detected malformations, zero is returned and C is set, if possible, to -1. =cut @@ -916,9 +920,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { PERL_ARGS_ASSERT_UTF8_TO_UVUNI; - /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + return valid_utf8_to_uvuni(s, retlen); } /* @@ -1128,7 +1130,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) d = s = save; while (s < send) { STRLEN ulen; - *d++ = (U8)utf8_to_uvchr(s, &ulen); + *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen); s += ulen; } *d = '\0'; @@ -2154,7 +2156,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, dVAR; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = utf8_to_uvchr(p, NULL); + const UV uv0 = valid_utf8_to_uvchr(p, NULL); /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings * are necessary in EBCDIC, they are redundant no-ops * in ASCII-ish platforms, and hopefully optimized away. */ @@ -2302,7 +2304,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c bad_crossing: /* Failed, have to return the original */ - original = utf8_to_uvchr(p, lenp); + original = valid_utf8_to_uvchr(p, lenp); Copy(p, ustrp, *lenp, char); return original; } @@ -3508,7 +3510,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) "unexpectedly is not a string, flags=%lu", (unsigned long)SvFLAGS(sv_to)); } - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ /* Each key in the inverse list is a mapped-to value, and the key's * hash value is a list of the strings (each in utf8) that map to @@ -3575,7 +3577,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); } - /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ + /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ for (j = 0; j <= av_len(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { @@ -3583,9 +3585,11 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* When i==j this adds itself to the list */ - av_push(i_list, newSVuv(utf8_to_uvchr( - (U8*) SvPVX(*entryp), 0))); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ + av_push(i_list, newSVuv(utf8_to_uvchr_buf( + (U8*) SvPVX(*entryp), + (U8*) SvPVX(*entryp) + SvCUR(*entryp), + 0))); + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ } } } @@ -3931,7 +3935,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len) STRLEN char_len; if (UTF8_IS_SUPER(s)) { if (ckWARN_d(WARN_NON_UNICODE)) { - UV uv = utf8_to_uvchr(s, &char_len); + UV uv = utf8_to_uvchr_buf(s, e, &char_len); Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); ok = FALSE; @@ -3939,7 +3943,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len) } else if (UTF8_IS_SURROGATE(s)) { if (ckWARN_d(WARN_SURROGATE)) { - UV uv = utf8_to_uvchr(s, &char_len); + UV uv = utf8_to_uvchr_buf(s, e, &char_len); Perl_warner(aTHX_ packWARN(WARN_SURROGATE), "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv); ok = FALSE; @@ -3949,7 +3953,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len) ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)) && (ckWARN_d(WARN_NONCHAR))) { - UV uv = utf8_to_uvchr(s, &char_len); + UV uv = utf8_to_uvchr_buf(s, e, &char_len); Perl_warner(aTHX_ packWARN(WARN_NONCHAR), "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv); ok = FALSE; @@ -3999,7 +4003,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f truncated++; break; } - u = utf8_to_uvchr((U8*)s, 0); + u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); if (u < 256) { const unsigned char c = (unsigned char)u & 0xFF; if (flags & UNI_DISPLAY_BACKSLASH) { -- 1.8.3.1