#define UTF8_WARN_EMPTY 1
#define UTF8_WARN_CONTINUATION 2
#define UTF8_WARN_NON_CONTINUATION 3
-#define UTF8_WARN_SHORT 5
-#define UTF8_WARN_OVERFLOW 6
-#define UTF8_WARN_LONG 8
+#define UTF8_WARN_SHORT 4
+#define UTF8_WARN_OVERFLOW 5
+#define UTF8_WARN_LONG 6
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
#else
if (uv == 0xfe || uv == 0xff) {
if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
- sv = sv_2mortal(newSVpvf_nocontext("Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
}
if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
if (UNICODE_IS_SURROGATE(uv)) {
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
- sv = sv_2mortal(newSVpvf_nocontext("UTF-16 surrogate U+%04"UVXf"", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
}
if (flags & UTF8_DISALLOW_SURROGATE) {
goto disallowed;
}
else if (UNICODE_IS_NONCHAR(uv)) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
- sv = sv_2mortal(newSVpvf_nocontext("Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
}
if (flags & UTF8_DISALLOW_NONCHAR) {
goto disallowed;
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
- sv = sv_2mortal(newSVpvf_nocontext("Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
}
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
=cut
*/
+
UV
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
/* Call the low level routine asking for checks */
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
PERL_ARGS_ASSERT_TO_UTF8_CASE;
+ /* Note that swash_fetch() doesn't output warnings for these because it
+ * assumes we will */
+ if (uv1 >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) {
+ if (uv1 <= UNICODE_SURROGATE_LAST) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
+ }
+ else if (UNICODE_IS_SUPER(uv1)) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
+ }
+
+ /* Note that non-characters are perfectly legal, so no warning should
+ * be given */
+ }
+
uvuni_to_utf8(tmpbuf, uv1);
if (!*swashp) /* load on-demand */
/* If char is encoded then swatch is for the prefix */
needents = (1 << UTF_ACCUMULATION_SHIFT);
off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+ if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_UTF8)) {
+ const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+
+ /* This outputs warnings for binary properties only, assuming that
+ * to_utf8_case() will output any. Also, surrogates aren't checked
+ * for, as that would warn on things like /\p{Gc=Cs}/ */
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+ if (SvUV(*bitssvp) == 1) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
+ }
+ }
}
/*