# This code point is chosen so that it is representable in a UV on
# 32-bit machines
$UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/Code point 0x80000000 is not Unicode, and not portable/
],
[ "overflow with FE/FF",
# This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
($has_quad)
? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
: "\xfe\x86\x80\x80\x80\x80\x80",
- $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0,
+
+ # We include both warning categories to make sure the FE_FF one has
+ # precedence
+ "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0,
($has_quad) ? 13 : 7,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/overflow at byte .*, after start byte 0xf/
],
);
push @tests,
[ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
$UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
- qr/Code point beginning with byte .* is not Unicode, and not portable/
+ qr/Code point 0x.* is not Unicode, and not portable/
];
}
function Perl_newATTRSUB. Function Perl_newATTRSUB has been removed. newATTRSUB
is now macro to a different internal function.
-=item XXX
-
-XXX
+=item Changes in warnings raised by C<utf8n_to_uvchr()>
+
+This bottom level function decodes the first character of a UTF-8 string
+into a code point. It is accessible to C<XS> level code, but it's
+discouraged from using it directly. There are higher level functions
+that call this that should be used instead, such as
+L<perlapi/utf8_to_uvchr_buf>. For completeness though, this documents
+some changes to it. Now, tests for malformations are done before any
+tests for other potential issues. One of those issues involves code
+points so large that they have never appeared in any official standard
+(the current standard has scaled back the highest acceptable code point
+from earlier versions). It is possible (though not done in CPAN) to
+warn and/or forbid these code points, while accepting smaller code
+points that are still above the legal Unicode maximum. The warning
+message for this now includes the code point if representable on the
+machine. Previously it always displayed raw bytes, which is what it
+still does for non-representable code points.
=back
}
}
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
- if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
- && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
- {
- /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
- * generation of the sv, since no warnings are raised under CHECK */
- if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
- && ckWARN_d(WARN_UTF8))
- {
- /* This message is deliberately not of the same syntax as the other
- * messages for malformations, for backwards compatibility in the
- * unlikely event that code is relying on its precise earlier text
- */
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
- pack_warn = packWARN(WARN_UTF8);
- }
- if (flags & UTF8_DISALLOW_FE_FF) {
- goto malformed;
- }
- }
+#ifndef EBCDIC /* EBCDIC can't overflow */
if (UNLIKELY(overflowed)) {
-
- /* If the first byte is FF, it will overflow a 32-bit word. If the
- * first byte is FE, it will overflow a signed 32-bit word. The
- * above preserves backward compatibility, since its message was used
- * in earlier versions of this code in preference to overflow */
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
goto malformed;
}
|UTF8_WARN_ILLEGAL_INTERCHANGE)))
{
if (UNICODE_IS_SURROGATE(uv)) {
+
+ /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+ * generation of the sv, since no warnings are raised under CHECK */
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
&& ckWARN_d(WARN_SURROGATE))
{
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
- && ckWARN_d(WARN_NON_UNICODE))
+ && ckWARN_d(WARN_NON_UNICODE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
pack_warn = packWARN(WARN_NON_UNICODE);
}
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+ /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+ * points. We test for these after the regular SUPER ones, and
+ * before possibly bailing out, so that the more dire warning
+ * overrides the regular one, if applicable */
+ if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
+ && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+ {
+ if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+ == UTF8_WARN_FE_FF
+ && ckWARN_d(WARN_UTF8))
+ {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ if (flags & UTF8_DISALLOW_FE_FF) {
+ goto disallowed;
+ }
+ }
+#endif
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
}