From 9ae3ac1a84c63b0eadf5baf47ce7096482280f32 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 9 Jan 2011 15:33:28 -0700 Subject: [PATCH] Add warnings for use of problematic code points The non-Unicode code points have no Unicode semantics, so applying operations such as casing on them warns. This patch also includes the changes to test the warnings added by recent commits for handling the surrogates and above-Unicode code points --- pod/perldiag.pod | 48 ++++++-- t/lib/warnings/utf8 | 339 ++++++++++++++++++++++++++++++++++++++++------------ utf8.c | 30 +++++ 3 files changed, 336 insertions(+), 81 deletions(-) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 2c5a637..b7b0c10 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1360,6 +1360,8 @@ template code following the slash. See L. =item Code point 0x%X is not Unicode, may not be portable +=item Code point 0x%X is not Unicode, no properties match it; all inverse properties do + (W utf8) You had a code point above the Unicode maximum of U+10FFFF. Perl allows strings to contain a superset of Unicode code @@ -1369,6 +1371,17 @@ At one time, it was legal in some standards to have code points up to 0x7FFF_FFFF, but not higher. Code points above 0xFFFF_FFFF require larger than a 32 bit word. +None of the Unicode or Perl-defined properties will match a non-Unicode +code point. For example, + + chr(0x7FF_FFFF) =~ /\p{Any}/ + +will not match, because the code point is not in Unicode. But + + chr(0x7FF_FFFF) =~ /\P{Any}/ + +will match. + =item %s: Command not found (A) You've accidentally run your script through B instead of Perl. @@ -3192,6 +3205,32 @@ handler was defined. While some handlers can be autogenerated in terms of other handlers, there is no default handler for any operation, unless C overloading key is specified to be true. See L. +=item Operation "%s" returns its argument for UTF-16 surrogate U+%X + +(W) You performed an operation requiring Unicode semantics on a Unicode +surrogate. Unicode frowns upon the use of surrogates for anything but +storing strings in UTF-16, but semantics are (reluctantly) defined for +the surrogates, and they are to do nothing for this operation. Because +the use of surrogates can be dangerous, Perl warns. + +If the operation shown is "ToFold", it means that case-insensitive +matching in a regular expression was done on the code point. + +If you know what you are doing you can turn off this warning by +C. + +=item Operation "%s" returns its argument for non-Unicode code point 0x%X + +(W) You performed an operation requiring Unicode semantics on a code +point that is not in Unicode, so what it should do is not defined. Perl +has chosen to have it do nothing, and warn you. + +If the operation shown is "ToFold", it means that case-insensitive +matching in a regular expression was done on the code point. + +If you know what you are doing you can turn off this warning by +C. + =item Operator or semicolon missing before %s (S ambiguous) You used a variable or subroutine call where the parser @@ -4628,17 +4667,12 @@ Check the #! line, or manually feed your script into Perl yourself. (F) The unexec() routine failed for some reason. See your local FSF representative, who probably put it there in the first place. -=item Unicode non-character 0x%x is illegal for interchange - =item Unicode non-character U+%X is illegal for open interchange (W utf8) Certain codepoints, such as U+FFFE and U+FFFF, are defined by the Unicode standard to be non-characters. Those are legal codepoints, but are reserved for internal use; so, applications shouldn't attempt to exchange -them. In some cases, this message is also given if you use a codepoint that -isn't in Unicode--that is it is above the legal maximum of U+10FFFF. These -aren't legal at all in Unicode, so they are illegal for interchange, but can be -used internally in a Perl program. If you know what you are doing you can turn +them. If you know what you are doing you can turn off this warning by C. =item Unknown BYTEORDER @@ -5382,7 +5416,7 @@ of the codes @, /, U, u, w or a *-length. Redesign the template. (W closed) The filehandle you're writing to got itself closed sometime before now. Check your control flow. -=item %s "\x%s" does not map to Unicode +=item %s "\x%X" does not map to Unicode (F) When reading in different encodings Perl tries to map everything into Unicode characters. The bytes you read in are not legal in diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index f4b5333..6514175 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -34,87 +34,278 @@ Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately af Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 14. ######## use warnings 'utf8'; -my $d7ff = chr(0xD7FF); -my $d800 = chr(0xD800); -my $dfff = chr(0xDFFF); -my $e000 = chr(0xE000); -my $feff = chr(0xFEFF); -my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); -my $ffff = chr(0xFFFF); -my $hex4 = chr(0x10000); -my $hex5 = chr(0x100000); -my $maxm1 = chr(0x10FFFE); -my $max = chr(0x10FFFF); +my $d7ff = uc(chr(0xD7FF)); +my $d800 = uc(chr(0xD800)); +my $dfff = uc(chr(0xDFFF)); +my $e000 = uc(chr(0xE000)); +my $feff = uc(chr(0xFEFF)); +my $fffd = uc(chr(0xFFFD)); +my $fffe = uc(chr(0xFFFE)); +my $ffff = uc(chr(0xFFFF)); +my $hex4 = uc(chr(0x10000)); +my $hex5 = uc(chr(0x100000)); +my $maxm1 = uc(chr(0x10FFFE)); +my $max = uc(chr(0x10FFFF)); +my $nonUnicode = uc(chr(0x110000)); no warnings 'utf8'; -my $d7ff = chr(0xD7FF); -my $d800 = chr(0xD800); -my $dfff = chr(0xDFFF); -my $e000 = chr(0xE000); -my $feff = chr(0xFEFF); -my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); -my $ffff = chr(0xFFFF); -my $hex4 = chr(0x10000); -my $hex5 = chr(0x100000); -my $maxm1 = chr(0x10FFFE); -my $max = chr(0x10FFFF); +my $d7ff = uc(chr(0xD7FF)); +my $d800 = uc(chr(0xD800)); +my $dfff = uc(chr(0xDFFF)); +my $e000 = uc(chr(0xE000)); +my $feff = uc(chr(0xFEFF)); +my $fffd = uc(chr(0xFFFD)); +my $fffe = uc(chr(0xFFFE)); +my $ffff = uc(chr(0xFFFF)); +my $hex4 = uc(chr(0x10000)); +my $hex5 = uc(chr(0x100000)); +my $maxm1 = uc(chr(0x10FFFE)); +my $max = uc(chr(0x10FFFF)); +my $nonUnicode = uc(chr(0x110000)); EXPECT +Operation "uc" returns its argument for UTF-16 surrogate U+D800 at - line 3. +Operation "uc" returns its argument for UTF-16 surrogate U+DFFF at - line 4. +Operation "uc" returns its argument for non-Unicode code point 0x110000 at - line 14. ######## use warnings 'utf8'; -my $d7ff = pack("U", 0xD7FF); -my $d800 = pack("U", 0xD800); -my $dfff = pack("U", 0xDFFF); -my $e000 = pack("U", 0xE000); -my $feff = pack("U", 0xFEFF); -my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); -my $ffff = pack("U", 0xFFFF); -my $hex4 = pack("U", 0x10000); -my $hex5 = pack("U", 0x100000); -my $maxm1 = pack("U", 0x10FFFE); -my $max = pack("U", 0x10FFFF); +my $d7ff = lc pack("U", 0xD7FF); +my $d800 = lc pack("U", 0xD800); +my $dfff = lc pack("U", 0xDFFF); +my $e000 = lc pack("U", 0xE000); +my $feff = lc pack("U", 0xFEFF); +my $fffd = lc pack("U", 0xFFFD); +my $fffe = lc pack("U", 0xFFFE); +my $ffff = lc pack("U", 0xFFFF); +my $hex4 = lc pack("U", 0x10000); +my $hex5 = lc pack("U", 0x100000); +my $maxm1 = lc pack("U", 0x10FFFE); +my $max = lc pack("U", 0x10FFFF); +my $nonUnicode = lc(pack("U", 0x110000)); no warnings 'utf8'; -my $d7ff = pack("U", 0xD7FF); -my $d800 = pack("U", 0xD800); -my $dfff = pack("U", 0xDFFF); -my $e000 = pack("U", 0xE000); -my $feff = pack("U", 0xFEFF); -my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); -my $ffff = pack("U", 0xFFFF); -my $hex4 = pack("U", 0x10000); -my $hex5 = pack("U", 0x100000); -my $maxm1 = pack("U", 0x10FFFE); -my $max = pack("U", 0x10FFFF); +my $d7ff = lc pack("U", 0xD7FF); +my $d800 = lc pack("U", 0xD800); +my $dfff = lc pack("U", 0xDFFF); +my $e000 = lc pack("U", 0xE000); +my $feff = lc pack("U", 0xFEFF); +my $fffd = lc pack("U", 0xFFFD); +my $fffe = lc pack("U", 0xFFFE); +my $ffff = lc pack("U", 0xFFFF); +my $hex4 = lc pack("U", 0x10000); +my $hex5 = lc pack("U", 0x100000); +my $maxm1 = lc pack("U", 0x10FFFE); +my $max = lc pack("U", 0x10FFFF); +my $nonUnicode = lc(pack("U", 0x110000)); EXPECT +Operation "lc" returns its argument for UTF-16 surrogate U+D800 at - line 3. +Operation "lc" returns its argument for UTF-16 surrogate U+DFFF at - line 4. +Operation "lc" returns its argument for non-Unicode code point 0x110000 at - line 14. ######## use warnings 'utf8'; -my $d7ff = "\x{D7FF}"; -my $d800 = "\x{D800}"; -my $dfff = "\x{DFFF}"; -my $e000 = "\x{E000}"; -my $feff = "\x{FEFF}"; -my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; -my $ffff = "\x{FFFF}"; -my $hex4 = "\x{10000}"; -my $hex5 = "\x{100000}"; -my $maxm1 = "\x{10FFFE}"; -my $max = "\x{10FFFF}"; -uc($ffff); +my $d7ff = ucfirst "\x{D7FF}"; +my $d800 = ucfirst "\x{D800}"; +my $dfff = ucfirst "\x{DFFF}"; +my $e000 = ucfirst "\x{E000}"; +my $feff = ucfirst "\x{FEFF}"; +my $fffd = ucfirst "\x{FFFD}"; +my $fffe = ucfirst "\x{FFFE}"; +my $ffff = ucfirst "\x{FFFF}"; +my $hex4 = ucfirst "\x{10000}"; +my $hex5 = ucfirst "\x{100000}"; +my $maxm1 = ucfirst "\x{10FFFE}"; +my $max = ucfirst "\x{10FFFF}"; +my $nonUnicode = ucfirst "\x{110000}"; no warnings 'utf8'; -my $d7ff = "\x{D7FF}"; -my $d800 = "\x{D800}"; -my $dfff = "\x{DFFF}"; -my $e000 = "\x{E000}"; -my $feff = "\x{FEFF}"; -my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; -my $ffff = "\x{FFFF}"; -my $hex4 = "\x{10000}"; -my $hex5 = "\x{100000}"; -my $maxm1 = "\x{10FFFE}"; -my $max = "\x{10FFFF}"; -uc($ffff); +my $d7ff = ucfirst "\x{D7FF}"; +my $d800 = ucfirst "\x{D800}"; +my $dfff = ucfirst "\x{DFFF}"; +my $e000 = ucfirst "\x{E000}"; +my $feff = ucfirst "\x{FEFF}"; +my $fffd = ucfirst "\x{FFFD}"; +my $fffe = ucfirst "\x{FFFE}"; +my $ffff = ucfirst "\x{FFFF}"; +my $hex4 = ucfirst "\x{10000}"; +my $hex5 = ucfirst "\x{100000}"; +my $maxm1 = ucfirst "\x{10FFFE}"; +my $max = ucfirst "\x{10FFFF}"; +my $nonUnicode = ucfirst "\x{110000}"; +EXPECT +Operation "ucfirst" returns its argument for UTF-16 surrogate U+D800 at - line 3. +Operation "ucfirst" returns its argument for UTF-16 surrogate U+DFFF at - line 4. +Operation "ucfirst" returns its argument for non-Unicode code point 0x110000 at - line 14. +######## +use warnings 'utf8'; +chr(0xD7FF) =~ /\p{Any}/; +chr(0xD800) =~ /\p{Any}/; +chr(0xDFFF) =~ /\p{Any}/; +chr(0xE000) =~ /\p{Any}/; +chr(0xFEFF) =~ /\p{Any}/; +chr(0xFFFD) =~ /\p{Any}/; +chr(0xFFFE) =~ /\p{Any}/; +chr(0xFFFF) =~ /\p{Any}/; +chr(0x10000) =~ /\p{Any}/; +chr(0x100000) =~ /\p{Any}/; +chr(0x10FFFE) =~ /\p{Any}/; +chr(0x10FFFF) =~ /\p{Any}/; +chr(0x110000) =~ /\p{Any}/; +no warnings 'utf8'; +chr(0xD7FF) =~ /\p{Any}/; +chr(0xD800) =~ /\p{Any}/; +chr(0xDFFF) =~ /\p{Any}/; +chr(0xE000) =~ /\p{Any}/; +chr(0xFEFF) =~ /\p{Any}/; +chr(0xFFFD) =~ /\p{Any}/; +chr(0xFFFE) =~ /\p{Any}/; +chr(0xFFFF) =~ /\p{Any}/; +chr(0x10000) =~ /\p{Any}/; +chr(0x100000) =~ /\p{Any}/; +chr(0x10FFFE) =~ /\p{Any}/; +chr(0x10FFFF) =~ /\p{Any}/; +chr(0x110000) =~ /\p{Any}/; +EXPECT +Code point 0x110000 is not Unicode, no properties match it; all inverse properties do at - line 14. +######## +require "../test.pl"; +use warnings 'utf8'; +my $file = tempfile(); +open(my $fh, "+>:utf8", $file); +print $fh "\x{D7FF}", "\n"; +print $fh "\x{D800}", "\n"; +print $fh "\x{DFFF}", "\n"; +print $fh "\x{E000}", "\n"; +print $fh "\x{FDCF}", "\n"; +print $fh "\x{FDD0}", "\n"; +print $fh "\x{FDEF}", "\n"; +print $fh "\x{FDF0}", "\n"; +print $fh "\x{FEFF}", "\n"; +print $fh "\x{FFFD}", "\n"; +print $fh "\x{FFFE}", "\n"; +print $fh "\x{FFFF}", "\n"; +print $fh "\x{10000}", "\n"; +print $fh "\x{1FFFE}", "\n"; +print $fh "\x{1FFFF}", "\n"; +print $fh "\x{2FFFE}", "\n"; +print $fh "\x{2FFFF}", "\n"; +print $fh "\x{3FFFE}", "\n"; +print $fh "\x{3FFFF}", "\n"; +print $fh "\x{4FFFE}", "\n"; +print $fh "\x{4FFFF}", "\n"; +print $fh "\x{5FFFE}", "\n"; +print $fh "\x{5FFFF}", "\n"; +print $fh "\x{6FFFE}", "\n"; +print $fh "\x{6FFFF}", "\n"; +print $fh "\x{7FFFE}", "\n"; +print $fh "\x{7FFFF}", "\n"; +print $fh "\x{8FFFE}", "\n"; +print $fh "\x{8FFFF}", "\n"; +print $fh "\x{9FFFE}", "\n"; +print $fh "\x{9FFFF}", "\n"; +print $fh "\x{AFFFE}", "\n"; +print $fh "\x{AFFFF}", "\n"; +print $fh "\x{BFFFE}", "\n"; +print $fh "\x{BFFFF}", "\n"; +print $fh "\x{CFFFE}", "\n"; +print $fh "\x{CFFFF}", "\n"; +print $fh "\x{DFFFE}", "\n"; +print $fh "\x{DFFFF}", "\n"; +print $fh "\x{EFFFE}", "\n"; +print $fh "\x{EFFFF}", "\n"; +print $fh "\x{FFFFE}", "\n"; +print $fh "\x{FFFFF}", "\n"; +print $fh "\x{100000}", "\n"; +print $fh "\x{10FFFE}", "\n"; +print $fh "\x{10FFFF}", "\n"; +print $fh "\x{110000}", "\n"; +close $fh; +EXPECT +Unicode surrogate U+D800 is illegal in UTF-8 at - line 6. +Unicode surrogate U+DFFF is illegal in UTF-8 at - line 7. +Unicode non-character U+FDD0 is illegal for open interchange at - line 10. +Unicode non-character U+FDEF is illegal for open interchange at - line 11. +Unicode non-character U+FFFE is illegal for open interchange at - line 15. +Unicode non-character U+FFFF is illegal for open interchange at - line 16. +Unicode non-character U+1FFFE is illegal for open interchange at - line 18. +Unicode non-character U+1FFFF is illegal for open interchange at - line 19. +Unicode non-character U+2FFFE is illegal for open interchange at - line 20. +Unicode non-character U+2FFFF is illegal for open interchange at - line 21. +Unicode non-character U+3FFFE is illegal for open interchange at - line 22. +Unicode non-character U+3FFFF is illegal for open interchange at - line 23. +Unicode non-character U+4FFFE is illegal for open interchange at - line 24. +Unicode non-character U+4FFFF is illegal for open interchange at - line 25. +Unicode non-character U+5FFFE is illegal for open interchange at - line 26. +Unicode non-character U+5FFFF is illegal for open interchange at - line 27. +Unicode non-character U+6FFFE is illegal for open interchange at - line 28. +Unicode non-character U+6FFFF is illegal for open interchange at - line 29. +Unicode non-character U+7FFFE is illegal for open interchange at - line 30. +Unicode non-character U+7FFFF is illegal for open interchange at - line 31. +Unicode non-character U+8FFFE is illegal for open interchange at - line 32. +Unicode non-character U+8FFFF is illegal for open interchange at - line 33. +Unicode non-character U+9FFFE is illegal for open interchange at - line 34. +Unicode non-character U+9FFFF is illegal for open interchange at - line 35. +Unicode non-character U+AFFFE is illegal for open interchange at - line 36. +Unicode non-character U+AFFFF is illegal for open interchange at - line 37. +Unicode non-character U+BFFFE is illegal for open interchange at - line 38. +Unicode non-character U+BFFFF is illegal for open interchange at - line 39. +Unicode non-character U+CFFFE is illegal for open interchange at - line 40. +Unicode non-character U+CFFFF is illegal for open interchange at - line 41. +Unicode non-character U+DFFFE is illegal for open interchange at - line 42. +Unicode non-character U+DFFFF is illegal for open interchange at - line 43. +Unicode non-character U+EFFFE is illegal for open interchange at - line 44. +Unicode non-character U+EFFFF is illegal for open interchange at - line 45. +Unicode non-character U+FFFFE is illegal for open interchange at - line 46. +Unicode non-character U+FFFFF is illegal for open interchange at - line 47. +Unicode non-character U+10FFFE is illegal for open interchange at - line 49. +Unicode non-character U+10FFFF is illegal for open interchange at - line 50. +Code point 0x110000 is not Unicode, may not be portable at - line 51. +######## +require "../test.pl"; +no warnings 'utf8'; +my $file = tempfile(); +open(my $fh, "+>:utf8", $file); +print $fh "\x{D7FF}", "\n"; +print $fh "\x{D800}", "\n"; +print $fh "\x{DFFF}", "\n"; +print $fh "\x{E000}", "\n"; +print $fh "\x{FDCF}", "\n"; +print $fh "\x{FDD0}", "\n"; +print $fh "\x{FDEF}", "\n"; +print $fh "\x{FDF0}", "\n"; +print $fh "\x{FEFF}", "\n"; +print $fh "\x{FFFD}", "\n"; +print $fh "\x{FFFE}", "\n"; +print $fh "\x{FFFF}", "\n"; +print $fh "\x{10000}", "\n"; +print $fh "\x{1FFFE}", "\n"; +print $fh "\x{1FFFF}", "\n"; +print $fh "\x{2FFFE}", "\n"; +print $fh "\x{2FFFF}", "\n"; +print $fh "\x{3FFFE}", "\n"; +print $fh "\x{3FFFF}", "\n"; +print $fh "\x{4FFFE}", "\n"; +print $fh "\x{4FFFF}", "\n"; +print $fh "\x{5FFFE}", "\n"; +print $fh "\x{5FFFF}", "\n"; +print $fh "\x{6FFFE}", "\n"; +print $fh "\x{6FFFF}", "\n"; +print $fh "\x{7FFFE}", "\n"; +print $fh "\x{7FFFF}", "\n"; +print $fh "\x{8FFFE}", "\n"; +print $fh "\x{8FFFF}", "\n"; +print $fh "\x{9FFFE}", "\n"; +print $fh "\x{9FFFF}", "\n"; +print $fh "\x{AFFFE}", "\n"; +print $fh "\x{AFFFF}", "\n"; +print $fh "\x{BFFFE}", "\n"; +print $fh "\x{BFFFF}", "\n"; +print $fh "\x{CFFFE}", "\n"; +print $fh "\x{CFFFF}", "\n"; +print $fh "\x{DFFFE}", "\n"; +print $fh "\x{DFFFF}", "\n"; +print $fh "\x{EFFFE}", "\n"; +print $fh "\x{EFFFF}", "\n"; +print $fh "\x{FFFFE}", "\n"; +print $fh "\x{FFFFF}", "\n"; +print $fh "\x{100000}", "\n"; +print $fh "\x{10FFFE}", "\n"; +print $fh "\x{10FFFF}", "\n"; +print $fh "\x{110000}", "\n"; +close $fh; EXPECT diff --git a/utf8.c b/utf8.c index 7bf2e15..605db15 100644 --- a/utf8.c +++ b/utf8.c @@ -1801,6 +1801,24 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, PERL_ARGS_ASSERT_TO_UTF8_CASE; + /* Note that swash_fetch() doesn't output warnings for these because it + * assumes we will */ + if (uv1 >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) { + if (uv1 <= UNICODE_SURROGATE_LAST) { + const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); + } + else if (UNICODE_IS_SUPER(uv1)) { + const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); + } + + /* Note that non-characters are perfectly legal, so no warning should + * be given */ + } + uvuni_to_utf8(tmpbuf, uv1); if (!*swashp) /* load on-demand */ @@ -2121,6 +2139,18 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) /* If char is encoded then swatch is for the prefix */ needents = (1 << UTF_ACCUMULATION_SHIFT); off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK; + if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_UTF8)) { + const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0); + + /* This outputs warnings for binary properties only, assuming that + * to_utf8_case() will output any. Also, surrogates aren't checked + * for, as that would warn on things like /\p{Gc=Cs}/ */ + SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); + if (SvUV(*bitssvp) == 1) { + Perl_warner(aTHX_ packWARN(WARN_UTF8), + "Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point); + } + } } /* -- 1.8.3.1