From 13f4dd346e6f3b61534a20f246de3a80b3feb743 Mon Sep 17 00:00:00 2001 From: Abigail Date: Tue, 6 Jun 2017 18:51:37 +0200 Subject: [PATCH] Forbid out of range Unicode code points. Unicode allows code points up to 0x10FFFF, but Perl allows much more. However, code points above IV_MAX may not always work correctly, and may even cause the interpreter to hang. Code points exceeding IV_MAX have been deprecated since 5.24, and will be illegal in 5.28. This commit removes many tests (without replacing them) as they were testing behaviour of code points exceeding IV_MAX. --- ext/XS-APItest/t/utf8.t | 27 +++++++++++++------------ t/lib/warnings/pp | 2 +- t/lib/warnings/utf8 | 33 ------------------------------ t/op/bop.t | 53 +------------------------------------------------ t/op/chop.t | 6 +++--- t/op/ver.t | 22 +------------------- t/re/pat_advanced.t | 20 +------------------ utf8.c | 27 ++++++++++--------------- 8 files changed, 31 insertions(+), 159 deletions(-) diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index c7a032e..788d564 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -381,19 +381,23 @@ my %code_points = ( 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"), - 0x80000000 - 1 => - (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), - 0x80000000 => - (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - 0xFFFFFFFF => - (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), ); if ($::is64bit) { no warnings qw(overflow portable); + + $code_points{0x80000000 - 1} + = (isASCII) + ? "\xfd\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), + $code_points{0x80000000} + = (isASCII) + ? "\xfe\x82\x80\x80\x80\x80\x80" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), + $code_points{0xFFFFFFFF} + = (isASCII) + ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), $code_points{0x100000000} = (isASCII) ? "\xfe\x84\x80\x80\x80\x80\x80" @@ -406,10 +410,7 @@ if ($::is64bit) { = (isASCII) ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"); - $code_points{0xFFFFFFFFFFFFFFFF} - = (isASCII) - ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" - : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); + if (isASCII) { # These could falsely show as overlongs in a naive # implementation $code_points{0x40000000000} diff --git a/t/lib/warnings/pp b/t/lib/warnings/pp index 27629a7..33d438b 100644 --- a/t/lib/warnings/pp +++ b/t/lib/warnings/pp @@ -139,7 +139,7 @@ $_ = ~ "\x{100}"; EXPECT OPTION regex Use of strings with code points over 0xFF as arguments to 1's complement \(~\) operator is deprecated. This will be a fatal error in Perl 5.28 at - line \d+. -Use of code point 0xFF+EFF is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+. +Use of code point 0xFF+EFF is not allowed; the permissible max is 0x7F+ at - line 2\. ######## # NAME chr -1 use warnings 'utf8'; diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index af04d4c..a10174a 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -736,39 +736,6 @@ $a = uc("\x{103}"); $a = ucfirst("\x{104}"); EXPECT ######## -# NAME Deprecation of too-large code points -require "../test.pl"; -use warnings 'non_unicode'; -my $max_cp = ~0 >> 1; -my $max_char = chr $max_cp; -my $to_warn_cp = $max_cp + 1; -my $to_warn_char = chr $to_warn_cp; -$max_char =~ /[\x{110000}\P{Unassigned}]/; -$to_warn_char =~ /[\x{110000}\P{Unassigned}]/; -my $temp = qr/$max_char/; -$temp = qr/$to_warn_char/; -$temp = uc($max_char); -$temp = uc($to_warn_char); -my $file = tempfile(); -open(my $fh, "+>:utf8", $file); -print $fh $max_char, "\n"; -print $fh $to_warn_char, "\n"; -close $fh; -EXPECT -OPTION regex -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in pattern match \(m//\) at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+. -Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in uc at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+. -Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+. -Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+. -Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in print at - line \d+. -######## # NAME [perl #127262] BEGIN{ if (ord('A') == 193) { diff --git a/t/op/bop.t b/t/op/bop.t index 594dd09..1704fdd 100644 --- a/t/op/bop.t +++ b/t/op/bop.t @@ -19,7 +19,7 @@ BEGIN { # If you find tests are failing, please try adding names to tests to track # down where the failure is, and supply your new names as a patch. # (Just-in-time test naming) -plan tests => 192 + (10*13*2) + 5 + 31; +plan tests => 187 + (10*13*2) + 5 + 31; # numerics ok ((0xdead & 0xbeef) == 0x9ead); @@ -136,57 +136,6 @@ is (sprintf("%vd", v120.300 ^ v200.400), '176.188'); is (sprintf("%vd", $a), '248.444'); } -# -# UTF8 ~ behaviour -# - -{ - my @not36; - - for (0x100...0xFFF) { - $a = ~(chr $_); - push @not36, sprintf("%#03X", $_) - if $a ne chr(~$_) or length($a) != 1 or ~$a ne chr($_); - } - is (join (', ', @not36), ''); - - my @not37; - - for my $i (0xEEE...0xF00) { - for my $j (0x0..0x120) { - $a = ~(chr ($i) . chr $j); - push @not37, sprintf("%#03X %#03X", $i, $j) - if $a ne chr(~$i).chr(~$j) or - length($a) != 2 or - ~$a ne chr($i).chr($j); - } - } - is (join (', ', @not37), ''); - - is (~chr(~0), "\0"); - - - my @not39; - - for my $i (0x100..0x120) { - for my $j (0x100...0x120) { - push @not39, sprintf("%#03X %#03X", $i, $j) - if ~(chr($i)|chr($j)) ne (~chr($i)&~chr($j)); - } - } - is (join (', ', @not39), ''); - - my @not40; - - for my $i (0x100..0x120) { - for my $j (0x100...0x120) { - push @not40, sprintf("%#03X %#03X", $i, $j) - if ~(chr($i)&chr($j)) ne (~chr($i)|~chr($j)); - } - } - is (join (', ', @not40), ''); -} - # More variations on 19 and 22. is ("ok \xFF\x{FF}\n" & "ok 41\n", "ok 41\n"); diff --git a/t/op/chop.t b/t/op/chop.t index 743f21a..f12332a 100644 --- a/t/op/chop.t +++ b/t/op/chop.t @@ -264,10 +264,10 @@ foreach my $start (@chars) { use Config; $Config{ivsize} >= 8 or skip("this build can't handle very large characters", 2); - my $utf = "\x{ffffffffffffffff}\x{fffffffffffffffe}"; + my $utf = "\x{7fffffffffffffff}\x{7ffffffffffffffe}"; my $result = chop $utf; - is($utf, "\x{ffffffffffffffff}", "chop even higher 'unicode' - remnant"); - is($result, "\x{fffffffffffffffe}", "chop even higher 'unicode' - result"); + is($utf, "\x{7fffffffffffffff}", "chop even higher 'unicode' - remnant"); + is($result, "\x{7ffffffffffffffe}", "chop even higher 'unicode' - result"); } } diff --git a/t/op/ver.t b/t/op/ver.t index e896711..182c42a 100644 --- a/t/op/ver.t +++ b/t/op/ver.t @@ -12,7 +12,7 @@ $DOWARN = 1; # enable run-time warnings now use Config; -plan( tests => 58 ); +plan( tests => 52 ); eval 'use v5.5.640'; is( $@, '', "use v5.5.640; $@"); @@ -224,26 +224,6 @@ $v = $revision + $version/1000 + $subversion/1000000; ok( abs($v - $]) < 10**-8 , "\$^V == \$] (numeric)" ); -{ - - no warnings 'deprecated'; # These are above IV_MAX on 32 bit machines - # [ID 20010902.001 (#7608)] check if v-strings handle full UV range or not - if ( $Config{'uvsize'} >= 4 ) { - is( sprintf("%vd", eval 'v2147483647.2147483648'), '2147483647.2147483648', 'v-string > IV_MAX[32-bit]' ); - is( sprintf("%vd", eval 'v3141592653'), '3141592653', 'IV_MAX < v-string < UV_MAX[32-bit]'); - is( sprintf("%vd", eval 'v4294967295'), '4294967295', 'v-string == UV_MAX[32-bit] - 1'); - } - - SKIP: { - skip("No quads", 3) if $Config{uvsize} < 8; - - if ( $Config{'uvsize'} >= 8 ) { - is( sprintf("%vd", eval 'v9223372036854775807.9223372036854775808'), '9223372036854775807.9223372036854775808', 'v-string > IV_MAX[64-bit]' ); - is( sprintf("%vd", eval 'v17446744073709551615'), '17446744073709551615', 'IV_MAX < v-string < UV_MAX[64-bit]'); - is( sprintf("%vd", eval 'v18446744073709551615'), '18446744073709551615', 'v-string == UV_MAX[64-bit] - 1'); - } - } -} # Tests for magic v-strings diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 7f0859c..f2d9c74 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -2342,7 +2342,7 @@ EOF # We use 'ok' instead of 'like' because the warnings are lexically # scoped, and want to turn them off, so have to do the match in this # scope. - if ($Config{uvsize} < 8) { + if ($Config{uvsize} > 4) { ok(chr(0xFFFF_FFFE) =~ /\p{Is_32_Bit_Super}/, "chr(0xFFFF_FFFE) can match a Unicode property"); ok(chr(0xFFFF_FFFF) =~ /\p{Is_32_Bit_Super}/, @@ -2353,24 +2353,6 @@ EOF ok(chr(0xFFFF_FFFF) =~ $p, # Tests any caching "chr(0xFFFF_FFFF) can match itself in a [class] subsequently"); } - else { - no warnings 'overflow'; - ok(chr(0xFFFF_FFFF_FFFF_FFFE) =~ qr/\p{Is_Portable_Super}/, - "chr(0xFFFF_FFFF_FFFF_FFFE) can match a Unicode property"); - ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ qr/^\p{Is_Portable_Super}$/, - "chr(0xFFFF_FFFF_FFFF_FFFF) can match a Unicode property"); - - my $p = qr/^[\x{FFFF_FFFF_FFFF_FFFF}]$/; - ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ $p, - "chr(0xFFFF_FFFF_FFFF_FFFF) can match itself in a [class]"); - ok(chr(0xFFFF_FFFF_FFFF_FFFF) =~ $p, # Tests any caching - "chr(0xFFFF_FFFF_FFFF_FFFF) can match itself in a [class] subsequently"); - - # This test is because something was declared as 32 bits, but - # should have been cast to 64; only a problem where - # sizeof(STRLEN) != sizeof(UV) - ok(chr(0xFFFF_FFFF_FFFF_FFFE) !~ qr/\p{Is_32_Bit_Super}/, "chr(0xFFFF_FFFF_FFFF_FFFE) shouldn't match a range ending in 0xFFFF_FFFF"); - } } { # [perl #112530], the code below caused a panic diff --git a/utf8.c b/utf8.c index d87af86..39df019 100644 --- a/utf8.c +++ b/utf8.c @@ -37,7 +37,8 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28"; + "Use of code point 0x%" UVXf " is not allowed; " + "the permissible max is 0x%" UVXf; #define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) @@ -198,11 +199,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); } if ( (flags & UNICODE_WARN_SUPER) || ( UNICODE_IS_ABOVE_31_BIT(uv) @@ -1663,12 +1661,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * where 'uv' is not valid. */ if ( ! (orig_problems & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) - && UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - message = Perl_form(aTHX_ cp_above_legal_max, - uv, MAX_NON_DEPRECATED_CP); - pack_warn = packWARN(WARN_DEPRECATED); + && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, + MAX_NON_DEPRECATED_CP); } } else if (possible_problems & UTF8_GOT_NONCHAR) { @@ -2818,11 +2813,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv1, + MAX_NON_DEPRECATED_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; -- 1.8.3.1