PATCH [perl #123562] Regexp-matching "hangs"
authorKarl Williamson <khw@cpan.org>
Wed, 16 Sep 2015 20:34:31 +0000 (14:34 -0600)
committerKarl Williamson <khw@cpan.org>
Wed, 16 Sep 2015 20:48:38 +0000 (14:48 -0600)
The regex engine got into an infinite loop because of the malformation.
It is trying to back-up over a sequence of UTF-8 continuation bytes.
But the character just before the sequence should be a start byte.  If
not, there is a malformation.  I added a test to croak if that isn't the
case so that it doesn't just infinitely loop.  I did this also in the
similar areas of regexec.c.

Comments long ago added to the code suggested that we check for
malformations in the vicinity of the new tests.  But that was never
done.  These new tests should be good enough to prevent looping, anyway.

regexec.c
t/re/pat.t

index c88f467..1aa0c47 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -8761,6 +8761,10 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > lim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
            }
             /* XXX could check well-formedness here */
        }
@@ -8785,6 +8789,10 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > llim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
             }
             /* XXX could check well-formedness here */
         }
@@ -8814,6 +8822,10 @@ S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > lim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
            }
             /* XXX could check well-formedness here */
        }
index 64f6487..3377b19 100644 (file)
@@ -23,7 +23,7 @@ BEGIN {
     skip_all_without_unicode_tables();
 }
 
-plan tests => 774;  # Update this when adding/deleting tests.
+plan tests => 775;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1675,6 +1675,22 @@ EOP
                 "test that we handle things like m/\\888888888/ without infinite loops" );
         }
 
+        {   # Test that we handle some malformed UTF-8 without looping [perl
+            # #123562]
+
+            my $code='
+                BEGIN{require q(test.pl);}
+                use Encode qw(_utf8_on);
+                my $malformed = "a\x80\n";
+                _utf8_on($malformed);
+                watchdog(3);
+                $malformed =~ /(\n\r|\r)$/;
+                print q(No infinite loop here!);
+            ';
+            fresh_perl_like($code, qr/Malformed UTF-8 character/, {},
+                "test that we handle some UTF-8 malformations without looping" );
+        }
+
        {
                # [perl #123843] hits SEGV trying to compile this pattern
                my $match;