From 2ce94a867b15d96bd49eb8807d39df950f3a1087 Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Sun, 3 Dec 2017 16:38:37 +0000 Subject: [PATCH] re_intuit_start(): skip too short variant utf8 pat RT #132187 This function searches in the target string for known fixed substrings of the pattern, either to quickly reject the match, or to find a minimum start point at which to run the full regex engine. If the target string is utf8 and the pattern is non-utf8 but contains chars in the range 0x80-0xff, the fixed substring to be searched for will be upgraded to utf8, which causes its length to grow. This can defeat an early quick rejection test of: "is the known substring longer than the target string", because that check is done before the upgrade. It can also trigger the bug reported in the ticket above: a calculation of the maximum end-point within the target string to find the substring goes wrong, because (endpoint - N1) gets limited to the start point (since N1 is longer than the string length), and so the moral equivalent of ((endpoint - N1) + N2) then disappears off the end of the string. The net effect of this bug is that a few bytes off the end of the string may be read, triggering complaints by ASAN etc., or even a SEGV. It makes no difference to the match (which should fail and does fail), except that it might match more slowly in the unlikely event that the bytes off the end of the string match the tail of the searched-for substring, in which case the full regex engine has to be run to finally reject it. This commit: 1) adds a second length(substr) > length(target string) check at the point it's going to run the FBM substring search; 2) tidies up the code that moves the endpoint back, skipping an expensive utf8 hop-back in more cases. 
--- regexec.c | 17 +++++++++++++---- t/re/re_tests | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/regexec.c b/regexec.c index a19ede9..a571be2 100644 --- a/regexec.c +++ b/regexec.c @@ -907,19 +907,28 @@ Perl_re_intuit_start(pTHX_ && prog->intflags & PREGf_ANCH && prog->check_offset_max != SSize_t_MAX) { - SSize_t len = SvCUR(check) - !!SvTAIL(check); + SSize_t check_len = SvCUR(check) - !!SvTAIL(check); const char * const anchor = (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + SSize_t targ_len = (char*)end_point - anchor; + + if (check_len > targ_len) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Anchored string too short...\n")); + goto fail_finish; + } /* do a bytes rather than chars comparison. It's conservative; * so it skips doing the HOP if the result can't possibly end * up earlier than the old value of end_point. */ - if ((char*)end_point - anchor > prog->check_offset_max) { + assert(anchor + check_len <= (char *)end_point); + if (prog->check_offset_max + check_len < targ_len) { end_point = HOP3lim((U8*)anchor, prog->check_offset_max, - end_point -len) - + len; + end_point - check_len + ) + + check_len; } } diff --git a/t/re/re_tests b/t/re/re_tests index 0bd9b55..9dff78c 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1984,6 +1984,7 @@ AB\s+\x{100} AB \x{100}X y - - /(?x)[a b]/xx \N{SPACE} yS $& # Note a space char here /(?xx)[a b]/x \N{SPACE} n - - /(?-x:[a b])/xx \N{SPACE} yS $& # Note a space char here +^a?bcd\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff ABCDEFGHIJKLMNOPQRSTUVWXYZ n - - # [perl #132187] for valgrind's benefit # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab -- 1.8.3.1