From a4b8f4b4b34c2038e6b9fbd77849f6f99e527be0 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 7 Nov 2018 20:20:53 -0700 Subject: [PATCH] regcomp.c: calculate variants instead of assuming worst case When converting a byte pattern to UTF-8, the needed size may increase due to some bytes (the UTF-8 variants) occupying two bytes instead of one under UTF-8. Prior to this commit, the pattern was assumed to contain only variants, and enough memory was allocated for the worst case. This commit actually calculates how much space is needed and allocates only that. There is extra work involved in doing this calculation. But the pattern is parsed per-word. For short strings, it doesn't much matter either way. But for very long strings, it seems to me the consequences of potentially allocating way too much memory outweigh the cost of this extra work. If field experience proves me wrong, then revert this commit. --- regcomp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/regcomp.c b/regcomp.c index 7b3bc69..2f72faf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -6396,7 +6396,8 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, DEBUG_PARSE_r(Perl_re_printf( aTHX_ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - Newx(dst, *plen_p * 2 + 1, U8); + /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ + Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8); d = dst; while (s < *plen_p) { -- 1.8.3.1