From: Karl Williamson Date: Sun, 17 Mar 2019 16:52:41 +0000 (-0600) Subject: regexec.c: restrict match to substring in regmatch() X-Git-Tag: v5.29.9~18^2~3 X-Git-Url: https://perl5.git.perl.org/perl5.git/commitdiff_plain/255b1a66c426b857aeced6ded35298de6c4ad950 regexec.c: restrict match to substring in regmatch() This begins to add the capability to regmatch() to not look beyond a certain point in the string in order to find a match. But this is currently set to the end of the entire string so it shouldn't affect any current outcomes. --- diff --git a/regexec.c b/regexec.c index 223eaf2..bca2c4c 100644 --- a/regexec.c +++ b/regexec.c @@ -5645,6 +5645,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SSize_t ln = 0; /* len or last; init to avoid compiler warning */ SSize_t endref = 0; /* offset of end of backref when ln is start */ char *locinput = startpos; + char *loceol = reginfo->strend; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ @@ -5809,13 +5810,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case SANY: /* /./s */ - if (NEXTCHR_IS_EOS) + if (NEXTCHR_IS_EOS || locinput >= loceol) sayNO; goto increment_locinput; case REG_ANY: /* /./ */ - if ((NEXTCHR_IS_EOS) || nextchr == '\n') + if ( NEXTCHR_IS_EOS + || locinput >= loceol + || nextchr == '\n') + { sayNO; + } goto increment_locinput; @@ -5825,7 +5830,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { + if ( ! NEXTCHR_IS_EOS + && locinput < loceol + && ! ANYOF_BITMAP_TEST(scan, nextchr)) + { DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", depth, PL_colors[4], PL_colors[5]) @@ -5904,7 +5912,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } if ( trie->bitmap - && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) + && ( NEXTCHR_IS_EOS + || locinput >= loceol + || ! TRIE_BITMAP_TEST(trie, nextchr))) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( @@ -5942,7 +5952,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) shortest accept state and the wordnum of the longest accept state */ - while ( state && uc <= (U8*)(reginfo->strend) ) { + while ( state && uc <= (U8*)(loceol) ) { U32 base = trie->states[ state ].trans.base; UV uvc = 0; U16 charid = 0; @@ -5977,10 +5987,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) }); /* read a char and goto next state */ - if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { + if ( base && (foldlen || uc < (U8*)(loceol))) { I32 offset; REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, - (U8 *) reginfo->strend, uscan, + (U8 *) loceol, uscan, len, uvc, charid, foldlen, foldbuf, uniflags); charcount++; @@ -6214,7 +6224,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * is an invariant, but there are tests in the test suite * dealing with (??{...}) which violate this) */ while (s < e) { - if (l >= reginfo->strend + if ( l >= loceol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) { sayNO; @@ -6238,7 +6248,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* The target is not utf8, the pattern is utf8. */ while (s < e) { - if (l >= reginfo->strend + if ( l >= loceol || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) { sayNO; @@ -6264,7 +6274,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* The target and the pattern have the same utf8ness. */ /* Inline the first character, for speed. */ - if (reginfo->strend - locinput < ln + if ( loceol - locinput < ln || UCHARAT(s) != nextchr || (ln > 1 && memNE(s, locinput, ln))) { @@ -6360,7 +6370,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; - char *e = reginfo->strend; + char *e = loceol; if (! foldEQ_utf8_flags(l, &e, 0, utf8_target, s, 0, ln, is_utf8_pat,fold_utf8_flags)) @@ -6378,7 +6388,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { sayNO; } - if (reginfo->strend - locinput < ln) + if (loceol - locinput < ln) sayNO; if (ln > 1 && ! folder(locinput, s, ln)) sayNO; @@ -6674,7 +6684,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case ANYOFD: /* /[abc]/d */ case ANYOF: /* /[abc]/ */ - if (NEXTCHR_IS_EOS) + if (NEXTCHR_IS_EOS || locinput >= loceol) sayNO; if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP)) @@ -6685,7 +6695,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput++; } else { - if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { sayNO; @@ -6695,14 +6705,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOFM: - if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) { + if ( NEXTCHR_IS_EOS + || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan) + || locinput >= loceol) + { sayNO; } locinput++; /* ANYOFM is always single byte */ break; case NANYOFM: - if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) { + if ( NEXTCHR_IS_EOS + || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan) + || locinput >= loceol) + { sayNO; } goto increment_locinput; @@ -6711,7 +6727,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS - || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { sayNO; @@ -6728,7 +6744,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case POSIXL: /* \w or [:punct:] etc. under /l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (NEXTCHR_IS_EOS) + if (NEXTCHR_IS_EOS || locinput >= loceol) sayNO; /* Use isFOO_lc() for characters within Latin1. (Note that @@ -6773,7 +6789,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case NPOSIXA: /* \W or [:^punct:] etc. under /a */ - if (NEXTCHR_IS_EOS) { + if (NEXTCHR_IS_EOS || locinput >= loceol) { sayNO; } @@ -6792,7 +6808,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * UTF-8, and also from NPOSIXA even in UTF-8 when the current * character is a single byte */ - if (NEXTCHR_IS_EOS) { + if (NEXTCHR_IS_EOS || locinput >= loceol) { sayNO; } @@ -6815,7 +6831,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case POSIXU: /* \w or [:punct:] etc. under /u */ utf8_posix: - if (NEXTCHR_IS_EOS) { + if (NEXTCHR_IS_EOS || locinput >= loceol) { sayNO; } @@ -6896,7 +6912,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ - if (NEXTCHR_IS_EOS) + if (NEXTCHR_IS_EOS || locinput >= loceol) sayNO; if (! utf8_target) { @@ -6905,7 +6921,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput++; /* Match the . or CR */ if (nextchr == '\r' /* And if it was CR, and the next is LF, match the LF */ - && locinput < reginfo->strend + && locinput < loceol && UCHARAT(locinput) == '\n') { locinput++; @@ -6922,7 +6938,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * current character. (There is always a break at the * end-of-input) */ locinput += UTF8SKIP(locinput); - while (locinput < reginfo->strend) { + while (locinput < loceol) { GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, (U8*) reginfo->strend); if (isGCB(prev_gcb, cur_gcb, @@ -7044,11 +7060,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (type != REF /* REF can do byte comparison */ && (utf8_target || type == REFFU || type == REFFL)) { - char * limit = reginfo->strend; + char * limit = loceol; /* This call case insensitively compares the entire buffer * at s, with the current input starting at locinput, but - * not going off the end given by reginfo->strend, and + * not going off the end given by loceol, and * returns in upon success, how much of the * current input was matched */ if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, @@ -7061,13 +7077,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } /* Not utf8: Inline the first character, for speed. */ - if (!NEXTCHR_IS_EOS && - UCHARAT(s) != nextchr && - (type == REF || - UCHARAT(s) != fold_array[nextchr])) + if ( ! NEXTCHR_IS_EOS + && locinput < loceol + && UCHARAT(s) != nextchr + && ( type == REF + || UCHARAT(s) != fold_array[nextchr])) + { sayNO; + } ln = endref - ln; - if (locinput + ln > reginfo->strend) + if (locinput + ln > loceol) sayNO; if (ln > 1 && (type == REF ? memNE(s, locinput, ln) @@ -8313,7 +8332,7 @@ NULL if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) { char *li = locinput; - if (!regrepeat(rex, &li, scan, reginfo->strend, reginfo, 1)) + if (!regrepeat(rex, &li, scan, loceol, reginfo, 1)) sayNO; SET_locinput(li); goto fake_end; @@ -8369,7 +8388,7 @@ NULL char *li = locinput; minmod = 0; if (ST.min && - regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, ST.min) + regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min) < ST.min) sayNO; SET_locinput(li); @@ -8383,7 +8402,7 @@ NULL /* set ST.maxpos to the furthest point along the * string that could possibly match */ if (ST.max == REG_INFTY) { - ST.maxpos = reginfo->strend - 1; + ST.maxpos = loceol - 1; if (utf8_target) while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) ST.maxpos--; @@ -8391,13 +8410,13 @@ NULL else if (utf8_target) { int m = ST.max - ST.min; for (ST.maxpos = locinput; - m >0 && ST.maxpos < reginfo->strend; m--) + m >0 && ST.maxpos < loceol; m--) ST.maxpos += UTF8SKIP(ST.maxpos); } else { ST.maxpos = locinput + ST.max - ST.min; - if (ST.maxpos >= reginfo->strend) - ST.maxpos = reginfo->strend - 1; + if (ST.maxpos >= loceol) + ST.maxpos = loceol - 1; } goto curly_try_B_min_known; @@ -8406,7 +8425,7 @@ NULL /* avoid taking address of locinput, so it can remain * a register var */ char *li = locinput; - ST.count = regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, ST.max); + ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max); if (ST.count < ST.min) sayNO; SET_locinput(li); @@ -8439,7 +8458,7 @@ NULL if (ST.c1 == CHRTEST_VOID) { /* failed -- move forward one */ char *li = locinput; - if (!regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, 1)) { + if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { sayNO; } locinput = li; @@ -8536,7 +8555,7 @@ NULL * locinput matches */ char *li = ST.oldloc; ST.count += n; - if (regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, n) < n) + if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) sayNO; assert(n == REG_INFTY || locinput == li); } @@ -8551,7 +8570,7 @@ NULL curly_try_B_max: /* a successful greedy match: now try to match B */ { - bool could_match = locinput < reginfo->strend; + bool could_match = locinput < loceol; /* If it could work, try it. */ if (ST.c1 != CHRTEST_VOID && could_match) { @@ -8726,7 +8745,7 @@ NULL break; case COMMIT: /* (*COMMIT) */ - reginfo->cutpoint = reginfo->strend; + reginfo->cutpoint = loceol; /* FALLTHROUGH */ case PRUNE: /* (*PRUNE) */ @@ -8837,7 +8856,7 @@ NULL #undef ST case LNBREAK: /* \R */ - if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { + if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) { locinput += n; } else sayNO; @@ -8856,7 +8875,7 @@ NULL locinput += PL_utf8skip[nextchr]; /* locinput is allowed to go 1 char off the end (signifying * EOS), but not 2+ */ - if (locinput > reginfo->strend) + if (locinput > loceol) sayNO; } else