X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/565fc1bb88638c2490cdab7a1055007f6b2d577c..45ccb7bbaadd51904a7fe1262254aac938f3e9f1:/regexec.c diff --git a/regexec.c b/regexec.c index f25bce1..a5451b6 100644 --- a/regexec.c +++ b/regexec.c @@ -660,8 +660,12 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } - if (prog->check_offset_min == prog->check_offset_max && - !(prog->extflags & RXf_CANY_SEEN)) { + if (prog->check_offset_min == prog->check_offset_max + && !(prog->extflags & RXf_CANY_SEEN) + && ! multiline) /* /m can cause \n's to match that aren't + accounted for in the string max length. + See [perl #115242] */ + { /* Substring at constant offset from beg-of-str... */ I32 slen; @@ -3658,6 +3662,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) reenter_switch: SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { case BOL: /* /^../ */ @@ -4205,8 +4210,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } /* Neither the target nor the pattern are utf8 */ - if (UCHARAT(s) != nextchr && - UCHARAT(s) != fold_array[nextchr]) + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) { sayNO; } @@ -6326,6 +6332,7 @@ NULL /* this is a point to jump to in order to increment * locinput by one character */ increment_locinput: + assert(!NEXTCHR_IS_EOS); if (utf8_target) { locinput += PL_utf8skip[nextchr]; /* locinput is allowed to go 1 char off the end, but not 2+ */ @@ -6523,21 +6530,24 @@ no_silent: /* - regrepeat - repeatedly match something simple, report how many * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * * startposp - pointer a pointer to the start position. This is updated * to point to the byte following the highest successful * match. 
* p - the regnode to be repeatedly matched against. - * max - maximum number of characters to match. + * max - maximum number of things to match. * depth - (for debugging) backtracking depth. */ STATIC I32 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth) { dVAR; - char *scan; + char *scan; /* Pointer to current position in target string */ I32 c; - char *loceol = PL_regeol; - I32 hardcount = 0; + char *loceol = PL_regeol; /* local version */ + I32 hardcount = 0; /* How many matches so far */ bool utf8_target = PL_reg_match_utf8; UV utf8_flags; #ifndef DEBUGGING @@ -6549,12 +6559,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan = *startposp; if (max == REG_INFTY) max = I32_MAX; - else if (max < loceol - scan) + else if (! utf8_target && scan + max < loceol) loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted 'loceol' down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past 'loceol' to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using 'hardcount' to keep track), and that we + * are still within the bounds of the string (using 'loceol'). A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. 
+ * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure 'scan' doesn't + * go past 'loceol', and for UTF-8 to also use 'hardcount' to make sure the + * count doesn't exceed the maximum permissible */ + switch (OP(p)) { case REG_ANY: if (utf8_target) { - loceol = PL_regeol; while (scan < loceol && hardcount < max && *scan != '\n') { scan += UTF8SKIP(scan); hardcount++; @@ -6566,7 +6599,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case SANY: if (utf8_target) { - loceol = PL_regeol; while (scan < loceol && hardcount < max) { scan += UTF8SKIP(scan); hardcount++; @@ -6575,8 +6607,15 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma else scan = loceol; break; - case CANY: - scan = loceol; + case CANY: /* Move 'scan' forward 'max' bytes, unless goes off end */ + if (utf8_target && scan + max < loceol) { + + /* 'loceol' hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } break; case EXACT: assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1); @@ -6588,6 +6627,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's * true iff it doesn't matter if the argument is in UTF-8 or not */ if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! 
UTF_PATTERN)) { + if (utf8_target && scan + max < loceol) { + /* We didn't adjust 'loceol' because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6595,9 +6639,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma else if (UTF_PATTERN) { if (utf8_target) { STRLEN scan_char_len; - loceol = PL_regeol; - /* When both target and pattern are UTF-8, we have to do s + /* When both target and pattern are UTF-8, we have to do * string EQ */ while (hardcount < max && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol @@ -6627,7 +6670,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma * then look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); - loceol = PL_regeol; while (hardcount < max && scan + 1 < loceol @@ -6667,7 +6709,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) { if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ - char *tmpeol = loceol; + char *tmpeol = PL_regeol; STRLEN pat_len = (UTF_PATTERN) ? 
UTF8SKIP(STRING(p)) : 1; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, @@ -6675,13 +6717,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma cBOOL(UTF_PATTERN), utf8_flags)) { scan = tmpeol; - tmpeol = loceol; + tmpeol = PL_regeol; hardcount++; } } else if (utf8_target) { if (c1 == c2) { - while (hardcount < max + while (scan < loceol + && hardcount < max && memEQ(scan, c1_utf8, UTF8SKIP(scan))) { scan += UTF8SKIP(scan); @@ -6689,7 +6732,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } } else { - while (hardcount < max + while (scan < loceol + && hardcount < max && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) { @@ -6716,7 +6760,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case ANYOF: if (utf8_target) { STRLEN inclasslen; - loceol = PL_regeol; inclasslen = loceol - scan; while (hardcount < max && ((inclasslen = loceol - scan) > 0) @@ -6733,7 +6776,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case ALNUMU: if (utf8_target) { utf8_wordchar: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) @@ -6755,6 +6797,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case ALNUMA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust 'loceol' because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isWORDCHAR_A((U8) *scan)) { scan++; } @@ -6762,7 +6810,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case ALNUML: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6778,7 +6825,6 @@ 
S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_Nwordchar: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) @@ -6801,14 +6847,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case POSIXA: - while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust 'loceol' because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { scan++; } break; case NPOSIXA: if (utf8_target) { - while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + while (scan < loceol && hardcount < max + && ! _generic_isCC_A((U8) *scan, FLAGS(p))) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6819,8 +6874,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NALNUMA: if (utf8_target) { - while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { + while (scan < loceol && hardcount < max + && ! 
isWORDCHAR_A((U8) *scan)) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6832,7 +6890,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NALNUML: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6848,7 +6905,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_space: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || @@ -6874,6 +6930,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case SPACEA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust 'loceol' because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isSPACE_A((U8) *scan)) { scan++; } @@ -6881,7 +6943,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case SPACEL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6897,7 +6958,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_Nspace: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && ! (*scan == ' ' || @@ -6924,8 +6984,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NSPACEA: if (utf8_target) { - while (scan < loceol && ! isSPACE_A((U8) *scan)) { + while (hardcount < max && scan < loceol + && ! 
isSPACE_A((U8) *scan)) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6937,7 +7000,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NSPACEL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6950,7 +7012,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case DIGIT: if (utf8_target) { - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { @@ -6963,6 +7024,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case DIGITA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust 'loceol' because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isDIGIT_A((U8) *scan)) { scan++; } @@ -6970,7 +7037,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case DIGITL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isDIGIT_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6983,7 +7049,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NDIGIT: if (utf8_target) { - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { @@ -6997,8 +7062,10 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NDIGITA: if (utf8_target) { - while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + while (hardcount < max && scan < loceol + && ! 
isDIGIT_A((U8) *scan)) { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -7010,7 +7077,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NDIGITL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isDIGIT_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -7022,11 +7088,25 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case LNBREAK: - Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK"); - assert(0); /* NOTREACHED */ + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to 'loceol' done before the switch statement */ + loceol = PL_regeol; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; case HORIZWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8_safe(scan, loceol))) { @@ -7040,7 +7120,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NHORIZWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_HORIZWS_utf8_safe(scan, loceol)) { @@ -7055,7 +7134,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case VERTWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8_safe(scan, loceol))) { @@ -7070,7 +7148,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NVERTWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_VERTWS_utf8_safe(scan, loceol)) { @@ -7084,8 +7161,27 @@ 
S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; - default: /* Called on something of 0 width. */ - break; /* So match right here or not at all. */ + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + assert(0); /* NOTREACHED */ + } if (hardcount)