X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/52bcf2657b8c6ee52d2eeb841fc2607db893f58f..11675f52de33c98d9506667c90b3109e45b149b3:/regexec.c diff --git a/regexec.c b/regexec.c index cd0a94f..97ea458 100644 --- a/regexec.c +++ b/regexec.c @@ -92,7 +92,7 @@ static const char utf8_locale_required[] = #ifdef DEBUGGING /* At least one required character in the target string is expressible only in * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required +static const char non_utf8_target_but_utf8_required[] = "Can't match, because target string needs to be in UTF-8\n"; #endif @@ -1173,8 +1173,8 @@ Perl_re_intuit_start(pTHX_ /* now look for the 'other' substring if defined */ - if (utf8_target ? prog->substrs->data[other_ix].utf8_substr - : prog->substrs->data[other_ix].substr) + if (prog->substrs->data[other_ix].utf8_substr + || prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ char *last, *last1; @@ -1184,6 +1184,11 @@ Perl_re_intuit_start(pTHX_ do_other_substr: other = &prog->substrs->data[other_ix]; + if (!utf8_target && !other->substr) { + if (!to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } /* if "other" is anchored: * we've previously found a floating substr starting at check_at. @@ -1720,7 +1725,7 @@ STMT_START { } else { \ uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ flags); \ - len = UTF8SKIP(uc); \ + len = UTF8_SAFE_SKIP(uc, uc_end); \ skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ @@ -1782,7 +1787,9 @@ STMT_START { STMT_START { \ while (s < strend) { \ CODE \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) \ + ? UTF8_SAFE_SKIP(s, reginfo->strend) \ + : 1); \ } \ } STMT_END @@ -1796,7 +1803,7 @@ STMT_START { #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\ previous_occurrence_end = s; \ } \ else { \ @@ -1815,12 +1822,13 @@ STMT_START { * of the one we're looking for. Knowing that, we can see right away if the * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we * don't accept the 2nd and succeeding adjacent occurrences */ -#define FBC_CHECK_AND_TRY \ - if ( ( doevery \ - || s != previous_occurrence_end) \ - && (reginfo->intuit || regtry(reginfo, &s))) \ - { \ - goto got_it; \ +#define FBC_CHECK_AND_TRY \ + if ( ( doevery \ + || s != previous_occurrence_end) \ + && ( reginfo->intuit \ + || (s <= reginfo->strend && regtry(reginfo, &s)))) \ + { \ + goto got_it; \ } @@ -1839,6 +1847,28 @@ STMT_START { previous_occurrence_end = s; \ } +/* This differs from the above macros in that it is passed a single byte that + * is known to begin the next occurrence of the thing being looked for in 's'. + * It does a memchr to find the next occurrence of 'byte', before trying 'COND' + * at that position. */ +#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ + while (s < strend) { \ + s = (char *) memchr(s, byte, strend -s); \ + if (s == NULL) { \ + s = (char *) strend; \ + break; \ + } \ + \ + if (COND) { \ + FBC_CHECK_AND_TRY \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ + previous_occurrence_end = s; \ + } \ + else { \ + s += UTF8SKIP(s); \ + } \ + } + /* The three macros below are slightly different versions of the same logic. * * The first is for /a and /aa when the target string is UTF-8. This can only @@ -1945,9 +1975,12 @@ STMT_START { } /* This is the macro to use when we want to see if something that looks like it - * could match, actually does, and if so exits the loop */ -#define REXEC_FBC_TRYIT \ - if ((reginfo->intuit || regtry(reginfo, &s))) \ + * could match, actually does, and if so exits the loop. It needs to be used + * only for bounds checking macros, as it allows for matching beyond the end of + * string (which should be zero length without having to look at the string + * contents) */ +#define REXEC_FBC_TRYIT \ + if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ goto got_it /* The only difference between the BOUND and NBOUND cases is that @@ -2129,21 +2162,47 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */ - /* UTF-8ness doesn't matter, so use 0 */ + /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so + * use 0 */ REXEC_FBC_FIND_NEXT_SCAN(0, (char *) find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); break; - case NANYOFM: - REXEC_FBC_FIND_NEXT_SCAN(0, + case NANYOFM: /* UTF-8ness does matter because can match UTF-8 variants. + */ + REXEC_FBC_FIND_NEXT_SCAN(utf8_target, (char *) find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); break; case ANYOFH: - if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE, + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFHb: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + + /* We know what the first byte of any matched string should be */ + U8 first_byte = FLAGS(c); + + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + break; + + case ANYOFHr: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( inRANGE((U8) NATIVE_UTF8_TO_I8(*s), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } break; case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ @@ -2355,7 +2414,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; } break; } @@ -2439,7 +2498,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } /* Didn't match. Try at the next position (if there is one) */ - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2463,7 +2522,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. Everything is a GCB except between CR and @@ -2481,7 +2540,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* And, since this is a bound, it can match after the final * character in the string */ - if ((reginfo->intuit || regtry(reginfo, &s))) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } break; @@ -2491,7 +2552,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2515,7 +2576,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2537,7 +2598,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2548,7 +2611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2573,7 +2636,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2598,7 +2661,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Here are at the final position in the target string. The SB * value is always true here, so matches, depending on other * constraints */ - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2609,7 +2674,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2643,7 +2708,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } previous = before; before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2668,7 +2733,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } } @@ -2985,7 +3052,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LEAVE; goto got_it; } - s = HOPc(s,1); + if (s < reginfo->strend) { + s = HOPc(s,1); + } DEBUG_TRIE_EXECUTE_r({ Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n"); }); @@ -3305,7 +3374,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; prog->offs[0].end = utf8_target - ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg : s - strbeg + prog->minlenret; if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, @@ -3504,7 +3573,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, strend); while (s < strend && *s == ch) s += UTF8SKIP(s); } @@ -5465,18 +5534,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, /* push a new state then goto it */ -#define PUSH_STATE_GOTO(state, node, input, eol) \ +#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_state; /* push a new state with success backtracking, then goto it */ -#define PUSH_YES_STATE_GOTO(state, node, input, eol) \ +#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_yes_state; @@ -5660,6 +5731,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) char *loceol = reginfo->strend; char *pushinput; /* where to continue after a PUSH */ char *pusheol; /* where to stop matching (loceol) after a PUSH */ + U8 *pushsr0; /* save starting pos of script run */ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ bool result = 0; /* return value of S_regmatch */ @@ -5796,7 +5868,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* update the startpoint */ st->u.keeper.val = rex->offs[0].start; rex->offs[0].start = locinput - reginfo->strbeg; - PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol); + PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case KEEPS_next_fail: @@ -6172,7 +6245,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) }); if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { - PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol); + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } /* only one choice left - just continue */ @@ -6742,6 +6816,33 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHb: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) != (U8) *locinput + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHr: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { @@ -6921,7 +7022,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; } - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); } break; @@ -6971,7 +7072,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; - case NREFFL: /* /\g{name}/il */ + case REFFLN: /* /\g{name}/il */ { /* The capture buffer cases. The ones beginning with N for the named buffers just convert to the equivalent numbered and pretend they were called as the corresponding numbered buffer @@ -6991,28 +7092,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; - case NREFFA: /* /\g{name}/iaa */ + case REFFAN: /* /\g{name}/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFA; utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_nref; - case NREFFU: /* /\g{name}/iu */ + case REFFUN: /* /\g{name}/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFU; utf8_fold_flags = 0; goto do_nref; - case NREFF: /* /\g{name}/i */ + case REFFN: /* /\g{name}/i */ folder = foldEQ; fold_array = PL_fold; type = REFF; utf8_fold_flags = 0; goto do_nref; - case NREF: /* /\g{name}/ */ + case REFN: /* /\g{name}/ */ type = REF; folder = NULL; fold_array = NULL; @@ -7408,7 +7509,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_curpm = PL_reg_curpm; if (logical != 2) { - PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol); + PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, + script_run_begin); /* NOTREACHED */ } } @@ -7508,7 +7610,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ST.prev_eval = cur_eval; cur_eval = st; /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, loceol); + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, + loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7664,7 +7767,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; - case NGROUPP: /* (?()) */ + case GROUPPN: /* (?()) */ /* reg_check_named_buff_matched returns 0 for no match */ sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); break; @@ -7808,7 +7911,8 @@ NULL ST.count = -1; /* this will be updated by WHILEM */ ST.lastloc = NULL; /* this will be updated by WHILEM */ - PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol); + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7856,7 +7960,8 @@ NULL cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7964,7 +8069,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7975,7 +8080,8 @@ NULL maxopenparen); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } goto do_whilem_B_max; @@ -8027,7 +8133,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ @@ -8058,7 +8164,7 @@ NULL REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ #undef ST @@ -8080,9 +8186,11 @@ NULL /* Now go into the branch */ if (has_cutgroup) { - PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } else { - PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } NOT_REACHED; /* NOTREACHED */ @@ -8090,7 +8198,8 @@ NULL sv_yes_mark = st->u.mark.mark_name = scan->flags ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) : NULL; - PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CUTGROUP_next_fail: @@ -8167,7 +8276,8 @@ NULL goto curlym_do_B; curlym_do_A: /* execute the A in /A{m,n}B/ */ - PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol); /* match A */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ @@ -8242,8 +8352,10 @@ NULL * having to worry about one being shorter than the * other, since the first byte of each gives the * length of the character) */ - if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend))) { /* simulate B failing */ DEBUG_OPTIMISE_r( @@ -8289,7 +8401,8 @@ NULL } } - PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol); /* match B */ + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ @@ -8505,20 +8618,26 @@ NULL n = (ST.oldloc == locinput) ? 0 : 1; if (ST.c1 == ST.c2) { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, + reginfo->strend); n++; } } else { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend)) + && memNE(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); n++; } } @@ -8583,7 +8702,8 @@ NULL curly_try_B_min: CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ @@ -8596,21 +8716,22 @@ NULL if (ST.c1 != CHRTEST_VOID && could_match) { if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) { - could_match = memEQ(locinput, - ST.c1_utf8, - UTF8SKIP(locinput)) - || memEQ(locinput, - ST.c2_utf8, - UTF8SKIP(locinput)); + could_match = memEQ(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + || memEQ(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)); } else { - could_match = UCHARAT(locinput) == ST.c1 - || UCHARAT(locinput) == ST.c2; + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; } } if (ST.c1 == CHRTEST_VOID || could_match) { CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } } @@ -8665,8 +8786,9 @@ NULL SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B, - locinput, loceol); /* match B */ + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ + st->u.eval.prev_eval->u.eval.B, + locinput, loceol, script_run_begin); } if (locinput < reginfo->till) { @@ -8717,8 +8839,8 @@ NULL PERL_UINT_FAST8_T back_count = scan->flags; char * s; - /* Lookbehind ends here */ - ST.end = locinput; + /* Lookbehind can look beyond the current position */ + ST.end = loceol; /* ... and starts at the first place in the input that is in * the range of the possible start positions */ @@ -8752,7 +8874,8 @@ NULL logical = 0; /* XXX: reset state of logical once it has been saved into ST */ /* execute body of (?...A) */ - PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, ST.end); + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, + ST.end, script_run_begin); NOT_REACHED; /* NOTREACHED */ { @@ -8786,6 +8909,7 @@ NULL /* restore old position except for (?>...) */ locinput = st->locinput; loceol = st->loceol; + script_run_begin = st->sr0; } scan = ST.me + ARG(ST.me); if (scan == ST.me) @@ -8809,7 +8933,8 @@ NULL case PRUNE: /* (*PRUNE) */ if (scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); - PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol); + PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case COMMIT_next_fail: @@ -8839,7 +8964,8 @@ NULL = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); mark_state = st; ST.mark_loc = locinput; - PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol); + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case MARKPOINT_next: @@ -8872,7 +8998,8 @@ NULL /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; - PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol); + PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, + script_run_begin); } else { /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, otherwise do nothing. Meaning we need to scan @@ -8885,7 +9012,8 @@ NULL find ) ) { ST.mark_name = find; - PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol); + PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, + script_run_begin); } cur = cur->u.mark.prev_mark; } @@ -8982,6 +9110,7 @@ NULL depth++; st->locinput = locinput; st->loceol = loceol; + st->sr0 = script_run_begin; newst = st+1; if (newst > SLAB_LAST(PL_regmatch_slab)) newst = S_push_slab(aTHX); @@ -8989,6 +9118,7 @@ NULL locinput = pushinput; loceol = pusheol; + script_run_begin = pushsr0; st = newst; continue; /* NOTREACHED */ @@ -9044,6 +9174,7 @@ NULL if (no_final) { locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; } state_num = st->resume_state + no_final; goto reenter_switch; @@ -9095,6 +9226,7 @@ NULL PL_regmatch_state = st; locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; DEBUG_STATE_pp("pop"); depth--; @@ -9377,19 +9509,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == c2) { while (scan < this_eol && hardcount < max - && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol))) { - scan += UTF8SKIP(scan); + scan += UTF8SKIP(c1_utf8); hardcount++; } } else { while (scan < this_eol && hardcount < max - && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) - || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol)) + || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan, + loceol)))) { - scan += UTF8SKIP(scan); + scan += UTF8_SAFE_SKIP(scan, loceol); hardcount++; } } @@ -9480,13 +9615,47 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; case ANYOFH: - if (utf8_target) while ( hardcount < max - && scan < this_eol - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, - TRUE)) - { - scan += UTF8SKIP(scan); - hardcount++; + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHb: + if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */ + + /* we know the first byte must be the FLAGS field */ + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, + TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHr: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) + && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } } break; @@ -9733,7 +9902,9 @@ STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; - const char flags = ANYOF_FLAGS(n); + const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr)) + ? 0 + : ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; @@ -9760,7 +9931,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If this character is potentially in the bitmap, check it */ - if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) { + if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if ((flags @@ -10062,6 +10233,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; eval_state->rex = rex; + eval_state->sv = reginfo->sv; if (reginfo->sv) { /* Make $_ available to executed code. */ @@ -10069,6 +10241,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) SAVE_DEFSV; DEFSV_set(reginfo->sv); } + /* will be dec'd by S_cleanup_regmatch_info_aux */ + SvREFCNT_inc_NN(reginfo->sv); if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ @@ -10160,6 +10334,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) } PL_curpm = eval_state->curpm; + SvREFCNT_dec(eval_state->sv); } PL_regmatch_state = aux->old_regmatch_state; @@ -10230,6 +10405,7 @@ S_to_byte_substr(pTHX_ regexp *prog) && !prog->substrs->data[i].substr) { SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); if (! sv_utf8_downgrade(sv, TRUE)) { + SvREFCNT_dec_NN(sv); return FALSE; } if (SvVALID(prog->substrs->data[i].utf8_substr)) {