X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/0a5ed81e6617c9229cc1ea042e9a70c3ec63fd65..975d8916e99aadf059b284a824eff315e49a1825:/regexec.c diff --git a/regexec.c b/regexec.c index 48ed8c3..ee961e7 100644 --- a/regexec.c +++ b/regexec.c @@ -218,7 +218,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; I32 p; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPUSH; @@ -328,7 +328,7 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) { UV i; U32 paren; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPOP; @@ -579,7 +579,7 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) span_word |= span_word << 4; /* That reduces the problem to what this function solves */ - return s + _variant_byte_number(span_word); + return s + variant_byte_number(span_word); #endif @@ -657,7 +657,7 @@ S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) masked &= PERL_VARIANTS_WORD_MASK; /* This reduces the problem to that solved by this function */ - s += _variant_byte_number(masked); + s += variant_byte_number(masked); return s; } while (s + PERL_WORDSIZE <= send); @@ -723,7 +723,7 @@ S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) masked |= masked << 1; masked |= masked << 2; masked |= masked << 4; - return s + _variant_byte_number(masked); + return s + variant_byte_number(masked); #endif @@ -859,7 +859,7 @@ Perl_re_intuit_start(pTHX_ RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); @@ -1472,10 +1472,10 @@ Perl_re_intuit_start(pTHX_ const U8* const str = (U8*)STRING(progi->regstclass); /* XXX this value could be pre-computed */ - const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? (reginfo->is_utf8_pat - ? utf8_distance(str + STR_LEN(progi->regstclass), str) - : STR_LEN(progi->regstclass)) + ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) + : (SSize_t)STR_LEN(progi->regstclass)) : 1); char * endpos; char *s; @@ -1787,7 +1787,9 @@ STMT_START { STMT_START { \ while (s < strend) { \ CODE \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) \ + ? UTF8_SAFE_SKIP(s, reginfo->strend) \ + : 1); \ } \ } STMT_END @@ -1801,7 +1803,7 @@ STMT_START { #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\ previous_occurrence_end = s; \ } \ else { \ @@ -1820,12 +1822,13 @@ STMT_START { * of the one we're looking for. Knowing that, we can see right away if the * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we * don't accept the 2nd and succeeding adjacent occurrences */ -#define FBC_CHECK_AND_TRY \ - if ( ( doevery \ - || s != previous_occurrence_end) \ - && (reginfo->intuit || regtry(reginfo, &s))) \ - { \ - goto got_it; \ +#define FBC_CHECK_AND_TRY \ + if ( ( doevery \ + || s != previous_occurrence_end) \ + && ( reginfo->intuit \ + || (s <= reginfo->strend && regtry(reginfo, &s)))) \ + { \ + goto got_it; \ } @@ -1858,7 +1861,7 @@ STMT_START { \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += UTF8SKIP(s); \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ previous_occurrence_end = s; \ } \ else { \ @@ -1972,9 +1975,12 @@ STMT_START { } /* This is the macro to use when we want to see if something that looks like it - * could match, actually does, and if so exits the loop */ -#define REXEC_FBC_TRYIT \ - if ((reginfo->intuit || regtry(reginfo, &s))) \ + * could match, actually does, and if so exits the loop. It needs to be used + * only for bounds checking macros, as it allows for matching beyond the end of + * string (which should be zero length without having to look at the string + * contents) */ +#define REXEC_FBC_TRYIT \ + if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ goto got_it /* The only difference between the BOUND and NBOUND cases is that @@ -2172,17 +2178,72 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFH: if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFHb: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + + /* We know what the first byte of any matched string should be */ U8 first_byte = FLAGS(c); - if (first_byte) { /* We know what the first byte of any matched - string should be */ - REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); - } - else { - REXEC_FBC_CLASS_SCAN(TRUE, - reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); - } + } + break; + + case ANYOFHr: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( inRANGE(NATIVE_UTF8_TO_I8(*s), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFHs: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( strend -s >= FLAGS(c) + && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c)) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFR: + if (utf8_target) { + REXEC_FBC_CLASS_SCAN(TRUE, + ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c)))); + } + else { + REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, + ANYOFRbase(c), ANYOFRdelta(c))); + } + break; + + case ANYOFRb: + if (utf8_target) { + + /* We know what the first byte of any matched string should be */ + U8 first_byte = FLAGS(c); + + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, + withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c))); + } + else { + REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, + ANYOFRbase(c), ANYOFRdelta(c))); } break; @@ -2247,7 +2308,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, | FOLDEQ_S2_FOLDS_SANE; goto do_exactf_utf8; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: if (! utf8_target) { break; } @@ -2279,8 +2340,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * first character. c2 is its fold. This logic will not work for * Unicode semantics and the german sharp ss, which hence should * not be compiled into a node that gets here. */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ /* We know that we have to match at least 'ln' bytes (which is the * same as characters, since not utf8). If we have to match 3 @@ -2355,8 +2416,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* If one of the operands is in utf8, we can't use the simpler folding * above, due to the fact that many different characters can have the * same fold, or portion of a fold, or different- length fold */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ pat_end = pat_string + ln; lnc = is_utf8_pat /* length to match in characters */ ? utf8_length((U8 *) pat_string, (U8 *) pat_end) @@ -2395,7 +2456,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; } break; } @@ -2479,7 +2540,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } /* Didn't match. Try at the next position (if there is one) */ - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2503,7 +2564,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. Everything is a GCB except between CR and @@ -2521,7 +2582,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* And, since this is a bound, it can match after the final * character in the string */ - if ((reginfo->intuit || regtry(reginfo, &s))) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } break; @@ -2531,7 +2594,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2555,7 +2618,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2577,7 +2640,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2588,7 +2653,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2613,7 +2678,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2638,7 +2703,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Here are at the final position in the target string. The SB * value is always true here, so matches, depending on other * constraints */ - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2649,7 +2716,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2683,7 +2750,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } previous = before; before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2708,7 +2775,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } } @@ -2845,7 +2914,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, U8 *bitmap=NULL; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; /* We can't just allocate points here. We need to wrap it in * an SV so it gets freed properly if there is a croak while @@ -3025,7 +3094,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LEAVE; goto got_it; } - s = HOPc(s,1); + if (s < reginfo->strend) { + s = HOPc(s,1); + } DEBUG_TRIE_EXECUTE_r({ Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n"); }); @@ -3226,7 +3297,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, regmatch_info *const reginfo = ®info_buf; regexp_paren_pair *swap = NULL; I32 oldsave; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGEXEC_FLAGS; PERL_UNUSED_ARG(data); @@ -3280,7 +3351,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (!startpos || ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) { - DEBUG_r(Perl_re_printf( aTHX_ + DEBUG_GPOS_r(Perl_re_printf( aTHX_ "fail: ganch-gofs before earliest possible start\n")); return 0; } @@ -3299,8 +3370,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, minlen = prog->minlen; if ((startpos + minlen) > strend || startpos < strbeg) { - DEBUG_r(Perl_re_printf( aTHX_ - "Regex match can't succeed, so not even tried\n")); + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Regex match can't succeed, so not even tried\n")); return 0; } @@ -3544,7 +3615,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, strend); while (s < strend && *s == ch) s += UTF8SKIP(s); } @@ -3912,7 +3983,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) U32 depth = 0; /* used by REGCP_SET */ #endif RXi_GET_DECL(prog,progi); - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGTRY; @@ -4208,7 +4279,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if ( OP(text_node) == EXACT - || OP(text_node) == EXACT_ONLY8 + || OP(text_node) == LEXACT + || OP(text_node) == EXACT_REQ8 + || OP(text_node) == LEXACT_REQ8 || OP(text_node) == EXACTL) { @@ -4217,7 +4290,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, * copy the input to the output, avoiding finding the code point of * that character */ if (!is_utf8_pat) { - assert(OP(text_node) != EXACT_ONLY8); + assert( OP(text_node) != EXACT_REQ8 + && OP(text_node) != LEXACT_REQ8); c2 = c1 = *pat; } else if (utf8_target) { @@ -4225,7 +4299,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, Copy(pat, c2_utf8, UTF8SKIP(pat), U8); utf8_has_been_setup = TRUE; } - else if (OP(text_node) == EXACT_ONLY8) { + else if ( OP(text_node) == EXACT_REQ8 + || OP(text_node) == LEXACT_REQ8) + { return FALSE; /* Can only match UTF-8 target */ } else { @@ -4233,7 +4309,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else { /* an EXACTFish node */ - U8 *pat_end = pat + STR_LEN(text_node); + U8 *pat_end = pat + STR_LENs(text_node); /* An EXACTFL node has at least some characters unfolded, because what * they match is not known until now. So, now is the time to fold @@ -4315,8 +4391,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else if (c1 > 255) { - const unsigned int * remaining_folds; - unsigned int first_fold; + const U32 * remaining_folds; + U32 first_fold; /* Look up what code points (besides c1) fold to c1; e.g., * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ @@ -4398,7 +4474,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, case EXACTFU: c2 = PL_fold_latin1[c1]; break; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: return FALSE; NOT_REACHED; /* NOTREACHED */ @@ -4450,7 +4526,7 @@ STATIC bool S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) { /* returns a boolean indicating if there is a Grapheme Cluster Boundary - * between the inputs. See http://www.unicode.org/reports/tr29/. */ + * between the inputs. See https://www.unicode.org/reports/tr29/. */ PERL_ARGS_ASSERT_ISGCB; @@ -4512,7 +4588,7 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb } while (prev == GCB_Extend); - return prev != GCB_XPG_XX; + return prev != GCB_ExtPict_XX; } default: @@ -4887,7 +4963,7 @@ S_isSB(pTHX_ SB_enum before, const bool utf8_target) { /* returns a boolean indicating if there is a Sentence Boundary Break - * between the inputs. See http://www.unicode.org/reports/tr29/ */ + * between the inputs. See https://www.unicode.org/reports/tr29/ */ U8 * lpos = (U8 *) curpos; bool has_para_sep = FALSE; @@ -5505,18 +5581,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, /* push a new state then goto it */ -#define PUSH_STATE_GOTO(state, node, input, eol) \ +#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_state; /* push a new state with success backtracking, then goto it */ -#define PUSH_YES_STATE_GOTO(state, node, input, eol) \ +#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_yes_state; @@ -5700,6 +5778,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) char *loceol = reginfo->strend; char *pushinput; /* where to continue after a PUSH */ char *pusheol; /* where to stop matching (loceol) after a PUSH */ + U8 *pushsr0; /* save starting pos of script run */ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ bool result = 0; /* return value of S_regmatch */ @@ -5761,7 +5840,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) #endif #ifdef DEBUGGING - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; #endif /* protect against undef(*^R) */ @@ -5836,7 +5915,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* update the startpoint */ st->u.keeper.val = rex->offs[0].start; rex->offs[0].start = locinput - reginfo->strbeg; - PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol); + PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case KEEPS_next_fail: @@ -6212,7 +6292,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) }); if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { - PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol); + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } /* only one choice left - just continue */ @@ -6240,6 +6321,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } #undef ST + case LEXACT_REQ8: + if (! utf8_target) { + sayNO; + } + /* FALLTHROUGH */ + + case LEXACT: + { + char *s; + + s = STRINGl(scan); + ln = STR_LENl(scan); + goto join_short_long_exact; + case EXACTL: /* /abc/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; @@ -6253,16 +6348,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); } goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: if (! utf8_target) { sayNO; } /* FALLTHROUGH */ - case EXACT: { /* /abc/ */ - char *s; + + case EXACT: /* /abc/ */ do_exact: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); + + join_short_long_exact: if (utf8_target != is_utf8_pat) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; @@ -6365,7 +6462,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_array = PL_fold_latin1; goto do_exactf; - case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */ + case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ if (! utf8_target) { sayNO; } @@ -6414,8 +6511,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_utf8_flags = 0; do_exactf: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); if ( utf8_target || is_utf8_pat @@ -6472,9 +6569,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (locinput == reginfo->strbeg) b1 = isWORDCHAR_LC('\n'); else { - b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1, - (U8*)(reginfo->strbeg)), - (U8*)(reginfo->strend)); + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); } b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') @@ -6551,13 +6648,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case TRADITIONAL_BOUND: { bool b1, b2; - b1 = (locinput == reginfo->strbeg) - ? 0 /* isWORDCHAR_L1('\n') */ - : isWORDCHAR_utf8_safe( - reghop3((U8*)locinput, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); + if (locinput == reginfo->strbeg) { + b1 = 0 /* isWORDCHAR_L1('\n') */; + } + else { + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + + b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); + } b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_utf8_safe((U8*)locinput, @@ -6782,8 +6881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS - || ( ANYOF_FLAGS(scan) != 0 - && ANYOF_FLAGS(scan) != (U8) *locinput) + || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { @@ -6792,6 +6890,95 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto increment_locinput; break; + case ANYOFHb: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) != (U8) *locinput + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHr: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHs: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || loceol - locinput < FLAGS(scan) + || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFR: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + + case ANYOFRb: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) != (U8) *locinput + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + /* The argument (FLAGS) to all the POSIX node types is the class number * */ @@ -7013,7 +7200,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; - case NREFFL: /* /\g{name}/il */ + case REFFLN: /* /\g{name}/il */ { /* The capture buffer cases. The ones beginning with N for the named buffers just convert to the equivalent numbered and pretend they were called as the corresponding numbered buffer @@ -7033,28 +7220,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; - case NREFFA: /* /\g{name}/iaa */ + case REFFAN: /* /\g{name}/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFA; utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_nref; - case NREFFU: /* /\g{name}/iu */ + case REFFUN: /* /\g{name}/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFU; utf8_fold_flags = 0; goto do_nref; - case NREFF: /* /\g{name}/i */ + case REFFN: /* /\g{name}/i */ folder = foldEQ; fold_array = PL_fold; type = REFF; utf8_fold_flags = 0; goto do_nref; - case NREF: /* /\g{name}/ */ + case REFN: /* /\g{name}/ */ type = REF; folder = NULL; fold_array = NULL; @@ -7203,7 +7390,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->recurse_locinput[arg]= locinput; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_STACK_r({ Perl_re_exec_indentf( aTHX_ "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", @@ -7222,7 +7409,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* NOTREACHED */ case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ - if (cur_eval && cur_eval->locinput==locinput) { + if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); } else { @@ -7450,7 +7637,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_curpm = PL_reg_curpm; if (logical != 2) { - PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol); + PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, + script_run_begin); /* NOTREACHED */ } } @@ -7550,7 +7738,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ST.prev_eval = cur_eval; cur_eval = st; /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, loceol); + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, + loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7706,7 +7895,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; - case NGROUPP: /* (?()) */ + case GROUPPN: /* (?()) */ /* reg_check_named_buff_matched returns 0 for no match */ sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); break; @@ -7850,7 +8039,8 @@ NULL ST.count = -1; /* this will be updated by WHILEM */ ST.lastloc = NULL; /* this will be updated by WHILEM */ - PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol); + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7898,7 +8088,8 @@ NULL cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -8006,7 +8197,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -8017,7 +8208,8 @@ NULL maxopenparen); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } goto do_whilem_B_max; @@ -8069,7 +8261,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ @@ -8100,7 +8292,7 @@ NULL REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ #undef ST @@ -8122,9 +8314,11 @@ NULL /* Now go into the branch */ if (has_cutgroup) { - PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } else { - PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } NOT_REACHED; /* NOTREACHED */ @@ -8132,7 +8326,8 @@ NULL sv_yes_mark = st->u.mark.mark_name = scan->flags ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) : NULL; - PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CUTGROUP_next_fail: @@ -8209,7 +8404,8 @@ NULL goto curlym_do_B; curlym_do_A: /* execute the A in /A{m,n}B/ */ - PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol); /* match A */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ @@ -8333,7 +8529,8 @@ NULL } } - PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol); /* match B */ + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ @@ -8633,7 +8830,8 @@ NULL curly_try_B_min: CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ @@ -8660,7 +8858,8 @@ NULL } if (ST.c1 == CHRTEST_VOID || could_match) { CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } } @@ -8715,8 +8914,9 @@ NULL SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B, - locinput, loceol); /* match B */ + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ + st->u.eval.prev_eval->u.eval.B, + locinput, loceol, script_run_begin); } if (locinput < reginfo->till) { @@ -8802,7 +9002,8 @@ NULL logical = 0; /* XXX: reset state of logical once it has been saved into ST */ /* execute body of (?...A) */ - PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, ST.end); + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, + ST.end, script_run_begin); NOT_REACHED; /* NOTREACHED */ { @@ -8836,6 +9037,7 @@ NULL /* restore old position except for (?>...) */ locinput = st->locinput; loceol = st->loceol; + script_run_begin = st->sr0; } scan = ST.me + ARG(ST.me); if (scan == ST.me) @@ -8859,7 +9061,8 @@ NULL case PRUNE: /* (*PRUNE) */ if (scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); - PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol); + PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case COMMIT_next_fail: @@ -8889,7 +9092,8 @@ NULL = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); mark_state = st; ST.mark_loc = locinput; - PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol); + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case MARKPOINT_next: @@ -8922,7 +9126,8 @@ NULL /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; - PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol); + PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, + script_run_begin); } else { /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, otherwise do nothing. Meaning we need to scan @@ -8935,7 +9140,8 @@ NULL find ) ) { ST.mark_name = find; - PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol); + PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, + script_run_begin); } cur = cur->u.mark.prev_mark; } @@ -9032,6 +9238,7 @@ NULL depth++; st->locinput = locinput; st->loceol = loceol; + st->sr0 = script_run_begin; newst = st+1; if (newst > SLAB_LAST(PL_regmatch_slab)) newst = S_push_slab(aTHX); @@ -9039,6 +9246,7 @@ NULL locinput = pushinput; loceol = pusheol; + script_run_begin = pushsr0; st = newst; continue; /* NOTREACHED */ @@ -9094,6 +9302,7 @@ NULL if (no_final) { locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; } state_num = st->resume_state + no_final; goto reenter_switch; @@ -9145,6 +9354,7 @@ NULL PL_regmatch_state = st; locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; DEBUG_STATE_pp("pop"); depth--; @@ -9281,6 +9491,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else scan = this_eol; break; + + case LEXACT_REQ8: + if (! utf8_target) { + break; + } + /* FALLTHROUGH */ + + case LEXACT: + { + U8 * string; + Size_t str_len; + + string = (U8 *) STRINGl(p); + str_len = STR_LENl(p); + goto join_short_long_exact; + case EXACTL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { @@ -9288,16 +9514,20 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: if (! utf8_target) { break; } /* FALLTHROUGH */ case EXACT: do_exact: - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + string = (U8 *) STRINGs(p); + str_len = STR_LENs(p); + + join_short_long_exact: + assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1); - c = (U8)*STRING(p); + c = *string; /* Can use a simple find if the pattern char to match on is invariant * under UTF-8, or both target and pattern aren't UTF-8. Note that we @@ -9319,8 +9549,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * string EQ */ while (hardcount < max && scan < this_eol - && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) + && (scan_char_len = UTF8SKIP(scan)) <= str_len + && memEQ(scan, string, scan_char_len)) { scan += scan_char_len; hardcount++; @@ -9330,7 +9560,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple find */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1)); scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); } /* else pattern char is above Latin1, can't possibly match the non-UTF-8 target */ @@ -9354,6 +9584,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } break; + } case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ assert(! reginfo->is_utf8_pat); @@ -9386,7 +9617,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, | FOLDEQ_S2_FOLDS_SANE; goto do_exactf; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: if (! utf8_target) { break; } @@ -9404,7 +9635,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, int c1, c2; U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1); if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, reginfo)) @@ -9412,10 +9643,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ char *tmpeol = loceol; - STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, pat_len, + STRINGs(p), NULL, pat_len, reginfo->is_utf8_pat, utf8_flags)) { scan = tmpeol; @@ -9534,22 +9765,42 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case ANYOFH: if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ - if (ANYOF_FLAGS(p)) { /* If we know the first byte of what - matches, we can avoid calling reginclass - */ - while ( hardcount < max - && scan < this_eol - && (U8) *scan == ANYOF_FLAGS(p) - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, - TRUE)) - { - scan += UTF8SKIP(scan); - hardcount++; - } + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHb: + if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */ + + /* we know the first byte must be the FLAGS field */ + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, + TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; } - else while ( hardcount < max - && scan < this_eol - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + } + break; + + case ANYOFHr: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && inRANGE(NATIVE_UTF8_TO_I8(*scan), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) { scan += UTF8SKIP(scan); hardcount++; @@ -9557,6 +9808,69 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case ANYOFHs: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan + FLAGS(p) < this_eol + && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFR: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; + } + } + break; + + case ANYOFRb: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; + } + } + break; + /* The argument (FLAGS) to all the POSIX node types is the class number */ case NPOSIXL: @@ -9768,7 +10082,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, *startposp = scan; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); regprop(prog, prop, p, reginfo, NULL); @@ -9800,7 +10114,9 @@ STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; - const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n); + const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) + ? 0 + : ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; @@ -9827,7 +10143,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If this character is potentially in the bitmap, check it */ - if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) { + if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if ((flags @@ -10129,6 +10445,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; eval_state->rex = rex; + eval_state->sv = reginfo->sv; if (reginfo->sv) { /* Make $_ available to executed code. */ @@ -10136,6 +10453,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) SAVE_DEFSV; DEFSV_set(reginfo->sv); } + /* will be dec'd by S_cleanup_regmatch_info_aux */ + SvREFCNT_inc_NN(reginfo->sv); if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ @@ -10227,6 +10546,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) } PL_curpm = eval_state->curpm; + SvREFCNT_dec(eval_state->sv); } PL_regmatch_state = aux->old_regmatch_state; @@ -10297,6 +10617,7 @@ S_to_byte_substr(pTHX_ regexp *prog) && !prog->substrs->data[i].substr) { SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); if (! sv_utf8_downgrade(sv, TRUE)) { + SvREFCNT_dec_NN(sv); return FALSE; } if (SvVALID(prog->substrs->data[i].utf8_substr)) { @@ -10320,13 +10641,13 @@ S_to_byte_substr(pTHX_ regexp *prog) #ifndef PERL_IN_XSUB_RE bool -Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) +Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) { /* Temporary helper function for toke.c. Verify that the code point 'cp' * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in * the larger string bounded by 'strbeg' and 'strend'. * - * 'cp' needs to be assigned (if not a future version of the Unicode + * 'cp' needs to be assigned (if not, a future version of the Unicode * Standard could make it something that combines with adjacent characters, * so code using it would then break), and there has to be a GCB break * before and after the character. */ @@ -10336,7 +10657,7 @@ Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, cons GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; const U8 * prev_cp_start; - PERL_ARGS_ASSERT__IS_GRAPHEME; + PERL_ARGS_ASSERT_IS_GRAPHEME; if ( UNLIKELY(UNICODE_IS_SUPER(cp)) || UNLIKELY(UNICODE_IS_NONCHAR(cp)))