X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2abbd513b87245ddb806e6bc4f59945ecb46dced..8de16cf63a494da6b361ce55148e6c06cf18461c:/regexec.c diff --git a/regexec.c b/regexec.c index cafc6b7..91fb3d2 100644 --- a/regexec.c +++ b/regexec.c @@ -92,7 +92,7 @@ static const char utf8_locale_required[] = #ifdef DEBUGGING /* At least one required character in the target string is expressible only in * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required +static const char non_utf8_target_but_utf8_required[] = "Can't match, because target string needs to be in UTF-8\n"; #endif @@ -218,7 +218,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; I32 p; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPUSH; @@ -328,7 +328,7 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) { UV i; U32 paren; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPOP; @@ -422,7 +422,7 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) * Ideally this could be replaced by a just an array of function pointers * to the C library functions that implement the macros this calls. * However, to compile, the precise function signatures are required, and - * these may vary from platform to to platform. To avoid having to figure + * these may vary from platform to platform. To avoid having to figure * out what those all are on each platform, I (khw) am using this method, * which adds an extra layer of function call overhead (unless the C * optimizer strips it away). But we don't particularly care about @@ -496,7 +496,6 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) * rules, ignoring any locale. So use the Unicode function if this class * requires an inversion list, and use the Unicode macro otherwise. 
*/ - dVAR; PERL_ARGS_ASSERT_ISFOO_UTF8_LC; @@ -579,7 +578,7 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) span_word |= span_word << 4; /* That reduces the problem to what this function solves */ - return s + _variant_byte_number(span_word); + return s + variant_byte_number(span_word); #endif @@ -657,7 +656,7 @@ S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) masked &= PERL_VARIANTS_WORD_MASK; /* This reduces the problem to that solved by this function */ - s += _variant_byte_number(masked); + s += variant_byte_number(masked); return s; } while (s + PERL_WORDSIZE <= send); @@ -723,7 +722,7 @@ S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) masked |= masked << 1; masked |= masked << 2; masked |= masked << 4; - return s + _variant_byte_number(masked); + return s + variant_byte_number(masked); #endif @@ -859,7 +858,7 @@ Perl_re_intuit_start(pTHX_ RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = &reginfo_buf; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); @@ -1173,8 +1172,8 @@ Perl_re_intuit_start(pTHX_ /* now look for the 'other' substring if defined */ - if (utf8_target ? prog->substrs->data[other_ix].utf8_substr - : prog->substrs->data[other_ix].substr) + if (prog->substrs->data[other_ix].utf8_substr + || prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ char *last, *last1; @@ -1184,6 +1183,11 @@ Perl_re_intuit_start(pTHX_ do_other_substr: other = &prog->substrs->data[other_ix]; + if (!utf8_target && !other->substr) { + if (!to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } /* if "other" is anchored: * we've previously found a floating substr starting at check_at. 
@@ -1467,10 +1471,10 @@ Perl_re_intuit_start(pTHX_ const U8* const str = (U8*)STRING(progi->regstclass); /* XXX this value could be pre-computed */ - const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? (reginfo->is_utf8_pat - ? utf8_distance(str + STR_LEN(progi->regstclass), str) - : STR_LEN(progi->regstclass)) + ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) + : (SSize_t)STR_LEN(progi->regstclass)) : 1); char * endpos; char *s; @@ -1720,7 +1724,7 @@ STMT_START { } else { \ uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ flags); \ - len = UTF8SKIP(uc); \ + len = UTF8_SAFE_SKIP(uc, uc_end); \ skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ @@ -1782,7 +1786,9 @@ STMT_START { STMT_START { \ while (s < strend) { \ CODE \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) \ + ? UTF8_SAFE_SKIP(s, reginfo->strend) \ + : 1); \ } \ } STMT_END @@ -1796,7 +1802,7 @@ STMT_START { #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\ previous_occurrence_end = s; \ } \ else { \ @@ -1815,12 +1821,13 @@ STMT_START { * of the one we're looking for. Knowing that, we can see right away if the * next occurrence is adjacent to the previous. 
When 'doevery' is FALSE, we * don't accept the 2nd and succeeding adjacent occurrences */ -#define FBC_CHECK_AND_TRY \ - if ( ( doevery \ - || s != previous_occurrence_end) \ - && (reginfo->intuit || regtry(reginfo, &s))) \ - { \ - goto got_it; \ +#define FBC_CHECK_AND_TRY \ + if ( ( doevery \ + || s != previous_occurrence_end) \ + && ( reginfo->intuit \ + || (s <= reginfo->strend && regtry(reginfo, &s)))) \ + { \ + goto got_it; \ } @@ -1839,6 +1846,28 @@ STMT_START { previous_occurrence_end = s; \ } +/* This differs from the above macros in that it is passed a single byte that + * is known to begin the next occurrence of the thing being looked for in 's'. + * It does a memchr to find the next occurrence of 'byte', before trying 'COND' + * at that position. */ +#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ + while (s < strend) { \ + s = (char *) memchr(s, byte, strend -s); \ + if (s == NULL) { \ + s = (char *) strend; \ + break; \ + } \ + \ + if (COND) { \ + FBC_CHECK_AND_TRY \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ + previous_occurrence_end = s; \ + } \ + else { \ + s += UTF8SKIP(s); \ + } \ + } + /* The three macros below are slightly different versions of the same logic. * * The first is for /a and /aa when the target string is UTF-8. 
This can only @@ -1890,7 +1919,8 @@ STMT_START { /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and * TEST_UTF8 is a macro that for the same input code points returns identically - * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an + * end pointer as well) */ #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ @@ -1945,9 +1975,12 @@ STMT_START { } /* This is the macro to use when we want to see if something that looks like it - * could match, actually does, and if so exits the loop */ -#define REXEC_FBC_TRYIT \ - if ((reginfo->intuit || regtry(reginfo, &s))) \ + * could match, actually does, and if so exits the loop. It needs to be used + * only for bounds checking macros, as it allows for matching beyond the end of + * string (which should be zero length without having to look at the string + * contents) */ +#define REXEC_FBC_TRYIT \ + if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ goto got_it /* The only difference between the BOUND and NBOUND cases is that @@ -2065,7 +2098,6 @@ STATIC char * S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, regmatch_info *reginfo) { - dVAR; /* TRUE if x+ need not match at just the 1st pos of run of x's */ const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; @@ -2129,21 +2161,89 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */ - /* UTF-8ness doesn't matter, so use 0 */ + /* UTF-8ness doesn't matter because only matches UTF-8 invariants, so + * use 0 */ REXEC_FBC_FIND_NEXT_SCAN(0, (char *) find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); break; - case NANYOFM: - REXEC_FBC_FIND_NEXT_SCAN(0, + case NANYOFM: /* UTF-8ness does matter because can match UTF-8 variants. 
+ */ + REXEC_FBC_FIND_NEXT_SCAN(utf8_target, (char *) find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c))); break; case ANYOFH: - if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE, + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFHb: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + + /* We know what the first byte of any matched string should be */ + U8 first_byte = FLAGS(c); + + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + break; + + case ANYOFHr: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( inRANGE(NATIVE_UTF8_TO_I8(*s), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFHs: + if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + REXEC_FBC_CLASS_SCAN(TRUE, + ( strend -s >= FLAGS(c) + && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c)) + && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target))); + } + break; + + case ANYOFR: + if (utf8_target) { + REXEC_FBC_CLASS_SCAN(TRUE, + ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c)))); + } + else { + REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, + ANYOFRbase(c), ANYOFRdelta(c))); + } + break; + + case ANYOFRb: + if (utf8_target) { + + /* We know what the first byte of any matched string should be */ + U8 first_byte = FLAGS(c); + + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, + withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c))); + } + else { + REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s, + ANYOFRbase(c), 
ANYOFRdelta(c))); + } break; case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ @@ -2207,7 +2307,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, | FOLDEQ_S2_FOLDS_SANE; goto do_exactf_utf8; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: if (! utf8_target) { break; } @@ -2239,8 +2339,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * first character. c2 is its fold. This logic will not work for * Unicode semantics and the german sharp ss, which hence should * not be compiled into a node that gets here. */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ /* We know that we have to match at least 'ln' bytes (which is the * same as characters, since not utf8). If we have to match 3 @@ -2315,8 +2415,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* If one of the operands is in utf8, we can't use the simpler folding * above, due to the fact that many different characters can have the * same fold, or portion of a fold, or different- length fold */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ pat_end = pat_string + ln; lnc = is_utf8_pat /* length to match in characters */ ? utf8_length((U8 *) pat_string, (U8 *) pat_end) @@ -2355,7 +2455,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; } break; } @@ -2439,7 +2539,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } /* Didn't match. Try at the next position (if there is one) */ - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? 
UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2463,7 +2563,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. Everything is a GCB except between CR and @@ -2481,7 +2581,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* And, since this is a bound, it can match after the final * character in the string */ - if ((reginfo->intuit || regtry(reginfo, &s))) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } break; @@ -2491,7 +2593,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2515,7 +2617,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2537,7 +2639,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2548,7 +2652,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2573,7 +2677,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto got_it; } before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. 
*/ @@ -2598,7 +2702,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Here are at the final position in the target string. The SB * value is always true here, so matches, depending on other * constraints */ - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } @@ -2609,7 +2715,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; if (UNLIKELY(s >= reginfo->strend)) { break; } @@ -2643,7 +2749,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } previous = before; before = after; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, reginfo->strend); } } else { /* Not utf8. */ @@ -2668,7 +2774,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } - if (reginfo->intuit || regtry(reginfo, &s)) { + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { goto got_it; } } @@ -2805,7 +2913,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, U8 *bitmap=NULL; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; /* We can't just allocate points here. We need to wrap it in * an SV so it gets freed properly if there is a croak while @@ -2985,7 +3093,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LEAVE; goto got_it; } - s = HOPc(s,1); + if (s < reginfo->strend) { + s = HOPc(s,1); + } DEBUG_TRIE_EXECUTE_r({ Perl_re_printf( aTHX_ "Pattern failed. 
Looking for new start point...\n"); }); @@ -3186,7 +3296,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, regmatch_info *const reginfo = &reginfo_buf; regexp_paren_pair *swap = NULL; I32 oldsave; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGEXEC_FLAGS; PERL_UNUSED_ARG(data); @@ -3240,7 +3350,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (!startpos || ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) { - DEBUG_r(Perl_re_printf( aTHX_ + DEBUG_GPOS_r(Perl_re_printf( aTHX_ "fail: ganch-gofs before earliest possible start\n")); return 0; } @@ -3259,8 +3369,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, minlen = prog->minlen; if ((startpos + minlen) > strend || startpos < strbeg) { - DEBUG_r(Perl_re_printf( aTHX_ - "Regex match can't succeed, so not even tried\n")); + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Regex match can't succeed, so not even tried\n")); return 0; } @@ -3305,7 +3415,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; prog->offs[0].end = utf8_target - ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + ? 
(char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg : s - strbeg + prog->minlenret; if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, @@ -3500,11 +3610,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, to_utf8_substr(prog); } ch = SvPVX_const(prog->anchored_utf8)[0]; - REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ + REXEC_FBC_SCAN(1, /* 1=>utf8 */ if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, strend); while (s < strend && *s == ch) s += UTF8SKIP(s); } @@ -3872,7 +3982,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) U32 depth = 0; /* used by REGCP_SET */ #endif RXi_GET_DECL(prog,progi); - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGTRY; @@ -4162,13 +4272,14 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, * to/from code points */ bool utf8_has_been_setup = FALSE; - dVAR; U8 *pat = (U8*)STRING(text_node); U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if ( OP(text_node) == EXACT - || OP(text_node) == EXACT_ONLY8 + || OP(text_node) == LEXACT + || OP(text_node) == EXACT_REQ8 + || OP(text_node) == LEXACT_REQ8 || OP(text_node) == EXACTL) { @@ -4177,7 +4288,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, * copy the input to the output, avoiding finding the code point of * that character */ if (!is_utf8_pat) { - assert(OP(text_node) != EXACT_ONLY8); + assert( OP(text_node) != EXACT_REQ8 + && OP(text_node) != LEXACT_REQ8); c2 = c1 = *pat; } else if (utf8_target) { @@ -4185,7 +4297,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, Copy(pat, c2_utf8, UTF8SKIP(pat), U8); utf8_has_been_setup = TRUE; } - else if (OP(text_node) == EXACT_ONLY8) { + else if ( OP(text_node) == EXACT_REQ8 + || OP(text_node) == LEXACT_REQ8) + { return FALSE; /* Can only match UTF-8 target */ 
} else { @@ -4193,7 +4307,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else { /* an EXACTFish node */ - U8 *pat_end = pat + STR_LEN(text_node); + U8 *pat_end = pat + STR_LENs(text_node); /* An EXACTFL node has at least some characters unfolded, because what * they match is not known until now. So, now is the time to fold @@ -4275,8 +4389,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else if (c1 > 255) { - const unsigned int * remaining_folds; - unsigned int first_fold; + const U32 * remaining_folds; + U32 first_fold; /* Look up what code points (besides c1) fold to c1; e.g., * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ @@ -4358,7 +4472,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, case EXACTFU: c2 = PL_fold_latin1[c1]; break; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: return FALSE; NOT_REACHED; /* NOTREACHED */ @@ -4410,7 +4524,7 @@ STATIC bool S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) { /* returns a boolean indicating if there is a Grapheme Cluster Boundary - * between the inputs. See http://www.unicode.org/reports/tr29/. */ + * between the inputs. See https://www.unicode.org/reports/tr29/. 
*/ PERL_ARGS_ASSERT_ISGCB; @@ -4472,7 +4586,7 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb } while (prev == GCB_Extend); - return prev != GCB_XPG_XX; + return prev != GCB_ExtPict_XX; } default: @@ -4490,7 +4604,6 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb STATIC GCB_enum S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; GCB_enum gcb; PERL_ARGS_ASSERT_BACKUP_ONE_GCB; @@ -4768,7 +4881,6 @@ S_isLB(pTHX_ LB_enum before, STATIC LB_enum S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) { - dVAR; LB_enum lb; @@ -4799,7 +4911,6 @@ S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta STATIC LB_enum S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; LB_enum lb; PERL_ARGS_ASSERT_BACKUP_ONE_LB; @@ -4847,7 +4958,7 @@ S_isSB(pTHX_ SB_enum before, const bool utf8_target) { /* returns a boolean indicating if there is a Sentence Boundary Break - * between the inputs. See http://www.unicode.org/reports/tr29/ */ + * between the inputs. 
See https://www.unicode.org/reports/tr29/ */ U8 * lpos = (U8 *) curpos; bool has_para_sep = FALSE; @@ -5036,7 +5147,6 @@ S_isSB(pTHX_ SB_enum before, STATIC SB_enum S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) { - dVAR; SB_enum sb; PERL_ARGS_ASSERT_ADVANCE_ONE_SB; @@ -5070,7 +5180,6 @@ S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta STATIC SB_enum S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; SB_enum sb; PERL_ARGS_ASSERT_BACKUP_ONE_SB; @@ -5307,7 +5416,6 @@ S_advance_one_WB(pTHX_ U8 ** curpos, const bool utf8_target, const bool skip_Extend_Format) { - dVAR; WB_enum wb; PERL_ARGS_ASSERT_ADVANCE_ONE_WB; @@ -5345,7 +5453,6 @@ S_advance_one_WB(pTHX_ U8 ** curpos, STATIC WB_enum S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; WB_enum wb; PERL_ARGS_ASSERT_BACKUP_ONE_WB; @@ -5465,18 +5572,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, /* push a new state then goto it */ -#define PUSH_STATE_GOTO(state, node, input, eol) \ +#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_state; /* push a new state with success backtracking, then goto it */ -#define PUSH_YES_STATE_GOTO(state, node, input, eol) \ +#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_yes_state; @@ -5532,7 +5641,7 @@ the subpattern to be matched possibly multiple times, while B is the entire rest of the pattern. Variable and state names reflect this convention. The states in the main switch are the union of ops and failure/success of -substates associated with with that op. For example, IFMATCH is the op +substates associated with that op. 
For example, IFMATCH is the op that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just successfully matched A and IFMATCH_A_fail is a state saying that we have @@ -5642,7 +5751,6 @@ bounds of our window into the string. STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { - dVAR; const bool utf8_target = reginfo->is_utf8_target; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; @@ -5660,6 +5768,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) char *loceol = reginfo->strend; char *pushinput; /* where to continue after a PUSH */ char *pusheol; /* where to stop matching (loceol) after a PUSH */ + U8 *pushsr0; /* save starting pos of script run */ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ bool result = 0; /* return value of S_regmatch */ @@ -5721,7 +5830,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) #endif #ifdef DEBUGGING - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; #endif /* protect against undef(*^R) */ @@ -5796,7 +5905,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* update the startpoint */ st->u.keeper.val = rex->offs[0].start; rex->offs[0].start = locinput - reginfo->strbeg; - PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol); + PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case KEEPS_next_fail: @@ -6127,6 +6237,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { + /* XXX This assumes the length is well-formed, as + * does the UTF8SKIP below */ uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; @@ -6170,7 +6282,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) }); if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { - 
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol); + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } /* only one choice left - just continue */ @@ -6198,6 +6311,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } #undef ST + case LEXACT_REQ8: + if (! utf8_target) { + sayNO; + } + /* FALLTHROUGH */ + + case LEXACT: + { + char *s; + + s = STRINGl(scan); + ln = STR_LENl(scan); + goto join_short_long_exact; + case EXACTL: /* /abc/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; @@ -6211,16 +6338,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); } goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: if (! utf8_target) { sayNO; } /* FALLTHROUGH */ - case EXACT: { /* /abc/ */ - char *s; + + case EXACT: /* /abc/ */ do_exact: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); + + join_short_long_exact: if (utf8_target != is_utf8_pat) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; @@ -6323,7 +6452,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_array = PL_fold_latin1; goto do_exactf; - case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */ + case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ if (! 
utf8_target) { sayNO; } @@ -6372,8 +6501,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_utf8_flags = 0; do_exactf: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); if ( utf8_target || is_utf8_pat @@ -6430,9 +6559,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (locinput == reginfo->strbeg) b1 = isWORDCHAR_LC('\n'); else { - b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1, - (U8*)(reginfo->strbeg)), - (U8*)(reginfo->strend)); + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); } b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') @@ -6509,13 +6638,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case TRADITIONAL_BOUND: { bool b1, b2; - b1 = (locinput == reginfo->strbeg) - ? 0 /* isWORDCHAR_L1('\n') */ - : isWORDCHAR_utf8_safe( - reghop3((U8*)locinput, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); + if (locinput == reginfo->strbeg) { + b1 = 0 /* isWORDCHAR_L1('\n') */; + } + else { + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + + b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); + } b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_utf8_safe((U8*)locinput, @@ -6740,6 +6871,33 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHb: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) != (U8) *locinput + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHr: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ! 
inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { @@ -6748,6 +6906,69 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto increment_locinput; break; + case ANYOFHs: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || loceol - locinput < FLAGS(scan) + || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFR: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + + case ANYOFRb: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) != (U8) *locinput + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + /* The argument (FLAGS) to all the POSIX node types is the class number * */ @@ -6919,7 +7140,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; } - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); } break; @@ -6969,7 +7190,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; - case NREFFL: /* /\g{name}/il */ + case REFFLN: /* /\g{name}/il */ { /* The capture buffer cases. 
The ones beginning with N for the named buffers just convert to the equivalent numbered and pretend they were called as the corresponding numbered buffer @@ -6989,28 +7210,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; - case NREFFA: /* /\g{name}/iaa */ + case REFFAN: /* /\g{name}/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFA; utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_nref; - case NREFFU: /* /\g{name}/iu */ + case REFFUN: /* /\g{name}/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFU; utf8_fold_flags = 0; goto do_nref; - case NREFF: /* /\g{name}/i */ + case REFFN: /* /\g{name}/i */ folder = foldEQ; fold_array = PL_fold; type = REFF; utf8_fold_flags = 0; goto do_nref; - case NREF: /* /\g{name}/ */ + case REFN: /* /\g{name}/ */ type = REF; folder = NULL; fold_array = NULL; @@ -7159,7 +7380,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->recurse_locinput[arg]= locinput; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_STACK_r({ Perl_re_exec_indentf( aTHX_ "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", @@ -7178,7 +7399,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* NOTREACHED */ case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ - if (cur_eval && cur_eval->locinput==locinput) { + if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); } else { @@ -7406,7 +7627,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_curpm = PL_reg_curpm; if (logical != 2) { - PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol); + PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, + script_run_begin); /* NOTREACHED */ } } @@ -7506,7 +7728,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char 
*startpos, regnode *prog) ST.prev_eval = cur_eval; cur_eval = st; /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, loceol); + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, + loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7662,7 +7885,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; - case NGROUPP: /* (?()) */ + case GROUPPN: /* (?()) */ /* reg_check_named_buff_matched returns 0 for no match */ sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); break; @@ -7806,7 +8029,8 @@ NULL ST.count = -1; /* this will be updated by WHILEM */ ST.lastloc = NULL; /* this will be updated by WHILEM */ - PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol); + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7854,7 +8078,8 @@ NULL cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7962,7 +8187,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7973,7 +8198,8 @@ NULL maxopenparen); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } goto do_whilem_B_max; @@ -8025,7 +8251,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); 
NOT_REACHED; /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ @@ -8056,7 +8282,7 @@ NULL REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ #undef ST @@ -8078,9 +8304,11 @@ NULL /* Now go into the branch */ if (has_cutgroup) { - PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } else { - PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } NOT_REACHED; /* NOTREACHED */ @@ -8088,7 +8316,8 @@ NULL sv_yes_mark = st->u.mark.mark_name = scan->flags ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) : NULL; - PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CUTGROUP_next_fail: @@ -8165,7 +8394,8 @@ NULL goto curlym_do_B; curlym_do_A: /* execute the A in /A{m,n}B/ */ - PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol); /* match A */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ @@ -8235,8 +8465,15 @@ NULL ); if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { if (! 
UTF8_IS_INVARIANT(nextchr) && utf8_target) { - if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + + /* (We can use memEQ and memNE in this file without + * having to worry about one being shorter than the + * other, since the first byte of each gives the + * length of the character) */ + if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput, + reginfo->strend))) { /* simulate B failing */ DEBUG_OPTIMISE_r( @@ -8282,7 +8519,8 @@ NULL } } - PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol); /* match B */ + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ @@ -8498,20 +8736,26 @@ NULL n = (ST.oldloc == locinput) ? 0 : 1; if (ST.c1 == ST.c2) { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, + reginfo->strend); n++; } } else { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos - && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) - && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + while ( locinput <= ST.maxpos + && locinput < loceol + && memNE(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend)) + && memNE(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, reginfo->strend))) { - locinput += UTF8SKIP(locinput); + locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); n++; } } @@ -8576,7 +8820,8 @@ NULL curly_try_B_min: CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, + script_run_begin); 
NOT_REACHED; /* NOTREACHED */ @@ -8589,21 +8834,22 @@ NULL if (ST.c1 != CHRTEST_VOID && could_match) { if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) { - could_match = memEQ(locinput, - ST.c1_utf8, - UTF8SKIP(locinput)) - || memEQ(locinput, - ST.c2_utf8, - UTF8SKIP(locinput)); + could_match = memEQ(locinput, ST.c1_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)) + || memEQ(locinput, ST.c2_utf8, + UTF8_SAFE_SKIP(locinput, + reginfo->strend)); } else { - could_match = UCHARAT(locinput) == ST.c1 - || UCHARAT(locinput) == ST.c2; + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; } } if (ST.c1 == CHRTEST_VOID || could_match) { CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } } @@ -8658,8 +8904,9 @@ NULL SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B, - locinput, loceol); /* match B */ + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ + st->u.eval.prev_eval->u.eval.B, + locinput, loceol, script_run_begin); } if (locinput < reginfo->till) { @@ -8710,8 +8957,8 @@ NULL PERL_UINT_FAST8_T back_count = scan->flags; char * s; - /* Lookbehind ends here */ - ST.end = locinput; + /* Lookbehind can look beyond the current position */ + ST.end = loceol; /* ... 
and starts at the first place in the input that is in * the range of the possible start positions */ @@ -8725,19 +8972,18 @@ NULL /* If the lookbehind doesn't start in the actual string, is a * trivial match failure */ - if (logical) { - logical = 0; - sw = 1 - cBOOL(ST.wanted); - } - else if (ST.wanted) - sayNO; + if (logical) { + logical = 0; + sw = 1 - cBOOL(ST.wanted); + } + else if (ST.wanted) + sayNO; - /* Here, we didn't want it to match, so is actually success - * */ - next = scan + ARG(scan); - if (next == scan) - next = NULL; - break; + /* Here, we didn't want it to match, so is actually success */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; } do_ifmatch: @@ -8746,7 +8992,8 @@ NULL logical = 0; /* XXX: reset state of logical once it has been saved into ST */ /* execute body of (?...A) */ - PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, ST.end); + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start, + ST.end, script_run_begin); NOT_REACHED; /* NOTREACHED */ { @@ -8780,6 +9027,7 @@ NULL /* restore old position except for (?>...) 
*/ locinput = st->locinput; loceol = st->loceol; + script_run_begin = st->sr0; } scan = ST.me + ARG(ST.me); if (scan == ST.me) @@ -8803,7 +9051,8 @@ NULL case PRUNE: /* (*PRUNE) */ if (scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); - PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol); + PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case COMMIT_next_fail: @@ -8833,7 +9082,8 @@ NULL = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); mark_state = st; ST.mark_loc = locinput; - PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol); + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case MARKPOINT_next: @@ -8866,7 +9116,8 @@ NULL /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; - PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol); + PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, + script_run_begin); } else { /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, otherwise do nothing. 
Meaning we need to scan @@ -8879,7 +9130,8 @@ NULL find ) ) { ST.mark_name = find; - PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol); + PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, + script_run_begin); } cur = cur->u.mark.prev_mark; } @@ -8950,8 +9202,10 @@ NULL /* push a new regex state, then continue at scan */ { regmatch_state *newst; + DECLARE_AND_GET_RE_DEBUG_FLAGS; - DEBUG_STACK_r({ + DEBUG_r( /* DEBUG_STACK_r */ + if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { regmatch_state *cur = st; regmatch_state *curyes = yes_state; U32 i; @@ -8970,12 +9224,13 @@ NULL if (curyes == cur) curyes = cur->u.yes.prev_yes_state; } - } else + } else { DEBUG_STATE_pp("push") - ); + }); depth++; st->locinput = locinput; st->loceol = loceol; + st->sr0 = script_run_begin; newst = st+1; if (newst > SLAB_LAST(PL_regmatch_slab)) newst = S_push_slab(aTHX); @@ -8983,6 +9238,7 @@ NULL locinput = pushinput; loceol = pusheol; + script_run_begin = pushsr0; st = newst; continue; /* NOTREACHED */ @@ -9038,6 +9294,7 @@ NULL if (no_final) { locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; } state_num = st->resume_state + no_final; goto reenter_switch; @@ -9089,6 +9346,7 @@ NULL PL_regmatch_state = st; locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; DEBUG_STATE_pp("pop"); depth--; @@ -9154,7 +9412,6 @@ STATIC I32 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH) { - dVAR; char *scan; /* Pointer to current position in target string */ I32 c; char *this_eol = loceol; /* potentially adjusted version. */ @@ -9225,6 +9482,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else scan = this_eol; break; + + case LEXACT_REQ8: + if (! 
utf8_target) { + break; + } + /* FALLTHROUGH */ + + case LEXACT: + { + U8 * string; + Size_t str_len; + + string = (U8 *) STRINGl(p); + str_len = STR_LENl(p); + goto join_short_long_exact; + case EXACTL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { @@ -9232,16 +9505,20 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: if (! utf8_target) { break; } /* FALLTHROUGH */ case EXACT: do_exact: - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + string = (U8 *) STRINGs(p); + str_len = STR_LENs(p); + + join_short_long_exact: + assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1); - c = (U8)*STRING(p); + c = *string; /* Can use a simple find if the pattern char to match on is invariant * under UTF-8, or both target and pattern aren't UTF-8. Note that we @@ -9263,8 +9540,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * string EQ */ while (hardcount < max && scan < this_eol - && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) + && (scan_char_len = UTF8SKIP(scan)) <= str_len + && memEQ(scan, string, scan_char_len)) { scan += scan_char_len; hardcount++; @@ -9274,7 +9551,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple find */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1)); scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); } /* else pattern char is above Latin1, can't possibly match the non-UTF-8 target */ @@ -9298,6 +9575,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } break; + } case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ assert(! 
reginfo->is_utf8_pat); @@ -9330,7 +9608,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, | FOLDEQ_S2_FOLDS_SANE; goto do_exactf; - case EXACTFU_ONLY8: + case EXACTFU_REQ8: if (! utf8_target) { break; } @@ -9348,7 +9626,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, int c1, c2; U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1); if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, reginfo)) @@ -9356,10 +9634,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ char *tmpeol = loceol; - STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, pat_len, + STRINGs(p), NULL, pat_len, reginfo->is_utf8_pat, utf8_flags)) { scan = tmpeol; @@ -9371,19 +9649,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (c1 == c2) { while (scan < this_eol && hardcount < max - && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol))) { - scan += UTF8SKIP(scan); + scan += UTF8SKIP(c1_utf8); hardcount++; } } else { while (scan < this_eol && hardcount < max - && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) - || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, + loceol)) + || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan, + loceol)))) { - scan += UTF8SKIP(scan); + scan += UTF8_SAFE_SKIP(scan, loceol); hardcount++; } } @@ -9474,13 +9755,110 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; case ANYOFH: - if (utf8_target) while ( hardcount < max - && scan < this_eol - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, 
- TRUE)) - { - scan += UTF8SKIP(scan); - hardcount++; + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHb: + if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */ + + /* we know the first byte must be the FLAGS field */ + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, + TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHr: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && inRANGE(NATIVE_UTF8_TO_I8(*scan), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHs: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan + FLAGS(p) < this_eol + && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFR: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; + } + } + break; + + case ANYOFRb: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + && (U8) *scan == 
ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; + } } break; @@ -9695,7 +10073,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, *startposp = scan; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); regprop(prog, prop, p, reginfo, NULL); @@ -9726,8 +10104,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { - dVAR; - const char flags = ANYOF_FLAGS(n); + const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) + ? 0 + : ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; @@ -9754,7 +10133,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If this character is potentially in the bitmap, check it */ - if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) { + if (c < NUM_ANYOF_CODE_POINTS && ! 
inRANGE(OP(n), ANYOFH, ANYOFHb)) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if ((flags @@ -9767,7 +10146,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } else if (flags & ANYOF_LOCALE_FLAGS) { if ( (flags & ANYOFL_FOLD) - && c < sizeof(PL_fold_locale) + && c < 256 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { match = TRUE; @@ -9855,8 +10234,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const && IN_UTF8_CTYPE_LOCALE))) { SV* only_utf8_locale = NULL; - SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE, - 0, &only_utf8_locale, NULL); + SV * const definition = +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) + get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale, NULL); +#else + get_re_gclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale, NULL); +#endif if (definition) { U8 utf8_buffer[2]; U8 * utf8_p; @@ -10056,6 +10441,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; eval_state->rex = rex; + eval_state->sv = reginfo->sv; if (reginfo->sv) { /* Make $_ available to executed code. */ @@ -10063,6 +10449,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) SAVE_DEFSV; DEFSV_set(reginfo->sv); } + /* will be dec'd by S_cleanup_regmatch_info_aux */ + SvREFCNT_inc_NN(reginfo->sv); if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ @@ -10088,7 +10476,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. 
*/ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -10154,6 +10542,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) } PL_curpm = eval_state->curpm; + SvREFCNT_dec(eval_state->sv); } PL_regmatch_state = aux->old_regmatch_state; @@ -10224,6 +10613,7 @@ S_to_byte_substr(pTHX_ regexp *prog) && !prog->substrs->data[i].substr) { SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); if (! sv_utf8_downgrade(sv, TRUE)) { + SvREFCNT_dec_NN(sv); return FALSE; } if (SvVALID(prog->substrs->data[i].utf8_substr)) { @@ -10247,23 +10637,22 @@ S_to_byte_substr(pTHX_ regexp *prog) #ifndef PERL_IN_XSUB_RE bool -Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) +Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) { /* Temporary helper function for toke.c. Verify that the code point 'cp' * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in * the larger string bounded by 'strbeg' and 'strend'. * - * 'cp' needs to be assigned (if not a future version of the Unicode + * 'cp' needs to be assigned (if not, a future version of the Unicode * Standard could make it something that combines with adjacent characters, * so code using it would then break), and there has to be a GCB break * before and after the character. 
*/ - dVAR; GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; const U8 * prev_cp_start; - PERL_ARGS_ASSERT__IS_GRAPHEME; + PERL_ARGS_ASSERT_IS_GRAPHEME; if ( UNLIKELY(UNICODE_IS_SUPER(cp)) || UNLIKELY(UNICODE_IS_NONCHAR(cp))) @@ -10311,7 +10700,7 @@ Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, cons } /* -=head1 Unicode Support +=for apidoc_section Unicode Support =for apidoc isSCRIPT_RUN @@ -10380,7 +10769,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) * characters for at least one language in the Unicode Common Locale Data * Repository [CLDR]. */ - dVAR; /* Things that match /\d/u */ SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT]; @@ -10469,10 +10857,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) /* If is within the range [+0 .. +9] of the script's zero, it also is a * digit in that script. We can skip the rest of this code for this * character. */ - if (UNLIKELY( zero_of_run - && cp >= zero_of_run - && cp - zero_of_run <= 9)) - { + if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) { continue; } @@ -10693,7 +11078,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) * several scripts, and the intersection is not empty. However, if the * character is a decimal digit, it could still mean failure if it is * from the wrong sequence of 10. So, we need to look at if it's a - * digit. We've already handled the 10 decimal digits, and the next + * digit. We've already handled the 10 digits [0-9], and the next * lowest one is this one: */ if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { continue; /* Not a digit; this character is part of the run */ @@ -10705,9 +11090,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) if ( script_of_char >= 0 && (zero_of_char = script_zeros[script_of_char])) { - if ( cp < zero_of_char - || cp > zero_of_char + 9) - { + if (! 
withinCOUNT(cp, zero_of_char, 9)) { continue; /* Not a digit; this character is part of the run */ }