X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2daa8fee180b10307a7045b863759bd98f6fe154..6c6525b86477e0001fa63ee65eb355329aeef26a:/regexec.c diff --git a/regexec.c b/regexec.c index 0fea67f..4275b37 100644 --- a/regexec.c +++ b/regexec.c @@ -138,7 +138,7 @@ /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ - bool throw_away __attribute__unused__; \ + bool throw_away PERL_UNUSED_DECL; \ ENTER; save_re_context(); \ throw_away = CAT2(is_utf8_,class)((const U8*)" "); \ LEAVE; } } STMT_END @@ -1200,8 +1200,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ uscan += len; \ len=0; \ } else { \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ + uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc); \ foldlen -= UNISKIP( uvc ); \ uscan = foldbuf + UNISKIP( uvc ); \ } \ @@ -1552,10 +1552,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * fact that the Latin 1 folds are already determined, and the * only multi-char fold in that range is the sharp-s folding to * 'ss'. Thus, a pattern character can match as little as 1/3 of a - * string character. Adjust lnc accordingly, always matching at - * least 1 */ + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; - lnc = (lnc < expansion) ? 1 : lnc / expansion; + lnc = (lnc + expansion - 1) / expansion; /* As in the non-UTF8 case, if we have to match 3 characters, and * only 2 are left, it's guaranteed to fail, so don't start a @@ -1567,10 +1567,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, e = s; /* Due to minlen logic of intuit() */ } - /* XXX Note that we could recalculate e every so-often through the - * loop to stop earlier, as the worst case expansion above will - * rarely be met, and as we go along we would usually find that e - * moves further to the left. Unclear if worth the expense */ + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ while (s <= e) { char *my_strend= (char *)strend; @@ -1580,7 +1582,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += UTF8SKIP(s); + s += (utf8_target) ? UTF8SKIP(s) : 1; } break; } @@ -4841,8 +4843,9 @@ NULL && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", REG_INFTY - 1); } @@ -4865,8 +4868,8 @@ NULL { PL_reg_flags |= RF_warned; Perl_warner(aTHX_ packWARN(WARN_REGEXP), - "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + "Complex regular subexpression recursion " + "limit (%d) exceeded", REG_INFTY - 1); } cur_curlyx->u.curlyx.count--; @@ -6006,7 +6009,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) /* Here, the string is utf8, and the pattern char is different * in utf8 than not, so can't compare them directly. Outside the - * loop, find find the two utf8 bytes that represent c, and then + * loop, find the two utf8 bytes that represent c, and then * look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); @@ -6033,7 +6036,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case EXACTF: case EXACTFU: - utf8_flags = 0; + utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; /* The comments for the EXACT case above apply as well to these fold * ones */ @@ -6473,20 +6476,39 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) /* -- regclass_swash - prepare the utf8 swash -*/ - +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one + */ SV * Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) { + PERL_ARGS_ASSERT_REGCLASS_SWASH; + return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp)); +} +#endif + +STATIC SV * +S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + /* Returns the swash for the input 'node' in the regex 'prog'. + * If is true, will attempt to create the swash if not already + * done. + * If is non-null, will return the swash initialization string in + * it. + * If is non-null, will return the alternates to the regular swash + * in it + * Tied intimately to how regcomp.c sets up the data structure */ + dVAR; SV *sw = NULL; SV *si = NULL; SV *alt = NULL; + SV* invlist = NULL; + RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_REGCLASS_SWASH; + PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; assert(ANYOF_NONBITMAP(node)); @@ -6497,34 +6519,82 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool SV * const rv = MUTABLE_SV(data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); - SV **a, **b; + bool invlist_has_user_defined_property; - /* See the end of regcomp.c:S_regclass() for - * documentation of these array elements. */ - - si = *ary; - a = SvROK(ary[1]) ? &ary[1] : NULL; - b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL; + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_len(av) >= 3) { + invlist = ary[3]; + invlist_has_user_defined_property = cBOOL(SvUV(ary[4])); + } + else { + invlist = NULL; + invlist_has_user_defined_property = FALSE; + } - if (a) - sw = *a; + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (SvROK(ary[1])) { + sw = ary[1]; + } else if (si && doinit) { - sw = swash_init("utf8", "", si, 1, 0); + + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + FALSE, /* is error if can't find + property */ + invlist, + invlist_has_user_defined_property); (void)av_store(av, 1, sw); } - if (b) - alt = *b; + + /* Element [2] is for any multi-char folds. Note that is a + * fundamentally flawed design, because can't backtrack and try + * again. See [perl #89774] */ + if (SvTYPE(ary[2]) == SVt_PVAV) { + alt = ary[2]; + } } } - if (listsvp) - *listsvp = si; + if (listsvp) { + SV* matches_string = newSVpvn("", 0); + SV** invlistsvp; + + /* Use the swash, if any, which has to have incorporated into it all + * possibilities */ + if ( sw + && SvROK(sw) + && SvTYPE(SvRV(sw)) == SVt_PVHV + && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE))) + { + invlist = *invlistsvp; + } + else if (si && si != &PL_sv_undef) { + + /* If no swash, use the input nitialization string, if available */ + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + if (altsvp) *altsvp = alt; return sw; } -#endif /* - reginclass - determine if a character falls into a character class