/* Doesn't do an assert to verify that is correct */
#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
if (!CAT2(PL_utf8_,class)) { \
- bool throw_away __attribute__unused__; \
+ bool throw_away PERL_UNUSED_DECL; \
ENTER; save_re_context(); \
throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
LEAVE; } } STMT_END
uscan += len; \
len=0; \
} else { \
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
- uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
+ uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \
+ len = UTF8SKIP(uc); \
foldlen -= UNISKIP( uvc ); \
uscan = foldbuf + UNISKIP( uvc ); \
} \
* fact that the Latin 1 folds are already determined, and the
* only multi-char fold in that range is the sharp-s folding to
* 'ss'. Thus, a pattern character can match as little as 1/3 of a
- * string character. Adjust lnc accordingly, always matching at
- * least 1 */
+ * string character. Adjust lnc accordingly, rounding up, so that
+ * if we need to match at least 4+1/3 chars, that really is 5. */
expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
- lnc = (lnc < expansion) ? 1 : lnc / expansion;
+ lnc = (lnc + expansion - 1) / expansion;
/* As in the non-UTF8 case, if we have to match 3 characters, and
* only 2 are left, it's guaranteed to fail, so don't start a
e = s; /* Due to minlen logic of intuit() */
}
- /* XXX Note that we could recalculate e every so-often through the
- * loop to stop earlier, as the worst case expansion above will
- * rarely be met, and as we go along we would usually find that e
- * moves further to the left. Unclear if worth the expense */
+ /* XXX Note that we could recalculate e to stop the loop earlier,
+ * as the worst case expansion above will rarely be met, and as we
+ * go along we would usually find that e moves further to the left.
+ * This would happen only after we reached the point in the loop
+ * where if there were no expansion we should fail. Unclear if
+ * worth the expense */
while (s <= e) {
char *my_strend= (char *)strend;
{
goto got_it;
}
- s += UTF8SKIP(s);
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
}
break;
}
&& !(PL_reg_flags & RF_warned))
{
PL_reg_flags |= RF_warned;
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
- "Complex regular subexpression recursion",
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP),
+ "Complex regular subexpression recursion limit (%d) "
+ "exceeded",
REG_INFTY - 1);
}
{
PL_reg_flags |= RF_warned;
Perl_warner(aTHX_ packWARN(WARN_REGEXP),
- "%s limit (%d) exceeded",
- "Complex regular subexpression recursion",
+ "Complex regular subexpression recursion "
+ "limit (%d) exceeded",
REG_INFTY - 1);
}
cur_curlyx->u.curlyx.count--;
/* Here, the string is utf8, and the pattern char is different
* in utf8 than not, so can't compare them directly. Outside the
- * loop, find find the two utf8 bytes that represent c, and then
+ * loop, find the two utf8 bytes that represent c, and then
* look for those in sequence in the utf8 string */
U8 high = UTF8_TWO_BYTE_HI(c);
U8 low = UTF8_TWO_BYTE_LO(c);
case EXACTF:
case EXACTFU:
- utf8_flags = 0;
+ utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
/* The comments for the EXACT case above apply as well to these fold
* ones */
#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
/*
-- regclass_swash - prepare the utf8 swash
-*/
-
+- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
+create a copy so that changes the caller makes won't change the shared one
+ */
SV *
Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
{
+ PERL_ARGS_ASSERT_REGCLASS_SWASH;
+ return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
+}
+#endif
+
+STATIC SV *
+S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
+{
+ /* Returns the swash for the input 'node' in the regex 'prog'.
+ * If <doinit> is true, will attempt to create the swash if not already
+ * done.
+ * If <listsvp> is non-null, will return the swash initialization string in
+ * it.
+ * If <altsvp> is non-null, will return the alternates to the regular swash
+ * in it
+ * Tied intimately to how regcomp.c sets up the data structure */
+
dVAR;
SV *sw = NULL;
SV *si = NULL;
SV *alt = NULL;
+ SV* invlist = NULL;
+
RXi_GET_DECL(prog,progi);
const struct reg_data * const data = prog ? progi->data : NULL;
- PERL_ARGS_ASSERT_REGCLASS_SWASH;
+ PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
assert(ANYOF_NONBITMAP(node));
SV * const rv = MUTABLE_SV(data->data[n]);
AV * const av = MUTABLE_AV(SvRV(rv));
SV **const ary = AvARRAY(av);
- SV **a, **b;
+ bool invlist_has_user_defined_property;
- /* See the end of regcomp.c:S_regclass() for
- * documentation of these array elements. */
-
- si = *ary;
- a = SvROK(ary[1]) ? &ary[1] : NULL;
- b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
+ si = *ary; /* ary[0] = the string to initialize the swash with */
+
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
+ * that inversion list has any user-defined properties in it. */
+ if (av_len(av) >= 3) {
+ invlist = ary[3];
+ invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
+ }
+ else {
+ invlist = NULL;
+ invlist_has_user_defined_property = FALSE;
+ }
- if (a)
- sw = *a;
+ /* Element [1] is reserved for the set-up swash. If already there,
+ * return it; if not, create it and store it there */
+ if (SvROK(ary[1])) {
+ sw = ary[1];
+ }
else if (si && doinit) {
- sw = swash_init("utf8", "", si, 1, 0);
+
+ sw = _core_swash_init("utf8", /* the utf8 package */
+ "", /* nameless */
+ si,
+ 1, /* binary */
+ 0, /* not from tr/// */
+ FALSE, /* is error if can't find
+ property */
+ invlist,
+ invlist_has_user_defined_property);
(void)av_store(av, 1, sw);
}
- if (b)
- alt = *b;
+
+ /* Element [2] is for any multi-char folds. Note that is a
+ * fundamentally flawed design, because can't backtrack and try
+ * again. See [perl #89774] */
+ if (SvTYPE(ary[2]) == SVt_PVAV) {
+ alt = ary[2];
+ }
}
}
- if (listsvp)
- *listsvp = si;
+ if (listsvp) {
+ SV* matches_string = newSVpvn("", 0);
+ SV** invlistsvp;
+
+ /* Use the swash, if any, which has to have incorporated into it all
+ * possibilities */
+ if ( sw
+ && SvROK(sw)
+ && SvTYPE(SvRV(sw)) == SVt_PVHV
+ && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
+ {
+ invlist = *invlistsvp;
+ }
+ else if (si && si != &PL_sv_undef) {
+
+ /* If no swash, use the input nitialization string, if available */
+ sv_catsv(matches_string, si);
+ }
+
+ /* Add the inversion list to whatever we have. This may have come from
+ * the swash, or from an input parameter */
+ if (invlist) {
+ sv_catsv(matches_string, _invlist_contents(invlist));
+ }
+ *listsvp = matches_string;
+ }
+
if (altsvp)
*altsvp = alt;
return sw;
}
-#endif
/*
- reginclass - determine if a character falls into a character class