#define RExC_mysv2 (pRExC_state->mysv2)
#endif
+ bool seen_unfolded_sharp_s;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+
+/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
+ * EXACTF node, hence was parsed under /di rules. If later in the parse,
+ * something forces the pattern into using /ui rules, the sharp s should be
+ * folded into the sequence 'ss', which takes up more space than previously
+ * calculated. This means that the sizing pass needs to be restarted. (The
+ * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
+ * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
+ * so there is no need to resize [perl #125990]. */
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
#ifdef RE_TRACK_PATTERN_OFFSETS
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
others */
} \
} STMT_END
+/* Change from /d into /u rules, and restart the parse if we've already seen
+ * something whose size would increase as a result, by setting *flagp and
+ * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
+ * we've changed to /u during the parse.
+ *
+ * This does nothing unless the pattern is currently being compiled under /d
+ * rules (DEPENDS_SEMANTICS), and it asserts that it is being called in PASS1.
+ * Note that the expansion contains a 'return' statement, so this may exit the
+ * calling function; 'restart_retval' must therefore be a value suitable for
+ * that function's return type, and the macro can't be used where an early
+ * return would skip needed cleanup. The restart is requested only when
+ * RExC_seen_unfolded_sharp_s is set. */
+#define REQUIRE_UNI_RULES(flagp, restart_retval) \
+ STMT_START { \
+ if (DEPENDS_SEMANTICS) { \
+ assert(PASS1); \
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
+ RExC_uni_semantics = 1; \
+ if (RExC_seen_unfolded_sharp_s) { \
+ *flagp |= RESTART_PASS1; \
+ return restart_retval; \
+ } \
+ } \
+ } STMT_END
+
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
#define namedclass_to_classnum(class) ((int) ((class) / 2))
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+
RExC_uni_semantics = 0;
+ RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_contains_i = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
});
redo_first_pass:
- /* we jump here if we upgrade the pattern to utf8 and have to
- * recompile */
+ /* we jump here if we have to recompile, e.g., from upgrading the pattern
+ * to utf8 */
if ((pm_flags & PMf_USE_RE_EVAL)
/* this second condition covers the non-regex literal case,
if (rx_flags & PMf_FOLD) {
RExC_contains_i = 1;
}
- if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ if ( initial_charset == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 ||RExC_uni_semantics))
+ {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
pRExC_state->num_code_blocks);
}
+ else {
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Need to redo pass 1\n"));
+ }
+
goto redo_first_pass;
}
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
/* Check for proper termination. */
if (paren) {
- /* restore original flags, but keep (?p) */
+ /* restore original flags, but keep (?p) and, if we've changed from /d
+ * rules to /u, keep the /u */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
+ if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+ }
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
* sequence. *node_p * will be set to a generated node returned by this
* function calling S_reg().
*
- * The final possibility, which happens only when the fourth one would
- * otherwise be in effect, is that one of those code points requires the
+ * The final possibility is that it is premature to be calling this function;
+ * that pass1 needs to be restarted. This can happen when this function
+ * changes from /d to /u rules, or when the pattern needs to be upgraded to
+ * UTF-8. The latter occurs only when the fourth possibility would otherwise
+ * be in effect, and is because one of those code points requires the
* pattern to be recompiled as UTF-8. The function returns FALSE, and sets
- * the RESTART_PASS1 and NEED_UTF8 flags in *flagp. When this happens, the
- * caller needs to desist from continuing parsing, and return this information
- * to its caller. This is not set for when there is only one code point, as
- * this can be called as part of an ANYOF node, and they can store
- * above-Latin1 code points without the pattern having to be in UTF-8.
- * XXX
+ * the RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
+ * happens, the caller needs to desist from continuing parsing, and return
+ * this information to its caller. This is not set for when there is only one
+ * code point, as this can be called as part of an ANYOF node, and they can
+ * store above-Latin1 code points without the pattern having to be in UTF-8.
*
* For non-single-quoted regexes, the tokenizer has resolved character and
* sequence names inside \N{...} into their Unicode values, normalizing the
vFAIL("\\N{NAME} must be resolved by the lexer");
}
- RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
+ semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
if (cp_count) {
NOT_REACHED; /*NOTREACHED*/
}
RExC_parse = endbrace;
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
OP(ret) = BOUNDU;
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
NULL);
+ if (*flagp & RESTART_PASS1)
+ return NULL;
/* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
* multi-char folds are allowed. */
if (!ret)
) {
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
/* Here, it wasn't a single code point. Go close
* up this EXACTish node. The switch() prior to
/* See if the character's fold differs between /d and
* /u. This includes the multi-char fold SHARP S to
* 'ss' */
- if (maybe_exactfu
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ RExC_seen_unfolded_sharp_s = 1;
+ maybe_exactfu = FALSE;
+ }
+ else if (maybe_exactfu
&& (PL_fold[ender] != PL_fold_latin1[ender]
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- || ender == LATIN_SMALL_LETTER_SHARP_S
- || (len > 0
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
+ || ( len > 0
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))
#endif
)) {
maybe_exactfu = FALSE;
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
- RExC_uni_semantics = 1; /* The use of this operator implies /u. This
- is required so that the compile time values
- are valid in all runtime cases */
+ REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
+ This is required so that the compile
+ time values are valid in all runtime
+ cases */
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
if (cp_count < 0) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
named */
/* \p means they want Unicode semantics */
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
break;
case 'n': value = '\n'; break;
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
if (value > 255) {
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
/* Ready to process either the single value, or the completed range.