#define RExC_mysv2 (pRExC_state->mysv2)
#endif
+ bool seen_d_op;
bool strict;
bool study_started;
bool in_script_run;
#define RExC_parse (pRExC_state->parse)
#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
+ under /d from /u ? */
#ifdef RE_TRACK_PATTERN_OFFSETS
if (DEPENDS_SEMANTICS) { \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
+ if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
+ /* No need to restart the parse if we haven't seen \
+ * anything that differs between /u and /d, and no need \
+ * to restart immediately if we're going to reparse \
+ * anyway to count parens */ \
*flagp |= RESTART_PARSE; \
return restart_retval; \
+ } \
} \
} STMT_END
#define REQUIRE_BRANCHJ(flagp, restart_retval) \
STMT_START { \
RExC_use_BRANCHJ = 1; \
- *flagp |= RESTART_PARSE; \
- return restart_retval; \
+ if (LIKELY(RExC_total_parens >= 0)) { \
+ /* RExC_use_BRANCHJ has already been recorded above.  Only \
+ * force an immediate restart when RExC_total_parens >= 0; \
+ * otherwise there is no need to restart the parse \
+ * immediately, as we're going to reparse anyway to count \
+ * parens.  When restarting, RESTART_PARSE is added to \
+ * '*flagp' and the enclosing function returns \
+ * 'restart_retval'. */ \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
+ } \
} STMT_END
#define REQUIRE_PARENS_PASS \
if (RExC_total_parens == 0) RExC_total_parens = -1; \
} STMT_END
-/* Executes a return statement with the value 'X', if 'flags' contains any of
- * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
- * flags */
-#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
+/* This is used to return failure (zero) early from the calling function if
+ * various flags in 'flags' are set. Two flags always cause a return:
+ * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
+ * additional flags that should cause a return; 0 if none. If the return will
+ * be done, '*flagp' is first set to be all of the flags that caused the
+ * return; note that this overwrites, rather than adds to, any previous
+ * contents of '*flagp'. 'flags' and 'extra' are each evaluated more than
+ * once, so they should be free of side effects. */
+#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
STMT_START { \
if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
*(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
- return X; \
+ return 0; \
} \
} STMT_END
-#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
- RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
-
-#define RETURN_X_ON_RESTART(X, flags,flagp) \
- RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
-
-
-#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
- if (*(flagp) & (RESTART_PARSE|(extra))) return 0
-
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
#define RETURN_FAIL_ON_RESTART(flags,flagp) \
- RETURN_X_ON_RESTART(0, flags,flagp)
+ RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
+/* Returns failure (0) from the calling function if '*flagp' has RESTART_PARSE
+ * set.  The expansion is wrapped in STMT_START/STMT_END so that it is a
+ * single statement and cannot capture a following 'else' at the call site.
+ * Callers terminate it with a semicolon, as before. */
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
+ STMT_START { \
+ if (MUST_RESTART(*(flagp))) return 0; \
+ } STMT_END
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
(regnode_charclass *) scan);
break;
+ case NANYOFM:
case ANYOFM:
{
SV* cp_list = get_ANYOFM_contents(scan);
if (flags & SCF_DO_STCLASS_OR) {
- ssc_union(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_union(data->start_class, cp_list, invert);
}
else if (flags & SCF_DO_STCLASS_AND) {
- ssc_intersection(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_intersection(data->start_class, cp_list, invert);
}
SvREFCNT_dec_NN(cp_list);
RExC_close_parens = NULL;
RExC_paren_names = NULL;
RExC_size = 0;
+ RExC_seen_d_op = FALSE;
#ifdef DEBUGGING
RExC_paren_name_list = NULL;
#endif
goto redo_parse;
}
- /* In a stable state, as here, this must be true */
- assert(RExC_size = RExC_emit + 1);
-
/* Here, we have successfully parsed and generated the pattern's program
* for the regex engine. We are ready to finish things up and look for
* optimizations. */
SvREFCNT_dec_NN(substitute_parse);
if (! *node_p) {
- RETURN_X_ON_RESTART(FALSE, flags, flagp);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
(UV) flags);
}
FALSE, /* don't silence non-portable warnings. */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
if (ret == 0) {
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, NEED_UTF8);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
}
RExC_seen |= REG_LOOKBEHIND_SEEN;
op = BOUND + charset;
- if (op == BOUNDL) {
+ if (op == BOUND) {
+ RExC_seen_d_op = TRUE;
+ }
+ else if (op == BOUNDL) {
RExC_contains_locale = 1;
}
else if (op == POSIXL) {
RExC_contains_locale = 1;
}
+ else if (op == POSIXD) {
+ RExC_seen_d_op = TRUE;
+ }
join_posix_op_known:
non-portables */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
? REFFL
: REFF),
num);
+ if (OP(REGNODE_p(ret)) == REFF) {
+ RExC_seen_d_op = TRUE;
+ }
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
if (! FOLD) { /* The simple case, just append the literal */
not_fold_common:
- if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
+ }
+ else {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
added_len = (char *) new_s - s;
s = (char *) new_s;
}
- else {
- *(s++) = (char) ender;
- }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
* character, and wait until runtime to fold it */
goto not_fold_common;
}
- else /* A regular FOLD code point */
- if (! UTF)
+ else /* regular fold; see if actually is in a fold */
+ if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
+ || (ender > 255
+ && ! _invlist_contains_cp(PL_utf8_foldable, ender)))
{
- /* Here, are folding and are not UTF-8 encoded; therefore
- * the character must be in the range 0-255, and is not /l.
- * (Not /l because we already handled these under /l in
- * is_PROBLEMATIC_LOCALE_FOLD_cp) */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
-
- /* Start a new node for this non-folding character if
- * previous ones in the node were folded */
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
+ /* Here, folding, but the character isn't in a fold.
+ *
+ * Start a new node if previous characters in the node were
+ * folded */
+ if (len && node_type != EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
+
+ /* Here, continuing a node with non-folded characters. Add
+ * this one */
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
*(s++) = (char) ender;
}
- else { /* Here, does participate in some fold */
-
- /* if this is the first character in the node, change
- * its type to folding. Otherwise, if this is the
- * first folding character in the node, close up the
- * existing node, so can start a new node with this
- * one. */
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
+ else {
+ s = (char *) uvchr_to_utf8((U8 *) s, ender);
+ added_len = UVCHR_SKIP(ender);
+ }
+ }
+ else { /* Here, does participate in some fold */
+
+ /* If this is the first character in the node, change its
+ * type to folding. Otherwise, if this is the first
+ * folding character in the node, close up the existing
+ * node, so can start a new node with this one. */
+ if (! len) {
+ node_type = compute_EXACTish(pRExC_state);
+ }
+ else if (node_type == EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
+
+ if (UTF) { /* For UTF-8, we add the folded value */
+ if (UVCHR_IS_INVARIANT(ender)) {
+ *(s)++ = (U8) toFOLD(ender);
}
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
+ else {
+ ender = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ s += added_len;
}
+ }
+ else {
- /* See if the character's fold differs between /d and
- * /u. On non-ancient Unicode versions, this includes
- * the multi-char fold SHARP S to 'ss' */
+ /* Here is non-UTF8; we don't normally store the folded
+ * value. First, see if the character's fold differs
+ * between /d and /u. */
+ if (PL_fold[ender] != PL_fold_latin1[ender]) {
+ maybe_exactfu = FALSE;
+ }
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ /* On non-ancient Unicode versions, this includes the
+ * multi-char fold SHARP S to 'ss' */
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- if (node_type == EXACTFU) {
- *(s++) = 's';
+ else if (UNLIKELY( ender == LATIN_SMALL_LETTER_SHARP_S
+ || ( len
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))))
+ {
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
+ if (node_type == EXACTFU) {
+ /* See comments for join_exact() as to why we
+ * fold this non-UTF at compile time */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ *(s++) = 's';
+
+ /* Let the code below add in the extra 's' */
+ ender = 's';
+ added_len = 2;
+ }
}
- else if (! RExC_uni_semantics) {
+ else {
maybe_exactfu = FALSE;
}
}
- else if ( len
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
- {
- maybe_exactfu = FALSE;
- }
- else
#endif
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
- }
-
/* Even when folding, we store just the input
* character, as we have an array that finds its fold
* quickly */
*(s++) = (char) ender;
}
- }
- else { /* FOLD, and UTF */
- /* Unlike the non-fold case, we do actually have to
- * calculate the fold in pass 1. This is for two reasons,
- * the folded length may be longer than the unfolded, and
- * we have to calculate how many EXACTish nodes it will
- * take; and we may run out of room in a node in the middle
- * of a potential multi-char fold, and have to back off
- * accordingly. */
-
- if (isASCII_uni(ender)) {
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) ender;
- }
- else { /* Is in a fold */
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) toFOLD(ender);
- }
- }
- else { /* Not ASCII */
- STRLEN foldlen;
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! _invlist_contains_cp(PL_utf8_foldable, ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- s = (char *) uvchr_to_utf8((U8 *) s, ender);
- added_len = UVCHR_SKIP(ender);
- }
- else {
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- ender = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- s += foldlen;
- added_len = foldlen;
- }
- }
} /* End of adding current character to the node */
len += added_len;
OP(REGNODE_p(ret)) = EXACTFLU8;
}
}
+ else if (node_type == EXACTF) {
+ RExC_seen_d_op = TRUE;
+ }
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- &current,
- NULL))
+ &current))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
TRUE, /* silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- &current,
- NULL
- ))
+ &current))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
they're valid on this machine */
FALSE, /* similarly, no need for strict */
FALSE, /* Require return to be an ANYOF */
- NULL,
NULL
);
}
STATIC void
-S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
{
- /* If the final parameter is NULL, output the elements of the array given
- * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
- * pushed onto it, (creating if necessary) */
+ /* Output, as REGEXP warnings, the messages queued in the array
+ * 'posix_warnings', shifting each one off the array as it is handled.
+ * If TO_OUTPUT_WARNINGS(RExC_parse) is false, nothing is output and the
+ * array is left untouched.  When REGEXP warnings are fatal (ckDEAD), the
+ * array is cleared and the current message is mortalized before the
+ * warning is raised, so that neither is leaked. */
SV * msg;
- const bool first_is_fatal = ! return_posix_warnings
- && ckDEAD(packWARN(WARN_REGEXP));
+ const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
- PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+ PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
+
+ if (! TO_OUTPUT_WARNINGS(RExC_parse)) { /* Not outputting now; keep the queue as is */
+ return;
+ }
while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
- if (return_posix_warnings) {
- if (! *return_posix_warnings) { /* mortalize to not leak if
- warnings are fatal */
- *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
- }
- av_push(*return_posix_warnings, msg);
- }
- else {
- if (first_is_fatal) { /* Avoid leaking this */
- av_undef(posix_warnings); /* This isn't necessary if the
- array is mortal, but is a
- fail-safe */
- (void) sv_2mortal(msg);
- if (ckDEAD(packWARN(WARN_REGEXP))) {
- PREPARE_TO_DIE;
- }
- }
- if (TO_OUTPUT_WARNINGS(RExC_parse)) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s",
- SvPVX(msg));
- }
- SvREFCNT_dec_NN(msg);
+ if (first_is_fatal) { /* Avoid leaking 'msg' and whatever is still
+ queued when the warning is fatal */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ PREPARE_TO_DIE;
}
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
}
- if (! return_posix_warnings) {
- UPDATE_WARNINGS_LOC(RExC_parse);
- }
+ UPDATE_WARNINGS_LOC(RExC_parse);
}
STATIC AV *
const bool strict,
bool optimizable, /* ? Allow a non-ANYOF return
node */
- SV** ret_invlist, /* Return an inversion list, not a node */
- AV** return_posix_warnings
+ SV** ret_invlist /* Return an inversion list, not a node */
)
{
/* parse a bracketed class specification. Most of these will produce an
bool warn_super = ALWAYS_WARN_SUPER;
- const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
- case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
char *not_posix_region_end = RExC_parse - 1;
AV* posix_warnings = NULL;
- const bool do_posix_warnings = return_posix_warnings || ckWARN(WARN_REGEXP);
+ const bool do_posix_warnings = ckWARN(WARN_REGEXP);
U8 op = END; /* The returned node-type, initialized to an impossible
one. */
U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
PERL_UNUSED_ARG(depth);
#endif
+
+ /* If the caller wants an inversion list returned, we can't optimize to
+ * something else. */
+ if (ret_invlist) {
+ optimizable = FALSE;
+ }
+
DEBUG_PARSE("clas");
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
{
/* Warnings about posix class issues are considered tentative until
* we are far enough along in the parse that we can no longer
- * change our mind, at which point we either output them or add
- * them, if it has so specified, to what gets returned to the
- * caller. This is done each time through the loop so that a later
- * class won't zap them before they have been dealt with. */
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ * change our mind, at which point we output them. This is done
+ * each time through the loop so that a later class won't zap them
+ * before they have been dealt with. */
+ output_posix_warnings(pRExC_state, posix_warnings);
}
if (RExC_parse >= stop_ptr) {
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
-
if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ output_posix_warnings(pRExC_state, posix_warnings);
}
/* If anything in the class expands to more than one character, we have to
|| (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
use_anyofd = TRUE;
+ RExC_seen_d_op = TRUE;
optimizable = FALSE;
}
-
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
if (optimizable) {
int posix_class = -1; /* Illegal value */
- const char * cur_parse= RExC_parse;
U8 ANYOFM_mask = 0xFF;
U32 anode_arg = 0;
UV start, end;
* usage, is optimizable into ANYOFM, and can benefit from the
* speed up. We can only do this on UTF-8 invariant bytes,
* because the variance would throw this off. */
- if ( op == END
- && invlist_highest(cp_list) <=
+ if (op == END) {
+ PERL_UINT_FAST8_T inverted = 0;
#ifdef EBCDIC
- 0xFF
+ const PERL_UINT_FAST8_T max_permissible = 0xFF;
#else
- 0x7F
+ const PERL_UINT_FAST8_T max_permissible = 0x7F;
#endif
- ) {
+ if (invlist_highest(cp_list) > max_permissible) {
+ _invlist_invert(cp_list);
+ inverted = 1;
+ }
+
+ if (invlist_highest(cp_list) <= max_permissible) {
Size_t cp_count = 0;
bool first_time = TRUE;
unsigned int lowest_cp = 0xFF;
U8 bits_differing = 0;
- /* Only needed on EBCDIC, as there, variants and non- are
- * mixed together. Could #ifdef it out on ASCII, but
- * probably the compiler will optimize it out */
+ /* Only needed on EBCDIC, as there, variants and non- are mixed
+ * together. Could #ifdef it out on ASCII, but probably the
+ * compiler will optimize it out */
bool has_variant = FALSE;
- /* Go through the bytes and find the bit positions that
- * differ */
+ /* Go through the bytes and find the bit positions that differ */
invlist_iterinit(cp_list);
while (invlist_iternext(cp_list, &start, &end)) {
unsigned int i = start;
if ( ! has_variant
&& cp_count == 1U << PL_bitcount[bits_differing])
{
- assert(cp_count > 1);
- op = ANYOFM;
+ assert(inverted || cp_count > 1);
+ op = ANYOFM + inverted; /* 'inverted' is 0 or 1; presumably NANYOFM is
+ ANYOFM + 1, so this selects the negated op
+ when the list was inverted above -- confirm
+ against the regnode definitions */
/* We need to make the bits that differ be 0's */
ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS
*flagp |= HASWIDTH|SIMPLE;
}
}
+ if (inverted) {
+ _invlist_invert(cp_list);
+ }
+ }
}
}
if (op != END) {
- RExC_parse = (char *)orig_parse;
- RExC_emit = orig_emit;
-
if (regarglen[op]) {
ret = reganode(pRExC_state, op, anode_arg);
} else {
ret = reg_node(pRExC_state, op);
}
-
- RExC_parse = (char *)cur_parse;
+ /* Associate the new node with the pattern text it came from: starting
+ * at 'orig_parse' (given relative to RExC_start) and extending to the
+ * current parse position */
+ Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+ RExC_parse - orig_parse);
if (PL_regkind[op] == EXACT) {
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n) {
- /* Returns an inversion list of all the code points matched by the ANYOFM
- * node 'n' */
+ /* Returns an inversion list of all the code points matched by the
+ * ANYOFM/NANYOFM node 'n' */
SV * cp_list = _new_invlist(-1);
const U8 lowest = (U8) ARG(n);
}
}
+ if (OP(n) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
return cp_list;
}
SV * cp_list = get_ANYOFM_contents(o);
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ if (OP(o) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
+
put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);