I32 in_lookbehind;
I32 contains_locale;
I32 override_recoding;
+ I32 in_multi_char_class;
struct reg_code_block *code_blocks; /* positions of literal (?{})
within pattern */
int num_code_blocks; /* size of code_blocks[] */
#define RExC_recurse_count (pRExC_state->recurse_count)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define RExC_contains_locale (pRExC_state->contains_locale)
-#define RExC_override_recoding (pRExC_state->override_recoding)
+#define RExC_override_recoding (pRExC_state->override_recoding)
+#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character. Note that this is not the same thing as REGNODE_SIMPLE */
+/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
+ * character. (There needs to be a case: in the switch statement in regexec.c
+ * for any node marked SIMPLE.) Note that this is not the same thing as
+ * REGNODE_SIMPLE */
#define SIMPLE 0x02
-#define SPSTART 0x04 /* Starts with * or +. */
+#define SPSTART 0x04 /* Starts with * or + */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
- |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+ |ANYOF_NON_UTF8_LATIN1_ALL;
/* If any portion of the regex is to operate under locale rules,
* initialization includes it. The reason this isn't done for all regexes
* necessary. */
if (RExC_contains_locale) {
ANYOF_CLASS_SETALL(cl); /* /l uses class */
- cl->flags |= ANYOF_LOCALE;
+ cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
+ && !(and_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD)) {
int i;
if (and_with->flags & ANYOF_INVERT)
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
+ && !(or_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD) ) {
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} else { /* 'or_with' is not inverted */
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
+ && (!(or_with->flags & ANYOF_LOC_FOLD)
+ || (cl->flags & ANYOF_LOC_FOLD)) ) {
int i;
/* OR char bitmap and class bitmap separately */
/* Here, the pattern is not UTF-8. Look for the multi-char folds
* that are all ASCII. As in the above case, EXACTFL and EXACTFA
* nodes can't have multi-char folds to this range (and there are
- * no existing ones to the upper latin1 range). In the EXACTF
+ * no existing ones in the upper latin1 range). In the EXACTF
* case we look also for the sharp s, which can be in the final
* position. Otherwise we can stop looking 1 byte earlier because
* have to find at least two characters for a multi-fold */
const U8 s_masked = 's' & S_or_s_mask;
while (s < upper) {
- int len = is_MULTI_CHAR_FOLD_low_safe(s, s_end);
+ int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
{
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && (!(data->start_class->flags & ANYOF_LOC_FOLD)
|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
)
{
if (compat) {
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
- data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
if (OP(scan) == EXACTFL) {
/* XXX This set is probably no longer necessary, and
* probably wrong as LOCALE now is on in the initial
* state */
- data->start_class->flags |= ANYOF_LOCALE;
+ data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
}
}
else if (flags & SCF_DO_STCLASS_OR) {
- if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
+ if (data->start_class->flags & ANYOF_LOC_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
if (uc < 0x100) {
/* merge the main (r1) and run-time (r2) code blocks into one */
{
- RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
+ RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
struct reg_code_block *new_block, *dst;
RExC_state_t * const r1 = pRExC_state; /* convenient alias */
int i1 = 0, i2 = 0;
if (!r2->num_code_blocks) /* we guessed wrong */
+ {
+ SvREFCNT_dec(qr);
return 1;
+ }
Newx(new_block,
r1->num_code_blocks + r2->num_code_blocks,
I32 minlen = 0;
U32 rx_flags;
SV * VOL pat;
+ SV * VOL code_blocksv = NULL;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+
+ PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
}
#endif
&& RX_ENGINE((REGEXP*)rx)->op_comp)
{
- RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
+ RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
if (ri->num_code_blocks) {
int i;
/* the presence of an embedded qr// with code means
for (i=0; i < ri->num_code_blocks; i++) {
struct reg_code_block *src, *dst;
STRLEN offset = orig_patlen
- + ((struct regexp *)SvANY(rx))->pre_prefix;
+ + ReANY((REGEXP *)rx)->pre_prefix;
assert(n < pRExC_state->num_code_blocks);
src = &ri->code_blocks[i];
dst = &pRExC_state->code_blocks[n];
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_extralen = 0;
RExC_override_recoding = 0;
+ RExC_in_multi_char_class = 0;
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_lastnum=0;
RExC_lastparse=NULL;
);
+ /* reg may croak on us, not giving us a chance to free
+ pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
+ need it to survive as long as the regexp (qr/(?{})/).
+ We must check that code_blocksv is not already set, because we may
+ have longjmped back. */
+ if (pRExC_state->code_blocks && !code_blocksv) {
+ code_blocksv = newSV_type(SVt_PV);
+ SAVEFREESV(code_blocksv);
+ SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
+ SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
+ }
if (reg(pRExC_state, 0, &flags,1) == NULL) {
RExC_precomp = NULL;
- Safefree(pRExC_state->code_blocks);
return(NULL);
}
+ if (code_blocksv)
+ SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
/* Here, finished first pass. Get rid of any added setjmp */
if (used_setjump) {
of zeroing when in debug mode, thus anything assigned has to
happen after that */
rx = (REGEXP*) newSV_type(SVt_REGEXP);
- r = (struct regexp*)SvANY(rx);
+ r = ReANY(rx);
Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
char, regexp_internal);
if ( r == NULL || ri == NULL )
ri->num_code_blocks = pRExC_state->num_code_blocks;
}
else
+ {
+ int n;
+ for (n = 0; n < pRExC_state->num_code_blocks; n++)
+ if (pRExC_state->code_blocks[n].src_regex)
+ SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
SAVEFREEPV(pRExC_state->code_blocks);
+ }
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
- p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
- SvPOK_on(rx);
+ Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
+ r->xpv_len_u.xpvlenu_pv = p;
if (RExC_utf8)
SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
*p++ = '\n';
*p++ = ')';
*p = 0;
- SvCUR_set(rx, p - SvPVX_const(rx));
+ SvCUR_set(rx, p - RX_WRAPPED(rx));
}
r->intflags = 0;
{
AV *retarray = NULL;
SV *ret;
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
SV *ret;
AV *av;
I32 length;
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
AV *av = newAV();
PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
SV * const sv)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
char *s = NULL;
I32 i = 0;
I32 s1, t1;
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
const I32 paren)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
I32 i;
I32 s1, t1;
if (in_char_class && has_multiple_chars) {
ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
}
+
RExC_parse = endbrace + 1;
}
else if (! node_p || ! has_multiple_chars) {
} \
}
-STATIC void
-S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
-{
- /* Adds input 'string' with length 'len' to the ANYOF node's unicode
- * alternate list, pointed to by 'alternate_ptr'. This is an array of
- * the multi-character folds of characters in the node */
- SV *sv;
-
- PERL_ARGS_ASSERT_ADD_ALTERNATE;
-
- if (! *alternate_ptr) {
- *alternate_ptr = newAV();
- }
- sv = newSVpvn_utf8((char*)string, len, TRUE);
- av_push(*alternate_ptr, sv);
- return;
-}
-
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
* number defined in handy.h. */
#define namedclass_to_classnum(class) ((class) / 2)
-/*
- parse a class specification and produce either an ANYOF node that
- matches the pattern or perhaps will be optimized into an EXACTish node
- instead. The node contains a bit map for the first 256 characters, with the
- corresponding bit set if that character is in the list. For characters
- above 255, a range list is used */
-
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
+ /* parse a bracketed class specification. Most of these will produce an ANYOF node;
+ * but something like [a] will produce an EXACT node; [aA], an EXACTFish
+ * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
+ * multi-character folds: it will be rewritten following the paradigm of
+ * this example, where the <multi-fold>s are characters which fold to
+ * multiple character sequences:
+ * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
+ * gets effectively rewritten as:
+ * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
+ * reg() gets called (recursively) on the rewritten version, and this
+ * function will return what it constructs. (Actually the <multi-fold>s
+ * aren't physically removed from the [abcdefghi], it's just that they are
+ * ignored in the recursion by means of a a flag:
+ * <RExC_in_multi_char_class>.)
+ *
+ * ANYOF nodes contain a bit map for the first 256 characters, with the
+ * corresponding bit set if that character is in the list. For characters
+ * above 255, a range list or swash is used. There are extra bits for \w,
+ * etc. in locale ANYOFs, as what these match is not determinable at
+ * compile time */
+
dVAR;
UV nextvalue;
- UV prevvalue = OOB_UNICODE;
+ UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
- UV value = 0;
+ UV value = OOB_UNICODE, save_value = OOB_UNICODE;
regnode *ret;
STRLEN numlen;
IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
- bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
extended beyond the Latin1 range */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
+ AV * multi_char_matches = NULL; /* Code points that fold to more than one
+ character; used under /i */
UV n;
/* Unicode properties are stored in a swash; this holds the current one
* of the target string */
SV* cp_list = NULL;
- /* List of multi-character folds that are matched by this node */
- AV* unicode_alternate = NULL;
#ifdef EBCDIC
/* In a range, counts how many 0-2 of the ends of it came from literals,
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
-
if (!SIZE_ONLY) {
ANYOF_FLAGS(ret) = 0;
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
- RExC_naughty++;
RExC_parse++;
invert = TRUE;
-
- /* We have decided to not allow multi-char folds in inverted character
- * classes, due to the confusion that can happen, especially with
- * classes that are designed for a non-Unicode world: You have the
- * peculiar case that:
- "s s" =~ /^[^\xDF]+$/i => Y
- "ss" =~ /^[^\xDF]+$/i => N
- *
- * See [perl #89750] */
- allow_full_fold = FALSE;
+ RExC_naughty++;
}
if (SIZE_ONLY) {
charclassloop:
namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+ save_value = value;
+ save_prevvalue = prevvalue;
if (!range) {
rangebegin = RExC_parse;
SV* scratch_list = NULL;
/* Include all above-Latin1 non-blanks */
- _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
/* Add them to the running total of posix classes */
- _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
if (! posixes) {
posixes = scratch_list;
}
/* Get the list of all non-ASCII-blanks in Latin 1, and
* add them to the running total */
- _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+ _invlist_subtract(PL_Latin1, PL_PosixBlank,
+ &scratch_list);
_invlist_union(posixes, scratch_list, &posixes);
SvREFCNT_dec(scratch_list);
}
RExC_uni_semantics = 1;
}
- /* Ready to process either the single value, or the completed range */
- if (!SIZE_ONLY) {
+ /* Ready to process either the single value, or the completed range.
+ * For single-valued non-inverted ranges, we consider the possibility
+ * of multi-char folds. (We made a conscious decision to not do this
+ * for the other cases because it can often lead to non-intuitive
+ * results. For example, you have the peculiar case that:
+ * "s s" =~ /^[^\xDF]+$/i => Y
+ * "ss" =~ /^[^\xDF]+$/i => N
+ *
+ * See [perl #89750] */
+ if (FOLD && ! invert && value == prevvalue) {
+ if (value == LATIN_SMALL_LETTER_SHARP_S
+ || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
+ value)))
+ {
+ /* Here <value> is indeed a multi-char fold. Get what it is */
+
+ U8 foldbuf[UTF8_MAXBYTES_CASE];
+ STRLEN foldlen;
+
+ UV folded = _to_uni_fold_flags(
+ value,
+ foldbuf,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* Here, <folded> should be the first character of the
+ * multi-char fold of <value>, with <foldbuf> containing the
+ * whole thing. But, if this fold is not allowed (because of
+ * the flags), <fold> will be the same as <value>, and should
+ * be processed like any other character, so skip the special
+ * handling */
+ if (folded != value) {
+
+ /* Skip if we are recursed, currently parsing the class
+ * again. Otherwise add this character to the list of
+ * multi-char folds. */
+ if (! RExC_in_multi_char_class) {
+ AV** this_array_ptr;
+ AV* this_array;
+ STRLEN cp_count = utf8_length(foldbuf,
+ foldbuf + foldlen);
+ SV* multi_fold = sv_2mortal(newSVpvn("", 0));
+
+ Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
+
+
+ if (! multi_char_matches) {
+ multi_char_matches = newAV();
+ }
+
+ /* <multi_char_matches> is actually an array of arrays.
+ * There will be one or two top-level elements: [2],
+ * and/or [3]. The [2] element is an array, each
+ * element thereof is a character which folds to two
+ * characters; likewise for [3]. (Unicode guarantees a
+ * maximum of 3 characters in any fold.) When we
+ * rewrite the character class below, we will do so
+ * such that the longest folds are written first, so
+ * that it prefers the longest matching strings first.
+ * This is done even if it turns out that any
+ * quantifier is non-greedy, out of programmer
+ * laziness. Tom Christiansen has agreed that this is
+ * ok. This makes the test for the ligature 'ffi' come
+ * before the test for 'ff' */
+ if (av_exists(multi_char_matches, cp_count)) {
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
+ this_array = *this_array_ptr;
+ }
+ else {
+ this_array = newAV();
+ av_store(multi_char_matches, cp_count,
+ (SV*) this_array);
+ }
+ av_push(this_array, multi_fold);
+ }
+
+ /* This element should not be processed further in this
+ * class */
+ element_count--;
+ value = save_value;
+ prevvalue = save_prevvalue;
+ continue;
+ }
+ }
+ }
+
+ /* Deal with this element of the class */
+ if (! SIZE_ONLY) {
#ifndef EBCDIC
cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
#else
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
+ /* If anything in the class expands to more than one character, we have to
+ * deal with them by building up a substitute parse string, and recursively
+ * calling reg() on it, instead of proceeding */
+ if (multi_char_matches) {
+ SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
+ I32 cp_count;
+ STRLEN len;
+ char *save_end = RExC_end;
+ char *save_parse = RExC_parse;
+ bool first_time = TRUE; /* First multi-char occurrence doesn't get
+ a "|" */
+ I32 reg_flags;
+
+ assert(! invert);
+#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
+ because too confusing */
+ if (invert) {
+ sv_catpv(substitute_parse, "(?:");
+ }
+#endif
+
+ /* Look at the longest folds first */
+ for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
+
+ if (av_exists(multi_char_matches, cp_count)) {
+ AV** this_array_ptr;
+ SV* this_sequence;
+
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
+ while ((this_sequence = av_pop(*this_array_ptr)) !=
+ &PL_sv_undef)
+ {
+ if (! first_time) {
+ sv_catpv(substitute_parse, "|");
+ }
+ first_time = FALSE;
+
+ sv_catpv(substitute_parse, SvPVX(this_sequence));
+ }
+ }
+ }
+
+ /* If the character class contains anything else besides these
+ * multi-character folds, have to include it in recursive parsing */
+ if (element_count) {
+ sv_catpv(substitute_parse, "|[");
+ sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
+ sv_catpv(substitute_parse, "]");
+ }
+
+ sv_catpv(substitute_parse, ")");
+#if 0
+ if (invert) {
+ /* This is a way to get the parse to skip forward a whole named
+ * sequence instead of matching the 2nd character when it fails the
+ * first */
+ sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
+ }
+#endif
+
+ RExC_parse = SvPV(substitute_parse, len);
+ RExC_end = RExC_parse + len;
+ RExC_in_multi_char_class = 1;
+ RExC_emit = (regnode *)orig_emit;
+
+ ret = reg(pRExC_state, 1, ®_flags, depth+1);
+
+ *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
+
+ RExC_parse = save_parse;
+ RExC_end = save_end;
+ RExC_in_multi_char_class = 0;
+ SvREFCNT_dec(multi_char_matches);
+ return ret;
+ }
+
/* If the character class contains only a single element, it may be
* optimizable into another node type which is smaller and runs faster.
* Check if this is the case for this class */
* to force that */
if (! PL_utf8_tofold) {
U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
/* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
assert(PL_utf8_tofold); /* Verify that worked */
}
PL_utf8_foldclosures =
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- UV f;
+ SV** listp;
if (j < 256) {
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
/* Certain Latin1 characters have matches outside
- * Latin1, or are multi-character. To get here, 'j' is
- * one of those characters. None of these matches is
- * valid for ASCII characters under /aa, which is why
- * the 'if' just above excludes those. The matches
- * fall into three categories:
- * 1) They are singly folded-to or -from an above 255
- * character, e.g., LATIN SMALL LETTER Y WITH
- * DIAERESIS and LATIN CAPITAL LETTER Y WITH
- * DIAERESIS;
- * 2) They are part of a multi-char fold with another
- * latin1 character; only LATIN SMALL LETTER
- * SHARP S => "ss" fits this;
- * 3) They are part of a multi-char fold with a
- * character outside of Latin1, such as various
- * ligatures.
- * We aren't dealing fully with multi-char folds, except
- * we do deal with the pattern containing a character
- * that has a multi-char fold (not so much the inverse).
- * For types 1) and 3), the matches only happen when the
- * target string is utf8; that's not true for 2), and we
- * set a flag for it.
- *
- * The code below adds the single fold closures for 'j'
- * to the inversion list. */
+ * Latin1. To get here, <j> is one of those
+ * characters. None of these matches is valid for
+ * ASCII characters under /aa, which is why the 'if'
+ * just above excludes those. These matches only
+ * happen when the target string is utf8. The code
+ * below adds the single fold closures for <j> to the
+ * inversion list. */
switch (j) {
case 'k':
case 'K':
case LATIN_SMALL_LETTER_SHARP_S:
cp_list = add_cp_to_invlist(cp_list,
LATIN_CAPITAL_LETTER_SHARP_S);
-
- /* Under /a, /d, and /u, this can match the two
- * chars "ss" */
- if (! ASCII_FOLD_RESTRICTED) {
- add_alternate(&unicode_alternate,
- (U8 *) "ss", 2);
-
- /* And under /u or /a, it can match even if
- * the target is not utf8 */
- if (AT_LEAST_UNI_SEMANTICS) {
- ANYOF_FLAGS(ret) |=
- ANYOF_NONBITMAP_NON_UTF8;
- }
- }
break;
case 'F': case 'f':
case 'I': case 'i':
* express, so they can't match unless the
* target string is in UTF-8, so no action here
* is necessary, as regexec.c properly handles
- * the general case for UTF-8 matching */
+ * the general case for UTF-8 matching and
+ * multi-char folds */
break;
default:
/* Use deprecated warning to increase the
}
/* Here is an above Latin1 character. We don't have the rules
- * hard-coded for it. First, get its fold */
- f = _to_uni_fold_flags(j, foldbuf, &foldlen,
- ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
- | ((LOC)
- ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
-
- if (foldlen > (STRLEN)UNISKIP(f)) {
-
- /* Any multicharacter foldings (disallowed in lookbehind
- * patterns) require the following transform: [ABCDEF] ->
- * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
- * folds into "rst", all other characters fold to single
- * characters. We save away these multicharacter foldings,
- * to be later saved as part of the additional "s" data. */
- if (! RExC_in_lookbehind) {
- U8* loc = foldbuf;
- U8* e = foldbuf + foldlen;
-
- /* If any of the folded characters of this are in the
- * Latin1 range, tell the regex engine that this can
- * match a non-utf8 target string. */
- while (loc < e) {
- if (UTF8_IS_INVARIANT(*loc)
- || UTF8_IS_DOWNGRADEABLE_START(*loc))
- {
- ANYOF_FLAGS(ret)
- |= ANYOF_NONBITMAP_NON_UTF8;
- break;
- }
- loc += UTF8SKIP(loc);
+ * hard-coded for it. First, get its fold. This is the simple
+ * fold, as the multi-character folds have been handled earlier
+ * and separated out */
+ _to_uni_fold_flags(j, foldbuf, &foldlen,
+ ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+
+ /* Single character fold of above Latin1. Add everything in
+ * its fold closure to the list that this node should match.
+ * The fold closures data structure is a hash with the keys
+ * being the UTF-8 of every character that is folded to, like
+ * 'k', and the values each an array of all code points that
+ * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
+ * Multi-character folds are not included */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) foldbuf, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_len(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ if (c_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
}
+ c = SvUV(*c_p);
- add_alternate(&unicode_alternate, foldbuf, foldlen);
- }
- }
- else {
- /* Single character fold of above Latin1. Add everything
- * in its fold closure to the list that this node should
- * match */
- SV** listp;
-
- /* The fold closures data structure is a hash with the keys
- * being every character that is folded to, like 'k', and
- * the values each an array of everything that folds to its
- * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_len(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non-;
- * /l doesn't allow them between above and below
- * 256 */
- if ((ASCII_FOLD_RESTRICTED
- && (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
- continue;
- }
+ /* /aa doesn't allow folds between ASCII and non-; /l
+ * doesn't allow them between above and below 256 */
+ if ((ASCII_FOLD_RESTRICTED
+ && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
+ {
+ continue;
+ }
- /* Folds involving non-ascii Latin1 characters
- * under /d are added to a separate list */
- if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
- {
- cp_list = add_cp_to_invlist(cp_list, c);
- }
- else {
- depends_list = add_cp_to_invlist(depends_list, c);
- }
- }
- }
- }
+ /* Folds involving non-ascii Latin1 characters
+ * under /d are added to a separate list */
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
+ cp_list = add_cp_to_invlist(cp_list, c);
+ }
+ else {
+ depends_list = add_cp_to_invlist(depends_list, c);
+ }
+ }
+ }
}
}
SvREFCNT_dec(fold_intersection);
* folded until runtime */
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
- * at compile time. Besides not inverting folded locale now, we can't invert
- * if there are things such as \w, which aren't known until runtime */
+ * at compile time. Besides not inverting folded locale now, we can't
+ * invert if there are things such as \w, which aren't known until runtime
+ * */
if (invert
&& ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
&& ! depends_list
- && ! unicode_alternate
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
_invlist_invert(cp_list);
* until runtime; set the run-time fold flag for these. (We don't have to
* worry about properties folding, as that is taken care of by the swash
* fetching) */
- if (FOLD && (LOC || unicode_alternate))
+ if (FOLD && LOC)
{
- ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
/* Some character classes are equivalent to other nodes. Such nodes take
* node types they could possibly match using _invlistEQ(). */
if (cp_list
- && ! unicode_alternate
&& ! invert
&& ! depends_list
&& ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
}
if (! cp_list
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
- && ! unicode_alternate)
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
SvREFCNT_dec(listsv);
- SvREFCNT_dec(unicode_alternate);
}
else {
/* av[0] stores the character class description in its textual form:
* av[1] if NULL, is a placeholder to later contain the swash computed
* from av[0]. But if no further computation need be done, the
* swash is stored there now.
- * av[2] stores the multicharacter foldings, used later in
- * regexec.c:S_reginclass().
- * av[3] stores the cp_list inversion list for use in addition or
+ * av[2] stores the cp_list inversion list for use in addition or
* instead of av[0]; used only if av[1] is NULL
- * av[4] is set if any component of the class is from a user-defined
+ * av[3] is set if any component of the class is from a user-defined
* property; used only if av[1] is NULL */
AV * const av = newAV();
SV *rv;
else {
av_store(av, 1, NULL);
if (cp_list) {
- av_store(av, 3, cp_list);
- av_store(av, 4, newSVuv(has_user_defined_property));
+ av_store(av, 2, cp_list);
+ av_store(av, 3, newSVuv(has_user_defined_property));
}
}
- /* Store any computed multi-char folds only if we are allowing
- * them */
- if (allow_full_fold) {
- av_store(av, 2, MUTABLE_SV(unicode_alternate));
- if (unicode_alternate) { /* This node is variable length */
- OP(ret) = ANYOFV;
- }
- }
- else {
- av_store(av, 2, NULL);
- }
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, 1, "s");
RExC_rxi->data->data[n] = (void*)rv;
if (flags & ANYOF_LOCALE)
sv_catpvs(sv, "{loc}");
- if (flags & ANYOF_LOC_NONBITMAP_FOLD)
+ if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
if (ANYOF_NONBITMAP(o)) {
SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
+ SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
bool byte_output = FALSE; /* If something in the bitmap has been
output */
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
dVAR;
- struct regexp *const prog = (struct regexp *)SvANY(r);
+ struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_RE_INTUIT_STRING;
Perl_pregfree2(pTHX_ REGEXP *rx)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_PREGFREE2;
} else {
CALLREGFREE_PVT(rx); /* free the private data */
SvREFCNT_dec(RXp_PAREN_NAMES(r));
+ Safefree(r->xpv_len_u.xpvlenu_pv);
}
if (r->substrs) {
SvREFCNT_dec(r->anchored_substr);
#endif
Safefree(r->offs);
SvREFCNT_dec(r->qr_anoncv);
+ rx->sv_u.svu_rx = 0;
}
/* reg_temp_copy()
Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
{
struct regexp *ret;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
+ const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
if (!ret_x)
ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
- ret = (struct regexp *)SvANY(ret_x);
+ else {
+ SvOK_off((SV *)ret_x);
+ if (islv) {
+ /* For PVLVs, SvANY points to the xpvlv body while sv_u points
+ to the regexp. (For SVt_REGEXPs, sv_upgrade has already
+ made both spots point to the same regexp body.) */
+ REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
+ assert(!SvPVX(ret_x));
+ ret_x->sv_u.svu_rx = temp->sv_any;
+ temp->sv_any = NULL;
+ SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
+ SvREFCNT_dec(temp);
+ /* SvCUR still resides in the xpvlv struct, so the regexp copy-
+ ing below will not set it. */
+ SvCUR_set(ret_x, SvCUR(rx));
+ }
+ }
+ /* This ensures that SvTHINKFIRST(sv) is true, and hence that
+ sv_force_normal(sv) is called. */
+ SvFAKE_on(ret_x);
+ ret = ReANY(ret_x);
- (void)ReREFCNT_inc(rx);
- /* We can take advantage of the existing "copied buffer" mechanism in SVs
- by pointing directly at the buffer, but flagging that the allocated
- space in the copy is zero. As we've just done a struct copy, it's now
- a case of zero-ing that, rather than copying the current length. */
- SvPV_set(ret_x, RX_WRAPPED(rx));
- SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
+ SvFLAGS(ret_x) |= SvUTF8(rx);
+ /* We share the same string buffer as the original regexp, on which we
+ hold a reference count, incremented when mother_re is set below.
+ The string pointer is copied here, being part of the regexp struct.
+ */
memcpy(&(ret->xpv_cur), &(r->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
- SvLEN_set(ret_x, 0);
- SvSTASH_set(ret_x, NULL);
- SvMAGIC_set(ret_x, NULL);
if (r->offs) {
const I32 npar = r->nparens+1;
Newx(ret->offs, npar, regexp_paren_pair);
#ifdef PERL_OLD_COPY_ON_WRITE
ret->saved_copy = NULL;
#endif
- ret->mother_re = rx;
+ ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
SvREFCNT_inc_void(ret->qr_anoncv);
return ret_x;
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
{
dVAR;
I32 npar;
- const struct regexp *r = (const struct regexp *)SvANY(sstr);
- struct regexp *ret = (struct regexp *)SvANY(dstr);
+ const struct regexp *r = ReANY(sstr);
+ struct regexp *ret = ReANY(dstr);
PERL_ARGS_ASSERT_RE_DUP_GUTS;
ret->saved_copy = NULL;
#endif
- if (ret->mother_re) {
- if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
- /* Our storage points directly to our mother regexp, but that's
+ /* Whether mother_re be set or no, we need to copy the string. We
+ cannot refrain from copying it when the storage points directly to
+ our mother regexp, because that's
1: a buffer in a different thread
2: something we no longer hold a reference on
so we need to copy it locally. */
- /* Note we need to use SvCUR(), rather than
- SvLEN(), on our mother_re, because it, in
- turn, may well be pointing to its own mother_re. */
- SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
- SvCUR(ret->mother_re)+1));
- SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
- }
- ret->mother_re = NULL;
- }
+ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
+ ret->mother_re = NULL;
ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
regexp_internal *reti;
int len;
RXi_GET_DECL(r,ri);