X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/09091399717abb9634c88f60ff8b2384a3297c16..582cb20e4de9cf2d47abf705ff2908664756c5f0:/regcomp.c diff --git a/regcomp.c b/regcomp.c index ea98177..dedcd00 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,8 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2001, Larry Wall + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -108,7 +109,7 @@ #endif typedef struct RExC_state_t { - U16 flags16; /* are we folding, multilining? */ + U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ regexp *rx; char *start; /* Start of input for compile */ @@ -132,7 +133,7 @@ typedef struct RExC_state_t { #endif } RExC_state_t; -#define RExC_flags16 (pRExC_state->flags16) +#define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx (pRExC_state->rx) #define RExC_start (pRExC_state->start) @@ -227,9 +228,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) #define SCF_WHILEM_VISITED_POS 0x2000 -#define UTF RExC_utf8 -#define LOC (RExC_flags16 & PMf_LOCALE) -#define FOLD (RExC_flags16 & PMf_FOLD) +#define UTF (RExC_utf8 != 0) +#define LOC ((RExC_flags & PMf_LOCALE) != 0) +#define FOLD ((RExC_flags & PMf_FOLD) != 0) #define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 @@ -256,184 +257,159 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, * arg. Show regex, up to a maximum length. If it's too long, chop and add * "...". */ -#define FAIL(msg) \ - STMT_START { \ - char *ellipses = ""; \ - IV len = RExC_end - RExC_precomp; \ - \ - if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ - \ - if (len > RegexLengthToShowInErrorMessages) { \ - /* chop 10 shorter than the max, to ensure meaning of "..." */ \ - len = RegexLengthToShowInErrorMessages - 10; \ - ellipses = "..."; \ - } \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses); \ - } STMT_END +#define FAIL(msg) STMT_START { \ + char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses); \ +} STMT_END /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given * args. Show regex, up to a maximum length. If it's too long, chop and add * "...". */ -#define FAIL2(pat,msg) \ - STMT_START { \ - char *ellipses = ""; \ - IV len = RExC_end - RExC_precomp; \ - \ - if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ - \ - if (len > RegexLengthToShowInErrorMessages) { \ - /* chop 10 shorter than the max, to ensure meaning of "..." */ \ - len = RegexLengthToShowInErrorMessages - 10; \ - ellipses = "..."; \ - } \ - S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses); \ - } STMT_END +#define FAIL2(pat,msg) STMT_START { \ + char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses); \ +} STMT_END /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ -#define Simple_vFAIL(m) \ - STMT_START { \ - IV offset = RExC_parse - RExC_precomp; \ - \ - Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END +#define Simple_vFAIL(m) STMT_START { \ + IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END /* * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() */ -#define vFAIL(m) \ - STMT_START { \ - if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ - Simple_vFAIL(m); \ - } STMT_END +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL(m); \ +} STMT_END /* * Like Simple_vFAIL(), but accepts two arguments. */ -#define Simple_vFAIL2(m,a1) \ - STMT_START { \ - IV offset = RExC_parse - RExC_precomp; \ - \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END +#define Simple_vFAIL2(m,a1) STMT_START { \ + IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END /* * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). */ -#define vFAIL2(m,a1) \ - STMT_START { \ - if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ - Simple_vFAIL2(m, a1); \ - } STMT_END +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL2(m, a1); \ +} STMT_END /* * Like Simple_vFAIL(), but accepts three arguments. */ -#define Simple_vFAIL3(m, a1, a2) \ - STMT_START { \ - IV offset = RExC_parse - RExC_precomp; \ - \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END /* * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). */ -#define vFAIL3(m,a1,a2) \ - STMT_START { \ - if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ - Simple_vFAIL3(m, a1, a2); \ - } STMT_END +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END /* * Like Simple_vFAIL(), but accepts four arguments. */ -#define Simple_vFAIL4(m, a1, a2, a3) \ - STMT_START { \ - IV offset = RExC_parse - RExC_precomp; \ - \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END /* * Like Simple_vFAIL(), but accepts five arguments. */ -#define Simple_vFAIL5(m, a1, a2, a3, a4) \ - STMT_START { \ - IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END - - -#define vWARN(loc,m) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END \ - -#define vWARNdep(loc,m) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ - Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END \ - - -#define vWARN2(loc, m, a1) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ - a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END - -#define vWARN3(loc, m, a1, a2) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ - a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END - -#define vWARN4(loc, m, a1, a2, a3) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ - a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END - -/* used for the parse_flags section for (?c) -- japhy */ -#define vWARN5(loc, m, a1, a2, a3, a4) \ - STMT_START { \ - IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ - a1, a2, a3, a4, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ - } STMT_END +#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \ + IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + + +#define vWARN(loc,m) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARNdep(loc,m) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + "%s" REPORT_LOCATION, \ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + + +#define vWARN2(loc, m, a1) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END /* Allow for side effects in s */ -#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END /* Macros for recording node offsets. 20001227 mjd@plover.com * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in @@ -442,38 +418,42 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define MJD_OFFSET_DEBUG(x) -/* #define MJD_OFFSET_DEBUG(x) fprintf x */ - - -# define Set_Node_Offset_To_R(node,byte) \ - STMT_START { \ - if (! SIZE_ONLY) { \ - if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \ - } else { \ - RExC_offsets[2*(node)-1] = (byte); \ - } \ - } \ - } STMT_END - -# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) -# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) - -# define Set_Node_Length_To_R(node,len) \ - STMT_START { \ - if (! SIZE_ONLY) { \ - MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \ - if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", node); \ - } else { \ - RExC_offsets[2*(node)] = (len); \ - } \ - } \ - } STMT_END - -# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len) -# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) -# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start) +/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */ + + +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (node), (byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (node), (len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", node); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) +#define Set_Node_Cur_Length(node) \ + Set_Node_Length(node, RExC_parse - parse_start) /* Get offsets and lengths */ #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) @@ -492,7 +472,7 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) STRLEN old_l = CHR_SVLEN(*data->longest); if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { - sv_setsv(*data->longest, data->last_found); + SvSetMagicSV(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { data->offset_fixed = l ? data->last_start_min : data->pos_min; if (data->flags & SF_BEFORE_EOL) @@ -506,6 +486,8 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) data->offset_float_max = (l ? data->last_start_max : data->pos_min + data->pos_delta); + if ((U32)data->offset_float_max > (U32)I32_MAX) + data->offset_float_max = I32_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -514,6 +496,13 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) } } SvCUR_set(data->last_found, 0); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len > 0) + mg->mg_len = 0; + } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; } @@ -587,14 +576,17 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl, if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; - if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) { + if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && + !(and_with->flags & ANYOF_INVERT)) { cl->flags &= ~ANYOF_UNICODE_ALL; cl->flags |= ANYOF_UNICODE; ARG_SET(cl, ARG(and_with)); } - if (!(and_with->flags & ANYOF_UNICODE_ALL)) + if (!(and_with->flags & ANYOF_UNICODE_ALL) && + !(and_with->flags & ANYOF_INVERT)) cl->flags &= ~ANYOF_UNICODE_ALL; - if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL))) + if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) && + !(and_with->flags & ANYOF_INVERT)) cl->flags &= ~ANYOF_UNICODE; } @@ -736,6 +728,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg n = nnext; } } + + if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { +/* + Two problematic code points in Unicode casefolding of EXACT nodes: + + U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + + which casefold to + + Unicode UTF-8 + + U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + + This means that in case-insensitive matching (or "loose matching", + as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte + length of the above casefolded versions) can match a target string + of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). + This would rather mess up the minimum length computation. + + What we'll do is to look for the tail four bytes, and then peek + at the preceding two bytes to see whether we need to decrease + the minimum length by four (six minus two). + + Thanks to the design of UTF-8, there cannot be false matches: + A sequence of valid UTF-8 bytes cannot be a subsequence of + another valid sequence of UTF-8 bytes. + +*/ + char *s0 = STRING(scan), *s, *t; + char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; + char *t0 = "\xcc\x88\xcc\x81"; + char *t1 = t0 + 3; + + for (s = s0 + 2; + s < s2 && (t = ninstr(s, s1, t0, t1)); + s = t + 4) { + if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || + ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + min -= 4; + } + } + #ifdef DEBUGGING /* Allow dumping */ n = scan + NODE_SZ_STR(scan); @@ -888,6 +924,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg ? I32_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + { + SV * sv = data->last_found; + MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + if (UTF) + SvUTF8_on(data->last_found); data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; @@ -1124,7 +1170,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (OP(nxt) != CLOSE) goto nogo; /* Now we know that nxt2 is the only contents: */ - oscan->flags = ARG(nxt); + oscan->flags = (U8)ARG(nxt); OP(oscan) = CURLYN; OP(nxt1) = NOTHING; /* was OPEN. */ #ifdef DEBUGGING @@ -1160,7 +1206,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (OP(nxt) != CLOSE) FAIL("Panic opt close"); - oscan->flags = ARG(nxt); + oscan->flags = (U8)ARG(nxt); OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ #ifdef DEBUGGING @@ -1204,8 +1250,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ nxt += ARG(nxt); - PREVOPER(nxt)->flags = data->whilem_c - | (RExC_whilem_seen << 4); /* On WHILEM */ + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ } if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -1243,6 +1289,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg l -= old; /* Get the added string: */ last_str = newSVpvn(s + old, l); + if (UTF) + SvUTF8_on(last_str); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -1254,13 +1302,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg SvCUR_set(data->last_found, SvCUR(data->last_found) - l); sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += CHR_SVLEN(last_str); + } data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? 0 : (maxcount - 1) - * (minnext + data->pos_delta); + data->last_start_max += is_inf ? I32_MAX + : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ @@ -1546,7 +1602,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else if (minnext > U8_MAX) { vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } - scan->flags = minnext; + scan->flags = (U8)minnext; } if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -1566,7 +1622,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg pars++; } else if (OP(scan) == CLOSE) { - if (ARG(scan) == is_par) { + if ((I32)ARG(scan) == is_par) { next = regnext(scan); if ( next && (OP(next) != WHILEM) && next < last) @@ -1699,7 +1755,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_colors[4],PL_colors[5],PL_colors[0], (int)(xend - exp), RExC_precomp, PL_colors[1]); }); - RExC_flags16 = pm->op_pmflags; + RExC_flags = pm->op_pmflags; RExC_sawback = 0; RExC_seen = 0; @@ -1749,6 +1805,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->prelen = xend - exp; r->precomp = savepvn(RExC_precomp, r->prelen); r->subbeg = NULL; +#ifdef PERL_COPY_ON_WRITE + r->saved_copy = Nullsv; +#endif r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -1768,7 +1827,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_rx = r; /* Second pass: emit code. */ - RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */ + RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -1776,7 +1835,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_emit_start = r->program; RExC_emit = r->program; /* Store the count of eval-groups for security checks: */ - RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); + RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); REGC((U8)REG_MAGIC, (char*) RExC_emit++); r->data = 0; if (reg(pRExC_state, 0, &flags) == NULL) @@ -1784,7 +1843,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Dig out information for optimizations. */ r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = RExC_flags16; + pm->op_pmflags = RExC_flags; if (UTF) r->reganch |= ROPT_UTF8; /* Unicode in it? */ r->regstclass = NULL; @@ -1848,7 +1907,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - else if ((OP(first) == STAR && + else if (!sawopen && (OP(first) == STAR && PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && !(r->reganch & ROPT_ANCH) ) { @@ -1912,7 +1971,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (longest_float_length || (data.flags & SF_FL_BEFORE_EOL && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags16 & PMf_MULTILINE)))) { + || (RExC_flags & PMf_MULTILINE)))) { int t; if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ @@ -1920,17 +1979,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) goto remove_float; /* As in (a)+. */ - r->float_substr = data.longest_float; + if (SvUTF8(data.longest_float)) { + r->float_utf8 = data.longest_float; + r->float_substr = Nullsv; + } else { + r->float_substr = data.longest_float; + r->float_utf8 = Nullsv; + } r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); + || (RExC_flags & PMf_MULTILINE))); + fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { remove_float: - r->float_substr = Nullsv; + r->float_substr = r->float_utf8 = Nullsv; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } @@ -1939,27 +2004,35 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (longest_fixed_length || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags16 & PMf_MULTILINE)))) { + || (RExC_flags & PMf_MULTILINE)))) { int t; - r->anchored_substr = data.longest_fixed; + if (SvUTF8(data.longest_fixed)) { + r->anchored_utf8 = data.longest_fixed; + r->anchored_substr = Nullsv; + } else { + r->anchored_substr = data.longest_fixed; + r->anchored_utf8 = Nullsv; + } r->anchored_offset = data.offset_fixed; t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); + || (RExC_flags & PMf_MULTILINE))); + fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { - r->anchored_substr = Nullsv; + r->anchored_substr = r->anchored_utf8 = Nullsv; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } if (r->regstclass && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; - if ((!r->anchored_substr || r->anchored_offset) && stclass_flag + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag && !(data.start_class->flags & ANYOF_EOS) - && !cl_is_anything(data.start_class)) { + && !cl_is_anything(data.start_class)) + { I32 n = add_data(pRExC_state, 1, "f"); New(1006, RExC_rx->data->data[n], 1, @@ -1980,20 +2053,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; if (r->reganch & ROPT_ANCH_SINGLE) r->reganch |= ROPT_NOSCAN; } else { r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; r->check_offset_min = data.offset_float_min; r->check_offset_max = data.offset_float_max; } /* XXXX Currently intuiting is not compatible with ANCH_GPOS. This should be changed ASAP! */ - if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { r->reganch |= RE_USE_INTUIT; - if (SvTAIL(r->check_substr)) + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->reganch |= RE_INTUIT_TAIL; } } @@ -2009,9 +2084,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.start_class = &ch_class; data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); - r->check_substr = r->anchored_substr = r->float_substr = Nullsv; + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) - && !cl_is_anything(data.start_class)) { + && !cl_is_anything(data.start_class)) + { I32 n = add_data(pRExC_state, 1, "f"); New(1006, RExC_rx->data->data[n], 1, @@ -2063,7 +2140,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) register regnode *lastbr; register regnode *ender = 0; register I32 parno = 0; - I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0; + I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -2084,8 +2161,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* Make an OPEN node, if parenthesized. */ if (paren) { if (*RExC_parse == '?') { /* (?...) */ - U16 posflags = 0, negflags = 0; - U16 *flagsp = &posflags; + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; int logical = 0; char *seqstart = RExC_parse; @@ -2124,6 +2201,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* FALL THROUGH*/ case '?': /* (??...) */ logical = 1; + if (*RExC_parse != '{') + goto unknown; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -2151,7 +2230,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); } if (!SIZE_ONLY) { - AV *av; + PAD *pad; if (RExC_parse - 1 - s) sv = newSVpvn(s, RExC_parse - 1 - s); @@ -2160,7 +2239,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) ENTER; Perl_save_re_context(aTHX); - rop = sv_compile_2op(sv, &sop, "re", &av); + rop = sv_compile_2op(sv, &sop, "re", &pad); sop->op_private |= OPpREFCOUNTED; /* re_dup will OpREFCNT_inc */ OpREFCNT_set(sop, 1); @@ -2169,7 +2248,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) n = add_data(pRExC_state, 3, "nop"); RExC_rx->data->data[n] = (void*)rop; RExC_rx->data->data[n+1] = (void*)sop; - RExC_rx->data->data[n+2] = (void*)av; + RExC_rx->data->data[n+2] = (void*)pad; SvREFCNT_dec(sv); } else { /* First pass */ @@ -2180,8 +2259,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) FAIL("Eval-group not allowed at runtime, use re 'eval'"); if (PL_tainting && PL_tainted) FAIL("Eval-group in insecure regular expression"); + if (PL_curcop == &PL_compiling) + PL_cv_has_eval = 1; } - + nextchar(pRExC_state); if (logical) { ret = reg_node(pRExC_state, LOGICAL); @@ -2191,7 +2272,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* deal with the length of this later - MJD */ return ret; } - return reganode(pRExC_state, EVAL, n); + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; } case '(': /* (?(?{...})...) and (?(?=...)...) */ { @@ -2304,8 +2388,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) ++RExC_parse; goto parse_flags; } - RExC_flags16 |= posflags; - RExC_flags16 &= ~negflags; + RExC_flags |= posflags; + RExC_flags &= ~negflags; if (*RExC_parse == ':') { RExC_parse++; paren = ':'; @@ -2361,9 +2445,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) } else if (paren != '?') /* Not Conditional */ ret = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -2423,12 +2505,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) static char parens[] = "=!<,>"; if (paren && (p = strchr(parens, paren))) { - int node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; int flag = (p - parens) > 1; if (paren == '>') node = SUSPEND, flag = 0; reginsert(pRExC_state, node,ret); + Set_Node_Offset(ret, oregcomp_parse); + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2); ret->flags = flag; regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL)); } @@ -2436,7 +2520,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* Check for proper termination. */ if (paren) { - RExC_flags16 = oregflags; + RExC_flags = oregflags; if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ("); @@ -2618,8 +2702,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) if (max && max < min) vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { - ARG1_SET(ret, min); - ARG2_SET(ret, max); + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); } goto nest_check; @@ -2718,9 +2802,9 @@ tryagain: case '^': RExC_seen_zerolen++; nextchar(pRExC_state); - if (RExC_flags16 & PMf_MULTILINE) + if (RExC_flags & PMf_MULTILINE) ret = reg_node(pRExC_state, MBOL); - else if (RExC_flags16 & PMf_SINGLELINE) + else if (RExC_flags & PMf_SINGLELINE) ret = reg_node(pRExC_state, SBOL); else ret = reg_node(pRExC_state, BOL); @@ -2730,9 +2814,9 @@ tryagain: nextchar(pRExC_state); if (*RExC_parse) RExC_seen_zerolen++; - if (RExC_flags16 & PMf_MULTILINE) + if (RExC_flags & PMf_MULTILINE) ret = reg_node(pRExC_state, MEOL); - else if (RExC_flags16 & PMf_SINGLELINE) + else if (RExC_flags & PMf_SINGLELINE) ret = reg_node(pRExC_state, SEOL); else ret = reg_node(pRExC_state, EOL); @@ -2740,7 +2824,7 @@ tryagain: break; case '.': nextchar(pRExC_state); - if (RExC_flags16 & PMf_SINGLELINE) + if (RExC_flags & PMf_SINGLELINE) ret = reg_node(pRExC_state, SANY); else ret = reg_node(pRExC_state, REG_ANY); @@ -2841,13 +2925,13 @@ tryagain: Set_Node_Length(ret, 2); /* MJD */ break; case 'w': - ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); + ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ break; case 'W': - ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); + ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -2855,7 +2939,7 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); + ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); *flagp |= SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -2863,19 +2947,19 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); + ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); *flagp |= SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ break; case 's': - ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); + ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ break; case 'S': - ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); + ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -2896,7 +2980,7 @@ tryagain: case 'P': { char* oldregxend = RExC_end; - char* parse_start = RExC_parse; + char* parse_start = RExC_parse - 2; if (RExC_parse[1] == '{') { /* a lovely hack--pretend we saw [\pX] instead */ @@ -2909,15 +2993,20 @@ tryagain: } RExC_end++; } - else + else { RExC_end = RExC_parse + 2; + if (RExC_end > oldregxend) + RExC_end = oldregxend; + } RExC_parse--; ret = regclass(pRExC_state); RExC_end = oldregxend; RExC_parse--; - Set_Node_Cur_Length(ret); /* MJD */ + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret); nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; } @@ -2944,12 +3033,12 @@ tryagain: while (isDIGIT(*RExC_parse)) RExC_parse++; - if (!SIZE_ONLY && num > RExC_rx->nparens) + if (!SIZE_ONLY && num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); RExC_sawback = 1; - ret = reganode(pRExC_state, FOLD - ? (LOC ? REFFL : REFF) - : REF, num); + ret = reganode(pRExC_state, + (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), + num); *flagp |= HASWIDTH; /* override incorrect value set in reganode MJD */ @@ -2972,7 +3061,7 @@ tryagain: break; case '#': - if (RExC_flags16 & PMf_EXTENDED) { + if (RExC_flags & PMf_EXTENDED) { while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++; if (RExC_parse < RExC_end) goto tryagain; @@ -2985,17 +3074,17 @@ tryagain: register char *p; char *oldp, *s; STRLEN numlen; - STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + STRLEN foldlen; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf; parse_start = RExC_parse - 1; RExC_parse++; defchar: - ret = reg_node(pRExC_state, FOLD - ? (LOC ? EXACTFL : EXACTF) - : EXACT); + ender = 0; + ret = reg_node(pRExC_state, + (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT)); s = STRING(ret); for (len = 0, p = RExC_parse - 1; len < 127 && p < RExC_end; @@ -3003,7 +3092,7 @@ tryagain: { oldp = p; - if (RExC_flags16 & PMf_EXTENDED) + if (RExC_flags & PMf_EXTENDED) p = regwhite(p, RExC_end); switch (*p) { case '^': @@ -3128,33 +3217,86 @@ tryagain: ender = *p++; break; } - if (RExC_flags16 & PMf_EXTENDED) + if (RExC_flags & PMf_EXTENDED) p = regwhite(p, RExC_end); if (UTF && FOLD) { - toFOLD_uni(ender, tmpbuf, &ulen); - ender = utf8_to_uvchr(tmpbuf, 0); + /* Prime the casefolded buffer. */ + ender = toFOLD_uni(ender, tmpbuf, &foldlen); } if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + else if (UTF) { + STRLEN unilen; + + if (FOLD) { + /* Emit all the Unicode characters. */ + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } + else { + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } } else { len++; - REGC(ender, s++); + REGC((char)ender, s++); } break; } - if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen - 1; + if (UTF) { + STRLEN unilen; + + if (FOLD) { + /* Emit all the Unicode characters. */ + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + len += unilen; + s += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; + } + } + else { + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } + len--; } else - REGC(ender, s++); + REGC((char)ender, s++); } loopdone: RExC_parse = p - 1; @@ -3180,26 +3322,30 @@ tryagain: break; } + /* If the encoding pragma is in effect recode the text of + * any EXACT-kind nodes. */ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { - STRLEN oldlen = STR_LEN(ret); - SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); - - if (RExC_utf8) - SvUTF8_on(sv); - if (sv_utf8_downgrade(sv, TRUE)) { - char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); - STRLEN newlen = SvCUR(sv); - - if (!SIZE_ONLY) { - DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", - (int)oldlen, STRING(ret), - (int)newlen, s)); - Copy(s, STRING(ret), newlen, char); - STR_LEN(ret) += newlen - oldlen; - RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); - } else - RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); - } + STRLEN oldlen = STR_LEN(ret); + SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); + + if (RExC_utf8) + SvUTF8_on(sv); + if (sv_utf8_downgrade(sv, TRUE)) { + char *s = sv_recode_to_utf8(sv, PL_encoding); + STRLEN newlen = SvCUR(sv); + + if (SvUTF8(sv)) + RExC_utf8 = 1; + if (!SIZE_ONLY) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", + (int)oldlen, STRING(ret), + (int)newlen, s)); + Copy(s, STRING(ret), newlen, char); + STR_LEN(ret) += newlen - oldlen; + RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); + } else + RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); + } } return(ret); @@ -3359,15 +3505,17 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && - POSIXCC(UCHARAT(RExC_parse))) { + if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { char *s = RExC_parse; char c = *s++; while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + if (ckWARN(WARN_REGEXP)) + vWARN3(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -3396,7 +3544,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) SV *listsv = Nullsv; register char *e; UV n; - bool optimize_invert = TRUE; + bool optimize_invert = TRUE; + AV* unicode_alternate = 0; +#ifdef EBCDIC + UV literal_endpoint = 0; +#endif ret = reganode(pRExC_state, ANYOF, 0); @@ -3424,11 +3576,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue)) + if (!SIZE_ONLY && POSIXCC(nextvalue)) checkposixcc(pRExC_state); - if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-') - goto charclassloop; /* allow 1st char to be ] or - */ + /* allow 1st char to be ] (allowing it to be - is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { @@ -3472,6 +3625,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'D': namedclass = ANYOF_NDIGIT; break; case 'p': case 'P': + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { U8 c = (U8)value; e = strchr(RExC_parse++, '}'); @@ -3556,6 +3711,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; } } /* end of \blah */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ @@ -3920,7 +4079,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* end of namedclass \blah */ if (range) { - if (prevvalue > value) /* b-a */ { + if (prevvalue > (IV)value) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, @@ -3958,8 +4117,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) IV ceilvalue = value < 256 ? value : 255; #ifdef EBCDIC - if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || - (isUPPER(prevvalue) && isUPPER(ceilvalue))) + /* In EBCDIC [\x89-\x91] should include + * the \x8e but [i-j] should not. */ + if (literal_endpoint == 2 && + ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || + (isUPPER(prevvalue) && isUPPER(ceilvalue)))) { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) @@ -3977,26 +4139,71 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, i); } if (value > 255 || UTF) { + UV prevnatvalue = NATIVE_TO_UNI(prevvalue); + UV natvalue = NATIVE_TO_UNI(value); + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - if (prevvalue < value) + if (prevnatvalue < natvalue) { /* what about > ? */ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)prevvalue, (UV)value); - else if (prevvalue == value) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)value); + prevnatvalue, natvalue); + } + else if (prevnatvalue == natvalue) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); if (FOLD) { + U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN foldlen; + UV f = to_uni_fold(natvalue, foldbuf, &foldlen); + + /* If folding and foldable and a single + * character, insert also the folded version + * to the charclass. */ + if (f != value) { + if (foldlen == (STRLEN)UNISKIP(f)) + Perl_sv_catpvf(aTHX_ listsv, + "%04"UVxf"\n", f); + else { + /* Any multicharacter foldings + * require the following transform: + * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) + * where E folds into "pq" and F folds + * into "rst", all other characters + * fold to single characters. We save + * away these multicharacter foldings, + * to be later saved as part of the + * additional "s" data. */ + SV *sv; + + if (!unicode_alternate) + unicode_alternate = newAV(); + sv = newSVpvn((char*)foldbuf, foldlen); + SvUTF8_on(sv); + av_push(unicode_alternate, sv); + } + } + + /* If folding and the value is one of the Greek + * sigmas insert a few more sigmas to make the + * folding rules of the sigmas to work right. + * Note that not all the possible combinations + * are handled here: some of them are handled + * by the standard folding rules, and some of + * them (literal or EXACTF cases) are handled + * during runtime in regexec.c:S_find_byclass(). */ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - UNICODE_GREEK_CAPITAL_LETTER_SIGMA); + (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - UNICODE_GREEK_SMALL_LETTER_SIGMA); + (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); } else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - UNICODE_GREEK_SMALL_LETTER_SIGMA); + (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); } } } +#ifdef EBCDIC + literal_endpoint = 0; +#endif } range = 0; /* this range (if it was one) is done now */ @@ -4017,7 +4224,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ) { for (value = 0; value < 256; ++value) { if (ANYOF_BITMAP_TEST(ret, value)) { - IV fold = PL_fold[value]; + UV fold = PL_fold[value]; if (fold != value) ANYOF_BITMAP_SET(ret, fold); @@ -4039,8 +4246,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) AV *av = newAV(); SV *rv; + /* The 0th element stores the character class description + * in its textual form: used later (regexec.c:Perl_regclass_swash()) + * to initialize the appropriate swash (which gets stored in + * the 1st element), and also useful for dumping the regnode. + * The 2nd element stores the multicharacter foldings, + * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); + av_store(av, 2, (SV*)unicode_alternate); rv = newRV_noinc((SV*)av); n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; @@ -4063,7 +4277,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) RExC_parse++; continue; } - if (RExC_flags16 & PMf_EXTENDED) { + if (RExC_flags & PMf_EXTENDED) { if (isSPACE(*RExC_parse)) { RExC_parse++; continue; @@ -4099,7 +4313,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) ptr = ret; FILL_ADVANCE_NODE(ptr, op); if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", + MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", "reg_node", __LINE__, reg_name[op], RExC_emit - RExC_emit_start > RExC_offsets[0] @@ -4107,7 +4321,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) RExC_emit - RExC_emit_start, RExC_parse - RExC_start, RExC_offsets[0])); - Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } RExC_emit = ptr; @@ -4135,14 +4349,16 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", "reganode", + __LINE__, + reg_name[op], RExC_emit - RExC_emit_start > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", RExC_emit - RExC_emit_start, RExC_parse - RExC_start, RExC_offsets[0])); - Set_Cur_Node_Offset; + Set_Cur_Node_Offset; } RExC_emit = ptr; @@ -4185,29 +4401,33 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) while (src > opnd) { StructCopy(--src, --dst, regnode); if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n", "reg_insert", + __LINE__, + reg_name[op], dst - RExC_emit_start > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", src - RExC_emit_start, dst - RExC_emit_start, RExC_offsets[0])); - Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); - Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } } place = opnd; /* Op node, where operand used to be. */ if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", "reginsert", + __LINE__, + reg_name[op], place - RExC_emit_start > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", place - RExC_emit_start, RExC_parse - RExC_start, RExC_offsets[0])); - Set_Node_Offset(place, RExC_parse); + Set_Node_Offset(place, RExC_parse); } src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); @@ -4380,6 +4600,15 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", (IV)r->anchored_offset); + else if (r->anchored_utf8) + PerlIO_printf(Perl_debug_log, + "anchored utf8 `%s%.*s%s'%s at %"IVdf" ", + PL_colors[0], + (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), + SvPVX(r->anchored_utf8), + PL_colors[1], + SvTAIL(r->anchored_utf8) ? "$" : "", + (IV)r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", @@ -4389,15 +4618,25 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", (IV)r->float_min_offset, (UV)r->float_max_offset); - if (r->check_substr) + else if (r->float_utf8) + PerlIO_printf(Perl_debug_log, + "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", + PL_colors[0], + (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), + SvPVX(r->float_utf8), + PL_colors[1], + SvTAIL(r->float_utf8) ? "$" : "", + (IV)r->float_min_offset, (UV)r->float_max_offset); + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored"); if (r->reganch & ROPT_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->reganch & ROPT_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); - if (r->check_substr) + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, ") "); if (r->regstclass) { @@ -4474,9 +4713,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (k == EXACT) { SV *dsv = sv_2mortal(newSVpvn("", 0)); - bool do_utf8 = DO_UTF8(sv); + /* Using is_utf8_string() is a crude hack but it may + * be the best for now since we have no flag "this EXACTish + * node was UTF-8" --jhi */ + bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o)); char *s = do_utf8 ? - pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) : + pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, + UNI_DISPLAY_REGEX) : STRING(o); int len = do_utf8 ? strlen(s) : @@ -4500,7 +4743,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) else if (k == ANYOF) { int i, rangestart = -1; U8 flags = ANYOF_FLAGS(o); - const char * const anyofs[] = { /* Should be syncronized with + const char * const anyofs[] = { /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ "\\w", "\\W", @@ -4570,11 +4813,10 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) { SV *lv; - SV *sw = regclass_swash(o, FALSE, &lv); + SV *sw = regclass_swash(o, FALSE, &lv, 0); if (lv) { if (sw) { - UV i; U8 s[UTF8_MAXLEN+1]; for (i = 0; i <= 256; i++) { /* just the first 256 */ @@ -4642,52 +4884,74 @@ Perl_re_intuit_string(pTHX_ regexp *prog) { /* Assume that RE_INTUIT is set */ DEBUG_r( { STRLEN n_a; - char *s = SvPV(prog->check_substr,n_a); + char *s = SvPV(prog->check_substr + ? prog->check_substr : prog->check_utf8, n_a); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, - "%sUsing REx substr:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], + "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr; + return prog->check_substr ? prog->check_substr : prog->check_utf8; } void Perl_pregfree(pTHX_ struct regexp *r) { - DEBUG_r(if (!PL_colorset) reginitcolors()); +#ifdef DEBUGGING + SV *dsv = PERL_DEBUG_PAD_ZERO(0); +#endif if (!r || (--r->refcnt > 0)) return; - DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sFreeing REx:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - r->precomp, - PL_colors[1], - (strlen(r->precomp) > 60 ? "..." : ""))); + DEBUG_r({ + int len; + char *s; + + s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp, + r->prelen, 60, UNI_DISPLAY_REGEX) + : pv_display(dsv, r->precomp, r->prelen, 0, 60); + len = SvCUR(dsv); + if (!PL_colorset) + reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sFreeing REx:%s `%s%*.*s%s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + len, len, s, + PL_colors[1], + len > 60 ? "..." : ""); + }); if (r->precomp) Safefree(r->precomp); if (r->offsets) /* 20010421 MJD */ Safefree(r->offsets); - if (RX_MATCH_COPIED(r)) - Safefree(r->subbeg); + RX_MATCH_COPY_FREE(r); +#ifdef PERL_COPY_ON_WRITE + if (r->saved_copy) + SvREFCNT_dec(r->saved_copy); +#endif if (r->substrs) { if (r->anchored_substr) SvREFCNT_dec(r->anchored_substr); + if (r->anchored_utf8) + SvREFCNT_dec(r->anchored_utf8); if (r->float_substr) SvREFCNT_dec(r->float_substr); + if (r->float_utf8) + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } if (r->data) { int n = r->data->count; - AV* new_comppad = NULL; - AV* old_comppad; - SV** old_curpad; + PAD* new_comppad = NULL; + PAD* old_comppad; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ @@ -4704,27 +4968,21 @@ Perl_pregfree(pTHX_ struct regexp *r) case 'o': if (new_comppad == NULL) Perl_croak(aTHX_ "panic: pregfree comppad"); - old_comppad = PL_comppad; - old_curpad = PL_curpad; - /* Watch out for global destruction's random ordering. */ - if (SvTYPE(new_comppad) == SVt_PVAV) { - PL_comppad = new_comppad; - PL_curpad = AvARRAY(new_comppad); - } - else - PL_curpad = NULL; - + PAD_SAVE_LOCAL(old_comppad, + /* Watch out for global destruction's random ordering. */ + (SvTYPE(new_comppad) == SVt_PVAV) ? + new_comppad : Null(PAD *) + ); if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) { op_free((OP_4tree*)r->data->data[n]); } - PL_comppad = old_comppad; - PL_curpad = old_curpad; + PAD_RESTORE_LOCAL(old_comppad); SvREFCNT_dec((SV*)new_comppad); new_comppad = NULL; break; case 'n': - break; + break; default: Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); } @@ -4797,20 +5055,6 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) void Perl_save_re_context(pTHX) { -#if 0 - SAVEPPTR(RExC_precomp); /* uncompiled string. */ - SAVEI32(RExC_npar); /* () count. */ - SAVEI32(RExC_size); /* Code size. */ - SAVEI16(RExC_flags16); /* are we folding, multilining? */ - SAVEVPTR(RExC_rx); /* from regcomp.c */ - SAVEI32(RExC_seen); /* from regcomp.c */ - SAVEI32(RExC_sawback); /* Did we see \1, ...? */ - SAVEI32(RExC_naughty); /* How bad is this pattern? */ - SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */ - SAVEPPTR(RExC_end); /* End of input for compile */ - SAVEPPTR(RExC_parse); /* Input-scan pointer. */ -#endif - SAVEI32(PL_reg_flags); /* from regexec.c */ SAVEPPTR(PL_bostr); SAVEPPTR(PL_reginput); /* String-input pointer. */ @@ -4819,6 +5063,7 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_regstartp); /* Pointer to startp array. */ SAVEVPTR(PL_regendp); /* Ditto for endp. */ SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ + SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */ SAVEPPTR(PL_regtill); /* How far we are required to go. */ SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ PL_reg_start_tmp = 0; @@ -4835,13 +5080,47 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_reg_re); /* from regexec.c */ SAVEPPTR(PL_reg_ganch); /* from regexec.c */ SAVESPTR(PL_reg_sv); /* from regexec.c */ - SAVEI8(PL_reg_match_utf8); /* from regexec.c */ + SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */ SAVEVPTR(PL_reg_magic); /* from regexec.c */ SAVEI32(PL_reg_oldpos); /* from regexec.c */ SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ SAVEVPTR(PL_reg_curpm); /* from regexec.c */ + SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */ + PL_reg_oldsaved = Nullch; + SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */ + PL_reg_oldsavedlen = 0; +#ifdef PERL_COPY_ON_WRITE + SAVESPTR(PL_nrs); + PL_nrs = Nullsv; +#endif + SAVEI32(PL_reg_maxiter); /* max wait until caching pos */ + PL_reg_maxiter = 0; + SAVEI32(PL_reg_leftiter); /* wait until caching pos */ + PL_reg_leftiter = 0; + SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */ + PL_reg_poscache = Nullch; + SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */ + PL_reg_poscache_size = 0; + SAVEPPTR(PL_regprecomp); /* uncompiled string. */ SAVEI32(PL_regnpar); /* () count. */ SAVEI32(PL_regsize); /* from regexec.c */ + + { + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + U32 i; + GV *mgv; + REGEXP *rx; + char digits[16]; + + if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { + for (i = 1; i <= rx->nparens; i++) { + sprintf(digits, "%lu", (long)i); + if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV))) + save_scalar(mgv); + } + } + } + #ifdef DEBUGGING SAVEPPTR(PL_reg_starttry); /* from regexec.c */ #endif