X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/006f26b2c7df5d8e6c003103a804a62c71f61625..098b07d5cb1d6aa13b81a0f43ea5e151829ad26c:/regexec.c diff --git a/regexec.c b/regexec.c index f79feee..b441682 100644 --- a/regexec.c +++ b/regexec.c @@ -165,13 +165,13 @@ static const char* const non_utf8_target_but_utf8_required #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ COMBINING_GRAVE_ACCENT_UTF8); \ @@ -207,13 +207,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -250,8 +250,8 @@ static const char* const non_utf8_target_but_utf8_required #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); -static void S_restore_eval_state(pTHX_ void *arg); -static void S_clear_backtrack_stack(pTHX_ void *p); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); +static regmatch_state * S_push_slab(pTHX); #define REGCP_PAREN_ELEMS 3 #define REGCP_OTHER_ELEMS 3 @@ -296,8 +296,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -369,10 +369,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -484,7 +484,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -526,7 +526,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -557,13 +557,9 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, * with giant delta may be not rechecked). */ -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ - /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. Otherwise, only SvCUR(sv) is used to get strbeg. */ -/* XXXX We assume that strpos is strbeg unless sv. */ - /* XXXX Some places assume that there is a fixed substring. An update may be needed if optimizer marks as "INTUITable" RExen without fixed substrings. Similarly, it is assumed that @@ -619,9 +615,9 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = 0; /* Should be nonnegative! */ - I32 end_shift = 0; + SSize_t end_shift = 0; char *s; SV *check; char *t; @@ -643,8 +639,6 @@ Perl_re_intuit_start(pTHX_ PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - reginfo->is_utf8_target = cBOOL(utf8_target); - /* CHR_DIST() would be more correct here but it makes things slow. */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, @@ -652,7 +646,8 @@ Perl_re_intuit_start(pTHX_ goto fail; } - reginfo->eval_state = NULL; + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->info_aux = NULL; reginfo->strbeg = strbeg; reginfo->strend = strend; reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); @@ -672,14 +667,15 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) + if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */ + && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */ + { + ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) || ( (prog->extflags & RXf_ANCH_BOL) && !multiline ) ); /* Check after \n? */ if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ + if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ && (strpos != strbeg)) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; @@ -691,7 +687,7 @@ Perl_re_intuit_start(pTHX_ See [perl #115242] */ { /* Substring at constant offset from beg-of-str... */ - I32 slen; + SSize_t slen; s = HOP3c(strpos, prog->check_offset_min, strend); @@ -727,9 +723,9 @@ Perl_re_intuit_start(pTHX_ end_shift = prog->check_end_shift; if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) + const SSize_t end = prog->check_offset_max + CHR_SVLEN(check) - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; + const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end; if (end_shift < eshift) end_shift = eshift; @@ -755,8 +751,8 @@ Perl_re_intuit_start(pTHX_ the "check" substring in the region corrected by start/end_shift. */ { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; + SSize_t srch_start_shift = start_shift; + SSize_t srch_end_shift = end_shift; U8* start_point; U8* end_point; if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { @@ -1113,8 +1109,11 @@ Perl_re_intuit_start(pTHX_ /* If regstclass takes bytelength more than 1: If charlength==1, OK. This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT - ? CHR_DIST(str+STR_LEN(progi->regstclass), str) + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) : 1); char * endpos; if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) @@ -1228,45 +1227,55 @@ Perl_re_intuit_start(pTHX_ } #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1351,7 +1360,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ #define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ - (PL_reg_starttry),doutf8) + startpos, doutf8) #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ @@ -1488,6 +1497,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1497,10 +1509,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1523,7 +1534,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1556,7 +1566,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1602,7 +1612,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1630,13 +1640,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1747,7 +1757,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -2048,13 +2059,163 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2062,21 +2223,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2090,47 +2247,130 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); return 0; } - multiline = prog->extflags & RXf_PMf_MULTILINE; + DEBUG_EXECUTE_r( + debug_start_match(rx, utf8_target, stringarg, strend, + "Matching"); + ); - reginfo->eval_state = NULL; - reginfo->prog = rx; /* Yes, sorry that this is confusing. */ - reginfo->intuit = 0; - reginfo->is_utf8_target = cBOOL(utf8_target); + startpos = stringarg; + + if (prog->extflags & RXf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg)); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if gofs->prog is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ - /* on first ever match, allocate first slab */ - if (!PL_regmatch_slab) { - Newx(PL_regmatch_slab, 1, regmatch_slab); - PL_regmatch_slab->prev = NULL; - PL_regmatch_slab->next = NULL; - PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + if (prog->extflags & RXf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->extflags & RXf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; } - /* note current PL_regmatch_state position; at end of match we'll - * pop back to there and free any higher slabs */ + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + oldsave = PL_savestack_ix; - SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL); - SAVEVPTR(PL_regmatch_slab); - SAVEVPTR(PL_regmatch_state); + s = startpos; - DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, - "Matching"); - ); + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->extflags & RXf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } - minlen = prog->minlen; + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + + multiline = prog->extflags & RXf_PMf_MULTILINE; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; } - /* Check validity of program. */ if (UCHARAT(progi->program) != REG_MAGIC) { @@ -2139,56 +2379,82 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, RX_MATCH_TAINTED_off(rx); + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->intuit = 0; + reginfo->is_utf8_target = cBOOL(utf8_target); reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); reginfo->warned = FALSE; reginfo->strbeg = strbeg; reginfo->sv = sv; reginfo->poscache_maxiter = 0; /* not yet started a countdown */ - - /* Mark end of string for $ (and such) */ reginfo->strend = strend; - /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + reginfo->sv = newSV(0); + sv_setsv(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } - /* If there is a "must appear" string, look for it. */ - s = startpos; + /* reserve next 2 or 3 slots in PL_regmatch_state: + * slot N+0: may currently be in use: skip it + * slot N+1: use for regmatch_info_aux struct + * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s + * slot N+3: ready for use by regmatch() + */ - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && SvTYPE(sv) >= SVt_PVMG - && SvMAGIC(sv) - && (mg = mg_find(sv, PERL_MAGIC_regex_global)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } + { + regmatch_state *old_regmatch_state; + regmatch_slab *old_regmatch_slab; + int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; + + /* on first ever match, allocate first slab */ + if (!PL_regmatch_slab) { + Newx(PL_regmatch_slab, 1, regmatch_slab); + PL_regmatch_slab->prev = NULL; + PL_regmatch_slab->next = NULL; + PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + } + + old_regmatch_state = PL_regmatch_state; + old_regmatch_slab = PL_regmatch_slab; + + for (i=0; i <= max; i++) { + if (i == 1) + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + else if (i ==2) + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + } + + /* note initial PL_regmatch_state position; at end of match we'll + * pop back to there and free any higher slabs */ + + reginfo->info_aux->old_regmatch_state = old_regmatch_state; + reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; + reginfo->info_aux->poscache = NULL; + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + if ((prog->extflags & RXf_EVAL_SEEN)) + S_setup_eval_state(aTHX_ reginfo); + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; } + + /* If there is a "must appear" string, look for it. */ + if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2207,24 +2473,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (s == startpos && regtry(reginfo, &s)) goto got_it; else if (multiline || (prog->intflags & PREGf_IMPLICIT) || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ @@ -2303,12 +2556,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2364,8 +2616,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2410,7 +2662,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2420,8 +2672,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), @@ -2614,6 +2865,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->extflags & RXf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2624,12 +2887,10 @@ got_it: ); Safefree(swap); - if (reginfo->eval_state) { - reginfo->eval_state->direct = TRUE; - S_restore_eval_state(aTHX_ reginfo->eval_state); - } + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ - /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); if (RXp_PAREN_NAMES(prog)) @@ -2638,123 +2899,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; @@ -2762,12 +2910,10 @@ phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); - if (reginfo->eval_state) { - reginfo->eval_state->direct = TRUE; - S_restore_eval_state(aTHX_ reginfo->eval_state); - } + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ - /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); if (swap) { @@ -2785,10 +2931,10 @@ phooey: } -/* Set which rex is pointed to by PL_reg_state, handling ref counting. +/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ #define SET_reg_curpm(Re2) \ - if (reginfo->eval_state) { \ + if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ PM_SETRE((PL_reg_curpm), (Re2)); \ @@ -2805,7 +2951,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -2813,12 +2959,6 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) reginfo->cutpoint=NULL; - if ((prog->extflags & RXf_EVAL_SEEN) && !reginfo->eval_state) - S_setup_eval_state(aTHX_ reginfo); - -#ifdef DEBUGGING - PL_reg_starttry = *startposp; -#endif prog->offs[0].start = *startposp - reginfo->strbeg; prog->lastparen = 0; prog->lastcloseparen = 0; @@ -2868,7 +3008,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) "unreachable code" warnings, which are bogus, but distracting. */ #define CACHEsayNO \ if (ST.cache_mask) \ - PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \ + reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ sayNO /* this is used to determine how far from the left messages like @@ -3198,25 +3338,6 @@ S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) } -/* free all slabs above current one - called during LEAVE_SCOPE */ - -STATIC void -S_clear_backtrack_stack(pTHX_ void *p) -{ - regmatch_slab *s = PL_regmatch_slab->next; - PERL_UNUSED_ARG(p); - - if (!s) - return; - PL_regmatch_slab->next = NULL; - while (s) { - regmatch_slab * const osl = s; - s = s->next; - Safefree(osl); - } -} - - static bool S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) @@ -3324,7 +3445,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, SV** listp; if (! PL_utf8_foldclosures) { if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* Force loading this by folding an above-Latin1 char */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); @@ -3377,7 +3498,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, * which is the one above 255 */ if ((c1 < 256) != (c2 < 256)) { if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) && (isASCII(c1) || isASCII(c2)))) { if (c1 < 256) { @@ -3395,7 +3517,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, if (utf8_target && HAS_NONLATIN1_FOLD_CLOSURE(c1) && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) { /* Here, there could be something above Latin1 in the target which * folds to this character in the pattern. All such cases except @@ -3416,7 +3540,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = PL_fold_locale[c1]; break; - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); if (! utf8_target) { /* /d rules */ c2 = PL_fold[c1]; break; @@ -3424,8 +3550,11 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, /* FALLTHROUGH */ /* /u rules for all these. This happens to work for * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - case EXACTFU_TRICKYFOLD: case EXACTFU_SS: case EXACTFU: c2 = PL_fold_latin1[c1]; @@ -3475,7 +3604,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3493,7 +3622,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3565,10 +3694,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PerlIO_printf(Perl_debug_log,"regmatch start\n"); })); - /* grab next free state slot */ - st = ++PL_regmatch_state; - if (st > SLAB_LAST(PL_regmatch_slab)) - st = PL_regmatch_state = S_push_slab(aTHX); + st = PL_regmatch_state; /* Note that nextchr is a byte even in UTF */ SET_nextchr; @@ -3943,7 +4069,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -3956,7 +4082,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4047,7 +4173,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4070,7 +4197,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4107,20 +4235,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4196,7 +4329,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4285,7 +4418,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4366,9 +4499,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -4828,29 +4961,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) OP * const oop = PL_op; COP * const ocurcop = PL_curcop; OP *nop; - struct re_save_state saved_state; CV *newcv; /* save *all* paren positions */ regcppush(rex, 0, maxopenparen); REGCP_SET(runops_cp); - /* To not corrupt the existing regex state while executing the - * eval we would normally put it on the save stack, like with - * save_re_context. However, re-evals have a weird scoping so we - * can't just add ENTER/LEAVE here. With that, things like - * - * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) - * - * would break, as they expect the localisation to be unwound - * only when the re-engine backtracks through the bit that - * localised it. - * - * What we do instead is just saving the state in a local c - * variable. - */ - Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); - if (!caller_cv) caller_cv = find_runcv(NULL); @@ -4938,9 +5054,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); rex->offs[0].end = locinput - reginfo->strbeg; - if (reginfo->eval_state->pos_magic) - reginfo->eval_state->pos_magic->mg_len - = locinput - reginfo->strbeg; + if (reginfo->info_aux_eval->pos_magic) + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -4999,8 +5116,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } - Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); - /* *** Note that at this point we don't restore * PL_comppad, (or pop the CxSUB) on the assumption it may * be used again soon. This is safe as long as nothing @@ -5008,6 +5123,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_op = oop; PL_curcop = ocurcop; S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; if (logical != 2) break; @@ -5086,7 +5202,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) maxopenparen = 0; - /* XXXX This is too dramatic a measure... */ + /* invalidate the S-L poscache. We're now executing a + * different set of WHILEM ops (and their associated + * indexes) against the same string, so the bits in the + * cache are meaningless. Setting maxiter to zero forces + * the cache to be invalidated and zeroed before reuse. + * XXX This is too dramatic a measure. Ideally we should + * save the old cache and restore when running the outer + * pattern again */ reginfo->poscache_maxiter = 0; is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); @@ -5117,7 +5240,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; - /* XXXX This is too dramatic a measure... */ + /* Invalidate cache. See "invalidate" comment above. */ reginfo->poscache_maxiter = 0; if ( nochange_depth ) nochange_depth--; @@ -5136,7 +5259,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regcppop(rex, &maxopenparen); cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; - /* XXXX This is too dramatic a measure... */ + /* Invalidate cache. See "invalidate" comment above. */ reginfo->poscache_maxiter = 0; if ( nochange_depth ) nochange_depth--; @@ -5415,7 +5538,37 @@ NULL goto do_whilem_B_max; } - /* super-linear cache processing */ + /* super-linear cache processing. + * + * The idea here is that for certain types of CURLYX/WHILEM - + * principally those whose upper bound is infinity (and + * excluding regexes that have things like \1 and other very + * non-regular expresssiony things), then if a pattern like + * /....A*.../ fails and we backtrack to the WHILEM, then we + * make a note that this particular WHILEM op was at string + * position 47 (say) when the rest of pattern failed. Then, if + * we ever find ourselves back at that WHILEM, and at string + * position 47 again, we can just fail immediately rather than + * running the rest of the pattern again. + * + * This is very handy when patterns start to go + * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up + * with a combinatorial explosion of backtracking. + * + * The cache is implemented as a bit array, with one bit per + * string byte position per WHILEM op (up to 16) - so its + * between 0.25 and 2x the string size. + * + * To avoid allocating a poscache buffer every time, we do an + * initially countdown; only after we have executed a WHILEM + * op (string-length x #WHILEMs) times do we allocate the + * cache. + * + * The top 4 bits of scan->flags byte say how many different + * relevant CURLLYX/WHILEM op pairs there are, while the + * bottom 4-bits is the identifying index number of this + * WHILEM. + */ if (scan->flags) { @@ -5433,17 +5586,18 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; - if (PL_reg_poscache) { - if ((I32)PL_reg_poscache_size < size) { - Renew(PL_reg_poscache, size, char); - PL_reg_poscache_size = size; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; + regmatch_info_aux *const aux = reginfo->info_aux; + if (aux->poscache) { + if ((SSize_t)reginfo->poscache_size < size) { + Renew(aux->poscache, size, char); + reginfo->poscache_size = size; } - Zero(PL_reg_poscache, size, char); + Zero(aux->poscache, size, char); } else { - PL_reg_poscache_size = size; - Newxz(PL_reg_poscache, size, char); + reginfo->poscache_size = size; + Newxz(aux->poscache, size, char); } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, "%swhilem: Detected a super-linear match, switching on caching%s...\n", @@ -5453,13 +5607,15 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; + + reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 + (locinput - reginfo->strbeg) * (scan->flags>>4); mask = 1 << (offset % 8); offset /= 8; - if (PL_reg_poscache[offset] & mask) { + if (reginfo->info_aux->poscache[offset] & mask) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, "%*s whilem: (cache) already tried at this position...\n", REPORT_CODE_OFF+depth*2, "") @@ -5776,7 +5932,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5790,7 +5946,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6200,8 +6356,8 @@ NULL DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", PL_colors[4], - (long)(locinput - PL_reg_starttry), - (long)(reginfo->till - PL_reg_starttry), + (long)(locinput - startpos), + (long)(reginfo->till - startpos), PL_colors[5])); sayNO_SILENT; /* Cannot match: too short. */ @@ -6523,7 +6679,7 @@ yes: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", PL_colors[4], PL_colors[5])); - if (reginfo->eval_state) { + if (reginfo->info_aux_eval) { /* each successfully executed (?{...}) block does the equivalent of * local $^R = do {...} * When popping the save stack, all these locals would be undone; @@ -6635,7 +6791,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = *startposp; if (max == REG_INFTY) max = I32_MAX; - else if (! utf8_target && scan + max < loceol) + else if (! utf8_target && loceol - scan > max) loceol = scan + max; /* Here, for the case of a non-UTF-8 target we have adjusted down @@ -6684,7 +6840,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = loceol; break; case CANY: /* Move forward bytes, unless goes off end */ - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* hadn't been adjusted in the UTF-8 case */ scan += max; @@ -6703,7 +6859,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's * true iff it doesn't matter if the argument is in UTF-8 or not */ if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust because is UTF-8, but ok to do so, * since here, to match at all, 1 char == 1 byte */ loceol = scan + max; @@ -6731,7 +6887,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6758,8 +6914,11 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: @@ -6767,12 +6926,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, utf8_flags = FOLDEQ_UTF8_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6883,7 +7042,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXA: - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust at the beginning of this routine * because is UTF-8, but it is actually ok to do so, since here, to @@ -6958,8 +7117,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7144,15 +7303,18 @@ STATIC SV * S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) { /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); @@ -7188,7 +7350,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } else if (si && doinit) { @@ -7205,16 +7367,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { SV* matches_string = newSVpvn("", 0); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7386,7 +7550,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7421,7 +7585,7 @@ S_reghop3(U8 *s, I32 off, const U8* lim) we ifdef it out - dmq */ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7448,7 +7612,7 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) #endif STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7491,9 +7655,6 @@ S_reghopmaybe3(U8* s, I32 off, const U8* lim) * save the old values of subbeg etc of the current regex, and set then to the current string (again, this is normally only done at the end of execution) - - It also sets up a destructor so that all this will be cleared up if - we die. */ static void @@ -7501,14 +7662,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) { MAGIC *mg; regexp *const rex = ReANY(reginfo->prog); - regmatch_eval_state *eval_state; - - Newx(eval_state, 1, regmatch_eval_state); - assert(!reginfo->eval_state); - reginfo->eval_state = eval_state; + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; - eval_state->restored = FALSE; - eval_state->direct = FALSE; eval_state->rex = rex; if (reginfo->sv) { @@ -7518,24 +7673,23 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) DEFSV_set(reginfo->sv); } - if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv) - && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) { + if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ -#ifdef PERL_OLD_COPY_ON_WRITE - if (SvIsCOW(reginfo->sv)) - sv_force_normal_flags(reginfo->sv, 0); -#endif - mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global, - &PL_vtbl_mglob, NULL, 0); + mg = sv_magicext_mglob(reginfo->sv); mg->mg_len = -1; } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; if (!PL_reg_curpm) { + /* PL_reg_curpm is a fake PMOP that we can attach the current + * regex to and point PL_curpm at, so that $1 et al are visible + * within a /(?{})/. It's just allocated once per interpreter the + * first time its needed */ Newxz(PL_reg_curpm, 1, PMOP); #ifdef USE_ITHREADS { @@ -7570,41 +7724,63 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) rex->suboffset = 0; rex->subcoffset = 0; rex->sublen = reginfo->strend - reginfo->strbeg; - SAVEDESTRUCTOR_X(S_restore_eval_state, eval_state); } -/* undo the effects of S_setup_eval_state() - can either be called - * directly, or via a destructor. If we get called directly, we'll still - * get called again later from the destructor */ + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ static void -S_restore_eval_state(pTHX_ void *arg) +S_cleanup_regmatch_info_aux(pTHX_ void *arg) { dVAR; - regmatch_eval_state * const eval_state = (regmatch_eval_state *)arg; - regexp * const rex = eval_state->rex; - - if (!eval_state->restored) { - if (eval_state->subbeg) { - rex->subbeg = eval_state->subbeg; - rex->sublen = eval_state->sublen; - rex->suboffset = eval_state->suboffset; - rex->subcoffset = eval_state->subcoffset; + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + regmatch_slab *s; + + Safefree(aux->poscache); + + if (eval_state) { + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; #ifdef PERL_ANY_COW - rex->saved_copy = eval_state->saved_copy; + rex->saved_copy = eval_state->saved_copy; #endif - RXp_MATCH_COPIED_on(rex); - } + RXp_MATCH_COPIED_on(rex); + } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; - PL_curpm = eval_state->curpm; - eval_state->restored = TRUE; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } + + PL_curpm = eval_state->curpm; + } + + PL_regmatch_state = aux->old_regmatch_state; + PL_regmatch_slab = aux->old_regmatch_slab; + + /* free all slabs above current one - this must be the last action + * of this function, as aux and eval_state are allocated within + * slabs and may be freed here */ + + s = PL_regmatch_slab->next; + if (s) { + PL_regmatch_slab->next = NULL; + while (s) { + regmatch_slab * const osl = s; + s = s->next; + Safefree(osl); + } } - if (eval_state->direct) - eval_state->direct = FALSE; - else - /* we're being called from a destructor rather than directly */ - Safefree(eval_state); }