From bf2039a952dec80d770a6793104c0280373be103 Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Thu, 30 May 2013 16:27:09 +0100 Subject: [PATCH] unify regmatch_info data Previously the regmatch_info struct was allocated as a local var on the C stack, while some extra state (only needed for regexes having (?{})) was malloced (as a regmatch_eval_state struct) as needed - and a destructor set up to clean it up afterwards. This being because the stuff being cleaned up couldn't be allocated on the C stack as it needed to hang around after a croak(). Reorganise this so that: * regmatch_info is on the C stack as before. * a new struct, regmatch_info_aux is allocated within the first slot of the regmatch_state stack, for fields which must always exist but which need cleanup afterwards. This is currently unused, but will be shortly. * a new struct, regmatch_info_aux_eval (which is just a renamed regmatch_eval_state struct), is optionally allocated in the second slot of regmatch_state. This is logically part of regmatch_info_aux, except that splitting it in two stops it being too large to fit in a regmatch_state slot (we can fit it in two instead). (The second and third structs aren't allocated when we're intuit() rather than regexec()). Doing it like this simplifies allocation and cleanup: there's no need for a malloc(), and we are already going to allocate a slab's worth of regmatch_state slots, so using an extra one of two of them is effectively free; and the cleanup just requires calling a single overall destructor. In the next few commits, more of the regexec() state setup and tear-down will be integrated into this new regime. And in particular, the new regmatch_info_aux struct will give us somewhere to hang things like PL_reg_poscache once it stops being global (it being local state that needs cleanup). --- regexec.c | 123 +++++++++++++++++++++++++++++++++----------------------------- regexp.h | 54 ++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 68 deletions(-) diff --git a/regexec.c b/regexec.c index dc4e17f..70c90cb 100644 --- a/regexec.c +++ b/regexec.c @@ -250,8 +250,9 @@ static const char* const non_utf8_target_but_utf8_required #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); -static void S_restore_eval_state(pTHX_ void *arg); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); static void S_clear_backtrack_stack(pTHX_ void *p); +static regmatch_state * S_push_slab(pTHX); #define REGCP_PAREN_ELEMS 3 #define REGCP_OTHER_ELEMS 3 @@ -652,7 +653,7 @@ Perl_re_intuit_start(pTHX_ goto fail; } - reginfo->eval_state = NULL; + reginfo->info_aux = NULL; reginfo->strbeg = strbeg; reginfo->strend = strend; reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); @@ -2097,7 +2098,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, multiline = prog->extflags & RXf_PMf_MULTILINE; - reginfo->eval_state = NULL; reginfo->prog = rx; /* Yes, sorry that this is confusing. */ reginfo->intuit = 0; reginfo->is_utf8_target = cBOOL(utf8_target); @@ -2113,10 +2113,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* note current PL_regmatch_state position; at end of match we'll * pop back to there and free any higher slabs */ oldsave = PL_savestack_ix; - SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL); - SAVEVPTR(PL_regmatch_slab); - SAVEVPTR(PL_regmatch_state); - DEBUG_EXECUTE_r( debug_start_match(rx, utf8_target, startpos, strend, @@ -2151,8 +2147,35 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* see how far we have to get to not match where we matched before */ reginfo->till = startpos+minend; - if ((prog->extflags & RXf_EVAL_SEEN)) + SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL); + SAVEVPTR(PL_regmatch_slab); + SAVEVPTR(PL_regmatch_state); + + /* grab next slot in regmatch_state stack to store regmatch_info_aux + * struct */ + + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + + if ((prog->extflags & RXf_EVAL_SEEN)) { + /* grab next slot in regmatch_state stack to store + * regmatch_info_aux_eval struct */ + + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + S_setup_eval_state(aTHX_ reginfo); + } + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; /* If there is a "must appear" string, look for it. */ @@ -2628,12 +2651,10 @@ got_it: ); Safefree(swap); - if (reginfo->eval_state) { - reginfo->eval_state->direct = TRUE; - S_restore_eval_state(aTHX_ reginfo->eval_state); - } + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ - /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); if (RXp_PAREN_NAMES(prog)) @@ -2766,12 +2787,10 @@ phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); - if (reginfo->eval_state) { - reginfo->eval_state->direct = TRUE; - S_restore_eval_state(aTHX_ reginfo->eval_state); - } + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ - /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); if (swap) { @@ -2792,7 +2811,7 @@ phooey: /* Set which rex is pointed to by PL_reg_state, handling ref counting. * Do inc before dec, in case old and new rex are the same */ #define SET_reg_curpm(Re2) \ - if (reginfo->eval_state) { \ + if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ PM_SETRE((PL_reg_curpm), (Re2)); \ @@ -4939,8 +4958,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); rex->offs[0].end = locinput - reginfo->strbeg; - if (reginfo->eval_state->pos_magic) - reginfo->eval_state->pos_magic->mg_len + if (reginfo->info_aux_eval->pos_magic) + reginfo->info_aux_eval->pos_magic->mg_len = locinput - reginfo->strbeg; if (sv_yes_mark) { @@ -6524,7 +6543,7 @@ yes: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", PL_colors[4], PL_colors[5])); - if (reginfo->eval_state) { + if (reginfo->info_aux_eval) { /* each successfully executed (?{...}) block does the equivalent of * local $^R = do {...} * When popping the save stack, all these locals would be undone; @@ -7492,9 +7511,6 @@ S_reghopmaybe3(U8* s, I32 off, const U8* lim) * save the old values of subbeg etc of the current regex, and set then to the current string (again, this is normally only done at the end of execution) - - It also sets up a destructor so that all this will be cleared up if - we die. */ static void @@ -7502,14 +7518,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) { MAGIC *mg; regexp *const rex = ReANY(reginfo->prog); - regmatch_eval_state *eval_state; - - Newx(eval_state, 1, regmatch_eval_state); - assert(!reginfo->eval_state); - reginfo->eval_state = eval_state; + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; - eval_state->restored = FALSE; - eval_state->direct = FALSE; eval_state->rex = rex; if (reginfo->sv) { @@ -7571,41 +7581,38 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) rex->suboffset = 0; rex->subcoffset = 0; rex->sublen = reginfo->strend - reginfo->strbeg; - SAVEDESTRUCTOR_X(S_restore_eval_state, eval_state); } -/* undo the effects of S_setup_eval_state() - can either be called - * directly, or via a destructor. If we get called directly, we'll still - * get called again later from the destructor */ + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ static void -S_restore_eval_state(pTHX_ void *arg) +S_cleanup_regmatch_info_aux(pTHX_ void *arg) { dVAR; - regmatch_eval_state * const eval_state = (regmatch_eval_state *)arg; - regexp * const rex = eval_state->rex; - - if (!eval_state->restored) { - if (eval_state->subbeg) { - rex->subbeg = eval_state->subbeg; - rex->sublen = eval_state->sublen; - rex->suboffset = eval_state->suboffset; - rex->subcoffset = eval_state->subcoffset; + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + + if (!eval_state) + return; + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; #ifdef PERL_ANY_COW - rex->saved_copy = eval_state->saved_copy; + rex->saved_copy = eval_state->saved_copy; #endif - RXp_MATCH_COPIED_on(rex); - } - if (eval_state->pos_magic) - eval_state->pos_magic->mg_len = eval_state->pos; - PL_curpm = eval_state->curpm; - eval_state->restored = TRUE; + RXp_MATCH_COPIED_on(rex); } - if (eval_state->direct) - eval_state->direct = FALSE; - else - /* we're being called from a destructor rather than directly */ - Safefree(eval_state); + if (eval_state->pos_magic) + eval_state->pos_magic->mg_len = eval_state->pos; + + PL_curpm = eval_state->curpm; } diff --git a/regexp.h b/regexp.h index 51bba06..cedeb56 100644 --- a/regexp.h +++ b/regexp.h @@ -577,11 +577,9 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) #define FBMrf_MULTILINE 1 -/* saved state when executing a regex that contains code blocks. - * These need restoring at the end of the match, or on croak(). - * These fields can't be stored in regmatch_info since the latter is - * allocated directly on the stack in regexec_flags(), and they need to - * exist during a croak() after the stack has been unwound */ +/* like regmatch_info_aux, but contains extra fields only needed if the + * pattern contains (?{}). If used, is snuck into the second slot in the + * regmatch_state stack at the start of execution */ typedef struct { regexp *rex; @@ -595,12 +593,24 @@ typedef struct { STRLEN subcoffset; /* saved subcoffset field from rex */ MAGIC *pos_magic; /* pos() magic attached to $_ */ I32 pos; /* the original value of pos() in pos_magic */ - bool restored; /* we have already undone the save */ - bool direct; /* we are calling the destructor directly */ -} regmatch_eval_state; +} regmatch_info_aux_eval; + + +/* fields that logically live in regmatch_info, but which need cleaning + * up on croak(), and so are instead are snuck into the first slot in + * the regmatch_state stack at the start of execution */ + +typedef struct { + regmatch_info_aux_eval *info_aux_eval; +} regmatch_info_aux; + /* some basic information about the current match that is created by - * Perl_regexec_flags and then passed to regtry(), regmatch() etc */ + * Perl_regexec_flags and then passed to regtry(), regmatch() etc. + * It is allocated as a local var on the stack, so nothing should be + * stored in it that needs preserving or clearing up on croak(). + * For that, see the aux_info and aux_info_eval members of the + * regmatch_state union. */ typedef struct { REGEXP *prog; @@ -610,7 +620,8 @@ typedef struct { SV *sv; char *ganch; char *cutpoint; - regmatch_eval_state *eval_state; /* extra saved state for (?{}) */ + regmatch_info_aux *info_aux; /* extra fields that need cleanup */ + regmatch_info_aux_eval *info_aux_eval; /* extra saved state for (?{}) */ I32 poscache_maxiter; /* how many whilems todo before S-L cache kicks in */ I32 poscache_iter; /* current countdown from _maxiter to zero */ bool intuit; /* re_intuit_start() is the top-level caller */ @@ -634,6 +645,29 @@ typedef struct regmatch_state { union { + /* the 'info_aux' and 'info_aux_eval' union members are cuckoos in + * the nest. They aren't saved backtrack state; rather they + * represent one or two extra chunks of data that need allocating + * at the start of a match. These fields would logically live in + * the regmatch_info struct, except that is allocated on the + * C stack, and these fields are all things that require cleanup + * after a croak(), when the stack is lost. + * As a convenience, we just use the first 1 or 2 regmatch_state + * slots to store this info, as we will be allocating a slab of + * these anyway. Otherwise we'd have to malloc and then free them, + * or allocate them on the save stack (where they will get + * realloced if the save stack grows). + * info_aux contains the extra fields that are always needed; + * info_aux_eval contains extra fields that only needed if + * the pattern contains code blocks + * We split them into two separate structs to avoid increasing + * the size of the union. + */ + + regmatch_info_aux info_aux; + + regmatch_info_aux_eval info_aux_eval; + /* this is a fake union member that matches the first element * of each member that needs to store positive backtrack * information */ -- 1.8.3.1