X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d24ca0c5f11250dcd2552c84a048bda5786ba8d1..cd94d768f5ef1ad750828e00ebe7ea2e19a1b237:/regexec.c diff --git a/regexec.c b/regexec.c index f94d15a..040e16a 100644 --- a/regexec.c +++ b/regexec.c @@ -80,6 +80,9 @@ # include "regcomp.h" #endif +#include "inline_invlist.c" +#include "utf8_strings.h" + #define RF_tainted 1 /* tainted information used? e.g. locale */ #define RF_warned 2 /* warned about big count? */ @@ -87,9 +90,6 @@ #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) -#define RS_init 1 /* eval environment created */ -#define RS_set 2 /* replsv value is set */ - #ifndef STATIC #define STATIC static #endif @@ -122,19 +122,13 @@ #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) /* these are unrolled below in the CCC_TRY_XXX defined */ -#ifdef EBCDIC - /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just - * skip the check on EBCDIC platforms */ -# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class) -#else -# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ bool ok; \ ENTER; save_re_context(); \ ok=CAT2(is_utf8_,class)((const U8*)str); \ + PERL_UNUSED_VAR(ok); \ assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END -#endif - /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ @@ -148,21 +142,17 @@ #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ - LOAD_UTF8_CHARCLASS(X_begin, " "); \ - LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \ - /* These are utf8 constants, and not utf-ebcdic constants, so the \ - * assert should likely and hopefully fail on an EBCDIC machine */ \ - LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \ - \ - /* No asserts are done for these, in case called on an early \ - * Unicode version in which they map to nothing */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ + /* No asserts are done for some of these, in case called on a */ \ + /* Unicode version in which they map to nothing */ \ + LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \ + LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \ + LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \ + LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8) #define PLACEHOLDER /* Something for the preprocessor to grab onto */ @@ -335,23 +325,25 @@ static void restore_pos(pTHX_ void *arg); -#define REGCP_PAREN_ELEMS 4 -#define REGCP_OTHER_ELEMS 5 +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 #define REGCP_FRAME_ELEMS 1 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and * are needed for the regexp context stack bookkeeping. */ STATIC CHECKPOINT -S_regcppush(pTHX_ I32 parenfloor) +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor) { dVAR; const int retval = PL_savestack_ix; const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; - int p; + I32 p; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGCPPUSH; + if (paren_elems_to_push < 0) Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", paren_elems_to_push); @@ -363,25 +355,31 @@ S_regcppush(pTHX_ I32 parenfloor) SSGROW(total_elems + REGCP_FRAME_ELEMS); - for (p = PL_regsize; p > parenfloor; p--) { + DEBUG_BUFFERS_r( + if ((int)PL_regsize > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)PL_regsize; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(PL_regoffs[p].end); - SSPUSHINT(PL_regoffs[p].start); - SSPUSHPTR(PL_reg_start_tmp[p]); - SSPUSHINT(p); + SSPUSHINT(rex->offs[p].end); + SSPUSHINT(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, - " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n", - (UV)p, (IV)PL_regoffs[p].start, - (IV)(PL_reg_start_tmp[p] - PL_bostr), - (IV)PL_regoffs[p].end + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end )); } /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ - SSPUSHPTR(PL_regoffs); SSPUSHINT(PL_regsize); - SSPUSHINT(*PL_reglastparen); - SSPUSHINT(*PL_reglastcloseparen); - SSPUSHPTR(PL_reginput); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ return retval; @@ -403,12 +401,19 @@ S_regcppush(pTHX_ I32 parenfloor) (IV)(cp), (IV)PL_savestack_ix)); \ regcpblow(cp) -STATIC char * -S_regcppop(pTHX_ const regexp *rex) +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex) { dVAR; UV i; - char *input; + U32 paren; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCPPOP; @@ -417,38 +422,38 @@ S_regcppop(pTHX_ const regexp *rex) i = SSPOPUV; assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ - input = (char *) SSPOPPTR; - *PL_reglastcloseparen = SSPOPINT; - *PL_reglastparen = SSPOPINT; + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; PL_regsize = SSPOPINT; - PL_regoffs=(regexp_paren_pair *) SSPOPPTR; i -= REGCP_OTHER_ELEMS; /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = PL_regsize; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { I32 tmps; - U32 paren = (U32)SSPOPINT; - PL_reg_start_tmp[paren] = (char *) SSPOPPTR; - PL_regoffs[paren].start = SSPOPINT; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPINT; tmps = SSPOPINT; - if (paren <= *PL_reglastparen) - PL_regoffs[paren].end = tmps; - DEBUG_BUFFERS_r( - PerlIO_printf(Perl_debug_log, - " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", - (UV)paren, (IV)PL_regoffs[paren].start, - (IV)(PL_reg_start_tmp[paren] - PL_bostr), - (IV)PL_regoffs[paren].end, - (paren > *PL_reglastparen ? "(no)" : "")); + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); ); + paren--; } - DEBUG_BUFFERS_r( - if (*PL_reglastparen + 1 <= rex->nparens) { - PerlIO_printf(Perl_debug_log, - " restoring \\%"IVdf"..\\%"IVdf" to undef\n", - (IV)(*PL_reglastparen + 1), (IV)rex->nparens); - } - ); #if 1 /* It would seem that the similar code in regtry() * already takes care of this, and in fact it is in @@ -459,13 +464,29 @@ S_regcppop(pTHX_ const regexp *rex) * this code seems to be necessary or otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * --jhi updated by dapm */ - for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { if (i > PL_regsize) - PL_regoffs[i].start = -1; - PL_regoffs[i].end = -1; + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > PL_regsize) ? "-1" : " " + )); } #endif - return input; +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex); + PL_savestack_ix = tmpix; } #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ @@ -550,16 +571,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { dVAR; struct regexp *const prog = (struct regexp *)SvANY(rx); - register I32 start_shift = 0; + I32 start_shift = 0; /* Should be nonnegative! */ - register I32 end_shift = 0; - register char *s; - register SV *check; + I32 end_shift = 0; + char *s; + SV *check; char *strbeg; char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ I32 ml_anch; - register char *other_last = NULL; /* other substr checked before this */ + char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; @@ -1400,12 +1421,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const U8 *fold_array; /* array for folding ords < 256 */ STRLEN ln; STRLEN lnc; - register STRLEN uskip; + STRLEN uskip; U8 c1; U8 c2; char *e; - register I32 tmp = 1; /* Scratch variable? */ - register const bool utf8_target = PL_reg_match_utf8; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = PL_reg_match_utf8; UV utf8_fold_flags = 0; RXi_GET_DECL(prog,progi); @@ -1785,6 +1806,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !is_HORIZWS_latin1(s) ); break; + case POSIXA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. The flag in this node type is the + * class number to pass to _generic_isCC() to build a mask for + * searching in PL_charclass[] */ + REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c))); + break; + case NPOSIXA: + REXEC_FBC_CSCAN( + !_generic_isCC_A(*s, FLAGS(c)), + !_generic_isCC_A(*s, FLAGS(c)) + ); + break; + case AHOCORASICKC: case AHOCORASICK: { @@ -2027,7 +2062,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dVAR; struct regexp *const prog = (struct regexp *)SvANY(rx); /*register*/ char *s; - register regnode *c; + regnode *c; /*register*/ char *startpos = stringarg; I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ @@ -2074,7 +2109,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre } PL_reg_flags = 0; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_reg_maxiter = 0; if (RX_UTF8(rx)) @@ -2140,6 +2175,12 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre swap = prog->offs; /* do we need a save destructor here for eval dies? */ Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); } if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { re_scream_pos_data d; @@ -2500,10 +2541,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre goto phooey; got_it: + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); Safefree(swap); RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (RXp_PAREN_NAMES(prog)) (void)hv_iterinit(RXp_PAREN_NAMES(prog)); @@ -2544,10 +2593,16 @@ got_it: phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (swap) { /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); Safefree(prog->offs); prog->offs = swap; } @@ -2556,6 +2611,16 @@ phooey: } +/* Set which rex is pointed to by PL_reg_state, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (PL_reg_state.re_state_eval_setup_done) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + /* - regtry - try match at specific point */ @@ -2573,22 +2638,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) reginfo->cutpoint=NULL; - if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { + if ((prog->extflags & RXf_EVAL_SEEN) + && !PL_reg_state.re_state_eval_setup_done) + { MAGIC *mg; - PL_reg_eval_set = RS_init; - DEBUG_EXECUTE_r(DEBUG_s( - PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", - (IV)(PL_stack_sp - PL_stack_base)); - )); - SAVESTACK_CXPOS(); - cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; - /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ - SAVETMPS; - /* Apparently this is not needed, judging by wantarray. */ - /* SAVEI8(cxstack[cxstack_ix].blk_gimme); - cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ - + PL_reg_state.re_state_eval_setup_done = TRUE; if (reginfo->sv) { /* Make $_ available to executed code. */ if (reginfo->sv != DEFSV) { @@ -2624,16 +2679,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) } #endif } -#ifdef USE_ITHREADS - /* It seems that non-ithreads works both with and without this code. - So for efficiency reasons it seems best not to have the code - compiled when it is not needed. */ - /* This is safe against NULLs: */ - ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); - /* PM_reg_curpm owns a reference to this regexp. */ - (void)ReREFCNT_inc(rx); -#endif - PM_SETRE(PL_reg_curpm, rx); + SET_reg_curpm(rx); PL_reg_oldcurpm = PL_curpm; PL_curpm = PL_reg_curpm; if (RXp_MATCH_COPIED(prog)) { @@ -2652,30 +2698,22 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) prog->subbeg = PL_bostr; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } - DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); +#ifdef DEBUGGING + PL_reg_starttry = *startpos; +#endif prog->offs[0].start = *startpos - PL_bostr; PL_reginput = *startpos; - PL_reglastparen = &prog->lastparen; - PL_reglastcloseparen = &prog->lastcloseparen; prog->lastparen = 0; prog->lastcloseparen = 0; PL_regsize = 0; - PL_regoffs = prog->offs; - if (PL_reg_start_tmpl <= prog->nparens) { - PL_reg_start_tmpl = prog->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } /* XXXX What this code is doing here?!!! There should be no need - to do this again and again, PL_reglastparen should take care of + to do this again and again, prog->lastparen should take care of this! --ilya*/ /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. * Actually, the code in regcppop() (which Ilya may be meaning by - * PL_reglastparen), is not needed at all by the test suite + * prog->lastparen), is not needed at all by the test suite * (op/regexp, op/pat, op/split), but that code is needed otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * Meanwhile, this code *is* needed for the @@ -2684,9 +2722,9 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) * --jhi updated by dapm */ #if 1 if (prog->nparens) { - regexp_paren_pair *pp = PL_regoffs; - register I32 i; - for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { ++pp; pp->start = -1; pp->end = -1; @@ -2695,7 +2733,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) #endif REGCP_SET(lastcp); if (regmatch(reginfo, progi->program + 1)) { - PL_regoffs[0].end = PL_reginput - PL_bostr; + prog->offs[0].end = PL_reginput - PL_bostr; return 1; } if (reginfo->cutpoint) @@ -3031,8 +3069,8 @@ S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; for ( n=0; n= nums[n] && - PL_regoffs[nums[n]].end != -1) + if ((I32)rex->lastparen >= nums[n] && + rex->offs[nums[n]].end != -1) { return nums[n]; } @@ -3060,10 +3098,6 @@ S_clear_backtrack_stack(pTHX_ void *p) } -#define SETREX(Re1,Re2) \ - if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \ - Re1 = (Re2) - STATIC I32 /* 0 failure, 1 success */ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { @@ -3071,21 +3105,21 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) dMY_CXT; #endif dVAR; - register const bool utf8_target = PL_reg_match_utf8; + const bool utf8_target = PL_reg_match_utf8; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; regexp *rex = (struct regexp *)SvANY(rex_sv); RXi_GET_DECL(rex,rexi); I32 oldsave; /* the current state. This is a cached copy of PL_regmatch_state */ - register regmatch_state *st; + regmatch_state *st; /* cache heavy used fields of st in registers */ - register regnode *scan; - register regnode *next; - register U32 n = 0; /* general value; init to avoid compiler warning */ - register I32 ln = 0; /* len or last; init to avoid compiler warning */ - register char *locinput = PL_reginput; - register I32 nextchr; /* is always set to UCHARAT(locinput) */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + I32 ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = PL_reginput; + I32 nextchr; /* is always set to UCHARAT(locinput) */ bool result = 0; /* return value of S_regmatch */ int depth = 0; /* depth of backtrack stack */ @@ -3125,11 +3159,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) false: plain (?=foo) true: used as a condition: (?(?=foo)) */ - PAD* const initial_pad = PL_comppad; + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + #ifdef DEBUGGING GET_RE_DEBUG_FLAGS_DECL; #endif + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + PERL_ARGS_ASSERT_REGMATCH; DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ @@ -3179,10 +3227,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) reenter_switch: - assert(PL_reglastparen == &rex->lastparen); - assert(PL_reglastcloseparen == &rex->lastcloseparen); - assert(PL_regoffs == rex->offs); - switch (state_num) { case BOL: if (locinput == PL_bostr) @@ -3209,14 +3253,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case KEEPS: /* update the startpoint */ - st->u.keeper.val = PL_regoffs[0].start; + st->u.keeper.val = rex->offs[0].start; PL_reginput = locinput; - PL_regoffs[0].start = locinput - PL_bostr; + rex->offs[0].start = locinput - PL_bostr; PUSH_STATE_GOTO(KEEPS_next, next); /*NOT-REACHED*/ case KEEPS_next_fail: /* rollback the start point change */ - PL_regoffs[0].start = st->u.keeper.val; + rex->offs[0].start = st->u.keeper.val; sayNO_SILENT; /*NOT-REACHED*/ case EOL: @@ -3279,7 +3323,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) ); sayNO_SILENT; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* FALL THROUGH */ case TRIE: @@ -3366,7 +3410,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U32 charcount = 0; /* how many input chars we have matched */ U32 accepted = 0; /* have we seen any accepting states? */ - ST.B = next; ST.jump = trie->jump; ST.me = scan; ST.firstpos = NULL; @@ -3465,14 +3508,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ); goto trie_first_try; /* jump into the fail handler */ }} - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case TRIE_next_fail: /* we failed - try next alternative */ if ( ST.jump) { REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); } if (!--ST.accepted) { DEBUG_EXECUTE_r({ @@ -3487,10 +3528,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { /* Find next-highest word to process. Note that this code * is O(N^2) per trie run (O(N) per branch), so keep tight */ - register U16 min = 0; - register U16 word; - register U16 const nextword = ST.nextword; - register reg_trie_wordinfo * const wordinfo + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; for (word=ST.topword; word; word=wordinfo[word].prev) { if (word > nextword && (!min || word < min)) @@ -3506,7 +3547,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } if ( ST.jump) { - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; REGCP_SET(ST.cp); } @@ -3563,9 +3605,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PL_reginput = (char *)uc; } - scan = (ST.jump && ST.jump[ST.nextword]) - ? ST.me + ST.jump[ST.nextword] - : ST.B; + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); DEBUG_EXECUTE_r({ PerlIO_printf( Perl_debug_log, @@ -3579,7 +3621,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (ST.accepted > 1 || has_cutgroup) { PUSH_STATE_GOTO(TRIE_next, scan); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* only one choice left - just continue */ DEBUG_EXECUTE_r({ @@ -3604,7 +3646,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) locinput = PL_reginput; nextchr = UCHARAT(locinput); continue; /* execute rest of RE */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST case EXACT: { @@ -3845,6 +3887,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) DIGITA, NDIGITA, isDIGIT_A, digit, "0"); + case POSIXA: + if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + /* Matched a utf8-invariant, so don't have to worry about utf8 */ + nextchr = UCHARAT(++locinput); + break; + case NPOSIXA: + if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + } + else { + nextchr = UCHARAT(++locinput); + } + break; + case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ /* From http://www.unicode.org/reports/tr29 (5.2 version). An @@ -3854,43 +3916,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) | Prepend* Begin Extend* | . - Begin is (Hangul-syllable | ! Control) - Extend is (Grapheme_Extend | Spacing_Mark) - Control is [ GCB_Control CR LF ] - - The discussion below shows how the code for CLUMP is derived - from this regex. Note that most of these concepts are from - property values of the Grapheme Cluster Boundary (GCB) property. - No code point can have multiple property values for a given - property. Thus a code point in Prepend can't be in Control, but - it must be in !Control. This is why Control above includes - GCB_Control plus CR plus LF. The latter two are used in the GCB - property separately, and so can't be in GCB_Control, even though - they logically are controls. Control is not the same as gc=cc, - but includes format and other characters as well. - - The Unicode definition of Hangul-syllable is: - L+ - | (L* ( ( V | LV ) V* | LVT ) T*) - | T+ - ) - Each of these is a value for the GCB property, and hence must be - disjoint, so the order they are tested is immaterial, so the - above can safely be changed to - T+ - | L+ - | (L* ( LVT | ( V | LV ) V*) T*) - - The last two terms can be combined like this: - L* ( L - | (( LVT | ( V | LV ) V*) T*)) - - And refactored into this: - L* (L | LVT T* | V V* T* | LV V* T*) - - That means that if we have seen any L's at all we can quit - there, but if the next character is an LVT, a V, or an LV we - should keep going. + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control CR LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) + + If we create a 'Regular_Begin' = Begin - Special_Begin, then + we can rewrite + + Begin is ( Regular_Begin + Special Begin ) + + It turns out that 98.4% of all Unicode code points match + Regular_Begin. Doing it this way eliminates a table match in + the previouls implementation for almost all Unicode code points. There is a subtlety with Prepend* which showed up in testing. Note that the Begin, and only the Begin is required in: @@ -3930,21 +3969,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) LOAD_UTF8_CHARCLASS_GCB(); - /* Match (prepend)* */ - while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_prepend, - (U8*)locinput, utf8_target)) - { - previous_prepend = locinput; - locinput += UTF8SKIP(locinput); - } + /* Match (prepend)*, but don't bother trying if empty (as + * being set to _undef indicates) */ + if (PL_utf8_X_prepend != &PL_sv_undef) { + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_prepend, + (U8*)locinput, utf8_target)) + { + previous_prepend = locinput; + locinput += UTF8SKIP(locinput); + } + } /* As noted above, if we matched a prepend character, but * the next thing won't match, back off the last prepend we * matched, as it is guaranteed to match the begin */ if (previous_prepend && (locinput >= PL_regeol - || ! swash_fetch(PL_utf8_X_begin, + || ! swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target))) { locinput = previous_prepend; @@ -3955,32 +3997,38 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * moved locinput forward, we tested the result just above * and it either passed, or we backed off so that it will * now pass */ - if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) { + if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); + } + else if (! swash_fetch(PL_utf8_X_special_begin, + (U8*)locinput, utf8_target)) + { /* Here did not match the required 'Begin' in the * second term. So just match the very first * character, the '.' of the final term of the regex */ locinput = starting + UTF8SKIP(starting); + goto exit_utf8; } else { - /* Here is the beginning of a character that can have - * an extender. It is either a hangul syllable, or a - * non-control */ - if (swash_fetch(PL_utf8_X_non_hangul, - (U8*)locinput, utf8_target)) - { - - /* Here not a Hangul syllable, must be a - * ('! * Control') */ - locinput += UTF8SKIP(locinput); - } else { - - /* Here is a Hangul syllable. It can be composed - * of several individual characters. One - * possibility is T+ */ - if (swash_fetch(PL_utf8_X_T, + /* Here is a special begin. It can be composed + * of several individual characters. One + * possibility is RI+ */ + if (swash_fetch(PL_utf8_X_RI, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_RI, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } else /* Another possibility is T+ */ + if (swash_fetch(PL_utf8_X_T, (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); while (locinput < PL_regeol && swash_fetch(PL_utf8_X_T, (U8*)locinput, utf8_target)) @@ -3989,9 +4037,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { - /* Here, not T+, but is a Hangul. That means - * it is one of the others: L, LV, LVT or V, - * and matches: + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: + * L, LV, LVT or V, and matches: * L* (L | LVT T* | V V* T* | LV V* T*) */ /* Match L* */ @@ -4015,9 +4063,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Otherwise keep going. Must be LV, LVT * or V. See if LVT */ - if (swash_fetch(PL_utf8_X_LVT, - (U8*)locinput, utf8_target)) - { + if (is_utf8_X_LVT((U8*)locinput)) { locinput += UTF8SKIP(locinput); } else { @@ -4043,7 +4089,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } } - } + } /* Match any extender */ while (locinput < PL_regeol @@ -4052,8 +4098,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { locinput += UTF8SKIP(locinput); } - } } + exit_utf8: if (locinput > PL_regeol) sayNO; } nextchr = UCHARAT(locinput); @@ -4151,11 +4197,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) n = ARG(scan); /* which paren pair */ do_nref_ref_common: - ln = PL_regoffs[n].start; + ln = rex->offs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ - if (*PL_reglastparen < n || ln == -1) + if (rex->lastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ - if (ln == PL_regoffs[n].end) + if (ln == rex->offs[n].end) break; s = PL_bostr + ln; @@ -4169,7 +4215,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * not going off the end given by PL_regeol, and returns in * limit upon success, how much of the current input was * matched */ - if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target, + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) { sayNO; @@ -4184,7 +4230,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) (type == REF || UCHARAT(s) != fold_array[nextchr])) sayNO; - ln = PL_regoffs[n].end - ln; + ln = rex->offs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF @@ -4225,7 +4271,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re_sv = rex_sv; re = rex; rei = rexi; - (void)ReREFCNT_inc(rex_sv); if (OP(scan)==GOSUB) { startpoint = scan + ARG2L(scan); ST.close_paren = ARG(scan); @@ -4234,7 +4279,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.close_paren = 0; } goto eval_recurse_doit; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ if (cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) @@ -4244,13 +4289,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } { /* execute the code in the {...} */ + dSP; - SV ** const before = SP; - OP_4tree * const oop = PL_op; + SV ** before; + OP * const oop = PL_op; COP * const ocurcop = PL_curcop; - PAD *old_comppad, *new_comppad; + OP *nop; char *saved_regeol = PL_regeol; struct re_save_state saved_state; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0); + REGCP_SET(runops_cp); /* To not corrupt the existing regex state while executing the * eval we would normally put it on the save stack, like with @@ -4267,49 +4318,100 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * variable. */ Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + PL_reg_state.re_reparsing = FALSE; + if (!caller_cv) + caller_cv = find_runcv(NULL); + n = ARG(scan); + if (rexi->data->what[n] == 'r') { /* code from an external qr */ - /* XXX assumes pad depth is 1; this isn't necessarily - * the case with recursive qr//'s */ - new_comppad = (PAD*)AvARRAY(CvPADLIST( - ((struct regexp *)SvANY( + newcv = ((struct regexp *)SvANY( (REGEXP*)(rexi->data->data[n]) ))->qr_anoncv - ))[1]; - PL_op = (OP_4tree*)rexi->data->data[n+1]; + ; + nop = (OP*)rexi->data->data[n+1]; } else if (rexi->data->what[n] == 'l') { /* literal code */ - new_comppad = initial_pad; /* the pad of the current sub */ - PL_op = (OP_4tree*)rexi->data->data[n]; + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); } else { /* literal with own CV */ assert(rexi->data->what[n] == 'L'); - new_comppad = (PAD*)AvARRAY(CvPADLIST(rex->qr_anoncv))[1]; - PL_op = (OP_4tree*)rexi->data->data[n]; + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; } - DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, - " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(PL_op)) ); - /* wrap the call in two SAVECOMPPADs. This ensures that - * when the save stack is eventually unwound, all the - * accumulated SAVEt_CLEARSV's will be processed with - * interspersed SAVEt_COMPPAD's to ensure that lexicals - * are cleared in the right pad */ - if (PL_comppad == new_comppad) - old_comppad = new_comppad; - else { - SAVECOMPPAD(); - PAD_SAVE_LOCAL(old_comppad, new_comppad); + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + I32 depth = (newcv == caller_cv) ? 0 : 1; + if (last_pushed_cv) { + CHANGE_MULTICALL_WITHDEPTH(newcv, depth); + } + else { + PUSH_MULTICALL_WITHDEPTH(newcv, depth); + } + last_pushed_cv = newcv; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = o->op_sibling; + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } } - PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; + nop = nop->op_next; + + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); sv_setsv(sv_mrk, sv_yes_mark); } + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = SP; + PL_op = nop; CALLRUNOPS(aTHX); /* Scalar context. */ SPAGAIN; if (SP == before) @@ -4319,93 +4421,107 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PUTBACK; } + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(sv)) { + MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any magic, undef warnings here */ + if (!re_sv) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ PL_op = oop; - if (old_comppad != PL_comppad) { - SAVECOMPPAD(); - PAD_RESTORE_LOCAL(old_comppad); - } PL_curcop = ocurcop; PL_regeol = saved_regeol; - if (!logical) { - /* /(?{...})/ */ - sv_setsv(save_scalar(PL_replgv), ret); + S_regcp_restore(aTHX_ rex, runops_cp); + + if (logical != 2) break; - } } - if (logical == 2) { /* Postponed subexpression: /(??{...})/ */ + + /* only /(??{})/ from now on */ logical = 0; { /* extract RE object from returned value; compiling if * necessary */ - MAGIC *mg = NULL; - REGEXP *rx = NULL; - - if (SvROK(ret)) { - SV *const sv = SvRV(ret); - if (SvTYPE(sv) == SVt_REGEXP) { - rx = (REGEXP*) sv; - } else if (SvSMAGICAL(sv)) { - mg = mg_find(sv, PERL_MAGIC_qr); - assert(mg); - } - } else if (SvTYPE(ret) == SVt_REGEXP) { - rx = (REGEXP*) ret; - } else if (SvSMAGICAL(ret)) { - if (SvGMAGICAL(ret)) { - /* I don't believe that there is ever qr magic - here. */ - assert(!mg_find(ret, PERL_MAGIC_qr)); - sv_unmagic(ret, PERL_MAGIC_qr); - } - else { - mg = mg_find(ret, PERL_MAGIC_qr); - /* testing suggests mg only ends up non-NULL for - scalars who were upgraded and compiled in the - else block below. In turn, this is only - triggered in the "postponed utf8 string" tests - in t/op/pat.t */ - } - } - - if (mg) { - rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/ - assert(rx); - } - if (rx) { - rx = reg_temp_copy(NULL, rx); + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); } else { U32 pm_flags = 0; const I32 osize = PL_regsize; - if (DO_UTF8(ret)) { - assert (SvUTF8(ret)); - } else if (SvUTF8(ret)) { - /* Not doing UTF-8, despite what the SV says. Is - this only if we're trapped in use 'bytes'? */ - /* Make a copy of the octet sequence, but without - the flag on, as the compiler now honours the - SvUTF8 flag on ret. */ + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ STRLEN len; const char *const p = SvPV(ret, len); ret = newSVpvn_flags(p, len, SVs_TEMP); } - rx = CALLREGCOMP(ret, pm_flags); + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY | SVs_GMG))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ - sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0); + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } PL_regsize = osize; + /* safe to do now that any $1 etc has been + * interpolated into the new pattern string and + * compiled */ + S_regcp_restore(aTHX_ rex, runops_cp); } - re_sv = rx; - re = (struct regexp *)SvANY(rx); + re = (struct regexp *)SvANY(re_sv); } RXp_MATCH_COPIED_off(re); re->subbeg = rex->subbeg; @@ -4417,25 +4533,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - /* borrowed from regtry */ - if (PL_reg_start_tmpl <= re->nparens) { - PL_reg_start_tmpl = re->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } eval_recurse_doit: /* Share code with GOSUB below this line */ /* run the pattern returned from (??{...}) */ - ST.cp = regcppush(0); /* Save *all* the positions. */ + ST.cp = regcppush(rex, 0); /* Save *all* the positions. */ REGCP_SET(ST.lastcp); - PL_regoffs = re->offs; /* essentially NOOP on GOSUB */ - - /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */ - PL_reglastparen = &re->lastparen; - PL_reglastcloseparen = &re->lastcloseparen; re->lastparen = 0; re->lastcloseparen = 0; @@ -4454,7 +4557,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.prev_rex = rex_sv; ST.prev_curlyx = cur_curlyx; - SETREX(rex_sv,re_sv); + rex_sv = re_sv; + SET_reg_curpm(rex_sv); rex = re; rexi = rei; cur_curlyx = NULL; @@ -4463,31 +4567,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) cur_eval = st; /* now continue from first node in postoned RE */ PUSH_YES_STATE_GOTO(EVAL_AB, startpoint); - /* NOTREACHED */ - } - /* logical is 1, /(?(?{...})X|Y)/ */ - sw = cBOOL(SvTRUE(ret)); - logical = 0; - break; + assert(0); /* NOTREACHED */ } case EVAL_AB: /* cleanup after a successful (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); regcpblow(ST.cp); cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; - /* also update PL_regoffs */ - PL_regoffs = rex->offs; - /* XXXX This is too dramatic a measure... */ PL_reg_maxiter = 0; if ( nochange_depth ) @@ -4498,13 +4591,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; PL_reginput = locinput; REGCP_UNWIND(ST.lastcp); @@ -4520,20 +4610,41 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case OPEN: n = ARG(scan); /* which paren pair */ - PL_reg_start_tmp[n] = locinput; + rex->offs[n].start_tmp = locinput - PL_bostr; if (n > PL_regsize) PL_regsize = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)PL_regsize + )); lastopen = n; break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - PL_bostr; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + case CLOSE: n = ARG(scan); /* which paren pair */ - PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if (cur_eval && cur_eval->u.eval.close_paren == n) { goto fake_end; } @@ -4548,14 +4659,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if ( OP(cursor)==CLOSE ){ n = ARG(cursor); if ( n <= lastopen ) { - PL_regoffs[n].start - = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if ( n == ARG(scan) || (cur_eval && cur_eval->u.eval.close_paren == n)) break; @@ -4567,7 +4676,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /*NOTREACHED*/ case GROUPP: n = ARG(scan); /* which paren pair */ - sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1); + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; case NGROUPP: /* reg_check_named_buff_matched returns 0 for no match */ @@ -4690,8 +4799,8 @@ NULL /* XXXX Probably it is better to teach regpush to support parenfloor > PL_regsize... */ - if (parenfloor > (I32)*PL_reglastparen) - parenfloor = *PL_reglastparen; /* Pessimization... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ ST.prev_curlyx= cur_curlyx; cur_curlyx = st; @@ -4709,19 +4818,19 @@ NULL PL_reginput = locinput; PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next)); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } case CURLYX_end: /* just finished matching all of A*B */ cur_curlyx = ST.prev_curlyx; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYX_end_fail: /* just failed to match all of A*B */ regcpblow(ST.cp); cur_curlyx = ST.prev_curlyx; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST @@ -4751,12 +4860,12 @@ NULL /* First just match a string of min A's. */ if (n < min) { - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_pre, A); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* If degenerate A matches "", assume A done. */ @@ -4827,37 +4936,37 @@ NULL if (cur_curlyx->u.curlyx.minmod) { ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; - ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* Prefer A over B for maximal matching. */ if (n < max) { /* More greed allowed? */ - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_max, A); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } goto do_whilem_B_max; } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_min: /* just matched B in a minimal match */ case WHILEM_B_max: /* just matched B in a maximal match */ cur_curlyx = ST.save_curlyx; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ cur_curlyx = ST.save_curlyx; cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ /* FALL THROUGH */ @@ -4867,7 +4976,7 @@ NULL cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ REGCP_UNWIND(ST.lastcp); @@ -4893,7 +5002,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ cur_curlyx = ST.save_curlyx; @@ -4922,11 +5031,11 @@ NULL /* Try grabbing another A and see if it helps. */ PL_reginput = locinput; cur_curlyx->u.curlyx.lastloc = locinput; - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST #define ST st->u.branch @@ -4940,7 +5049,8 @@ NULL case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; ST.next_branch = next; REGCP_SET(ST.cp); PL_reginput = locinput; @@ -4951,33 +5061,30 @@ NULL } else { PUSH_STATE_GOTO(BRANCH_next, scan); } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CUTGROUP: PL_reginput = locinput; sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(CUTGROUP_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CUTGROUP_next_fail: do_cutgroup = 1; no_final = 1; if (st->u.mark.mark_name) sv_commit = st->u.mark.mark_name; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case BRANCH_next: sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case BRANCH_next_fail: /* that branch failed; try the next, if any */ if (do_cutgroup) { do_cutgroup = 0; no_final = 0; } REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; - /*dmq: *PL_reglastcloseparen = n; */ + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { @@ -4991,7 +5098,7 @@ NULL sayNO_SILENT; } continue; /* execute next BRANCH[J] op */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MINMOD: minmod = 1; @@ -5011,13 +5118,14 @@ NULL ST.me = scan; scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + /* if paren positive, emulate an OPEN/CLOSE around A */ if (ST.me->flags) { U32 paren = ST.me->flags; if (paren > PL_regsize) PL_regsize = paren; - if (paren > *PL_reglastparen) - *PL_reglastparen = paren; scan += NEXT_OFF(scan); /* Skip former OPEN. */ } ST.A = scan; @@ -5035,7 +5143,7 @@ NULL curlym_do_A: /* execute the A in /A{m,n}B/ */ PL_reginput = locinput; PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ locinput = st->locinput; @@ -5143,16 +5251,18 @@ NULL } if (ST.me->flags) { - /* mark current A as captured */ + /* emulate CLOSE: mark current A as captured */ I32 paren = ST.me->flags; if (ST.count) { - PL_regoffs[paren].start + rex->offs[paren].start = HOPc(PL_reginput, -ST.alen) - PL_bostr; - PL_regoffs[paren].end = PL_reginput - PL_bostr; - /*dmq: *PL_reglastcloseparen = paren; */ + rex->offs[paren].end = PL_reginput - PL_bostr; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; } else - PL_regoffs[paren].end = -1; + rex->offs[paren].end = -1; if (cur_eval && cur_eval->u.eval.close_paren && cur_eval->u.eval.close_paren == (U32)ST.me->flags) { @@ -5164,10 +5274,11 @@ NULL } PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); if (ST.minmod) { I32 max = ARG2(ST.me); if (max != REG_INFTY && ST.count == max) @@ -5187,12 +5298,17 @@ NULL #define CURLY_SETPAREN(paren, success) \ if (paren) { \ if (success) { \ - PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \ - PL_regoffs[paren].end = locinput - PL_bostr; \ - *PL_reglastcloseparen = paren; \ + rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \ + rex->offs[paren].end = locinput - PL_bostr; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ } \ - else \ - PL_regoffs[paren].end = -1; \ } case STAR: /* /A*B/ where A is width 1 */ @@ -5209,10 +5325,10 @@ NULL goto repeat; case CURLYN: /* /(A){m,n}B/ where A is width 1 */ ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; if (ST.paren > PL_regsize) PL_regsize = ST.paren; - if (ST.paren > *PL_reglastparen) - *PL_reglastparen = ST.paren; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ if (cur_eval && cur_eval->u.eval.close_paren && @@ -5351,16 +5467,17 @@ NULL REGCP_SET(ST.cp); goto curly_try_B_max; } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLY_B_min_known_fail: /* failed to find B in a non-greedy match where c1,c2 valid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; PL_reginput = locinput; /* Could be reset... */ REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* Couldn't or didn't -- move forward. */ ST.oldloc = locinput; if (utf8_target) @@ -5429,15 +5546,16 @@ NULL } PUSH_STATE_GOTO(CURLY_B_min_known, ST.B); } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLY_B_min_fail: /* failed to find B in a non-greedy match where c1,c2 invalid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* failed -- move forward one */ PL_reginput = locinput; if (regrepeat(rex, ST.A, 1, depth)) { @@ -5456,7 +5574,7 @@ NULL } } sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ curly_try_B_max: @@ -5475,16 +5593,17 @@ NULL if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { CURLY_SETPAREN(ST.paren, ST.count); PUSH_STATE_GOTO(CURLY_B_max, ST.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } } /* FALL THROUGH */ case CURLY_B_max_fail: /* failed to find B in a greedy match */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* back up. */ if (--ST.count < ST.min) sayNO; @@ -5497,32 +5616,24 @@ NULL fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ - I32 tmpix; st->u.eval.toggle_reg_flags = cur_eval->u.eval.toggle_reg_flags; PL_reg_flags ^= st->u.eval.toggle_reg_flags; st->u.eval.prev_rex = rex_sv; /* inner */ - SETREX(rex_sv,cur_eval->u.eval.prev_rex); + st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */ + rex_sv = cur_eval->u.eval.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); cur_curlyx = cur_eval->u.eval.prev_curlyx; - (void)ReREFCNT_inc(rex_sv); - st->u.eval.cp = regcppush(0); /* Save *all* the positions. */ - - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; REGCP_SET(st->u.eval.lastcp); PL_reginput = locinput; /* Restore parens of the outer rex without popping the * savestack */ - tmpix = PL_savestack_ix; - PL_savestack_ix = cur_eval->u.eval.lastcp; - regcppop(rex); - PL_savestack_ix = tmpix; + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp); st->u.eval.prev_eval = cur_eval; cur_eval = cur_eval->u.eval.prev_eval; @@ -5599,7 +5710,7 @@ NULL /* execute body of (?...A) */ PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan))); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case IFMATCH_A_fail: /* body of (?...A) failed */ ST.wanted = !ST.wanted; @@ -5638,13 +5749,13 @@ NULL if (!scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(COMMIT_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case COMMIT_next_fail: no_final = 1; /* FALLTHROUGH */ case OPFAIL: sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #define ST st->u.mark case MARKPOINT: @@ -5654,11 +5765,11 @@ NULL mark_state = st; ST.mark_loc = PL_reginput = locinput; PUSH_YES_STATE_GOTO(MARKPOINT_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MARKPOINT_next: mark_state = ST.prev_mark; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MARKPOINT_next_fail: if (popmark && sv_eq(ST.mark_name,popmark)) { @@ -5678,7 +5789,7 @@ NULL sv_yes_mark = mark_state ? mark_state->u.mark.mark_name : NULL; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case SKIP: PL_reginput = locinput; if (scan->flags) { @@ -5722,7 +5833,7 @@ NULL } no_final = 1; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST case LNBREAK: if ((n=is_LNBREAK(locinput,utf8_target))) { @@ -5767,7 +5878,7 @@ NULL /* switch break jumps here */ scan = next; /* prepare to execute the next op and ... */ continue; /* ... jump back to the top, reusing st */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ push_yes_state: /* push a state that backtracks on success */ @@ -5811,7 +5922,7 @@ NULL nextchr = UCHARAT(locinput); st = newst; continue; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } } @@ -5870,7 +5981,7 @@ yes: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { /* each successfully executed (?{...}) block does the equivalent of * local $^R = do {...} * When popping the save stack, all these locals would be undone; @@ -5936,6 +6047,13 @@ no_silent: sv_setsv(sv_mrk, sv_yes_mark); } + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); @@ -5954,11 +6072,11 @@ STATIC I32 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) { dVAR; - register char *scan; - register I32 c; - register char *loceol = PL_regeol; - register I32 hardcount = 0; - register bool utf8_target = PL_reg_match_utf8; + char *scan; + I32 c; + char *loceol = PL_regeol; + I32 hardcount = 0; + bool utf8_target = PL_reg_match_utf8; UV utf8_flags; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); @@ -6190,6 +6308,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + + case POSIXA: + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + case NPOSIXA: + if (utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + break; case NALNUMA: if (utf8_target) { while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { @@ -6535,7 +6671,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo SV * const rv = MUTABLE_SV(data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); - bool invlist_has_user_defined_property; + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; si = *ary; /* ary[0] = the string to initialize the swash with */ @@ -6544,11 +6680,12 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo * that inversion list has any user-defined properties in it. */ if (av_len(av) >= 3) { invlist = ary[3]; - invlist_has_user_defined_property = cBOOL(SvUV(ary[4])); + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } } else { invlist = NULL; - invlist_has_user_defined_property = FALSE; } /* Element [1] is reserved for the set-up swash. If already there, @@ -6563,10 +6700,8 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo si, 1, /* binary */ 0, /* not from tr/// */ - FALSE, /* is error if can't find - property */ invlist, - invlist_has_user_defined_property); + &swash_init_flags); (void)av_store(av, 1, sw); } @@ -6581,20 +6716,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo if (listsvp) { SV* matches_string = newSVpvn("", 0); - SV** invlistsvp; /* Use the swash, if any, which has to have incorporated into it all * possibilities */ - if ( sw - && SvROK(sw) - && SvTYPE(SvRV(sw)) == SVt_PVHV - && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE))) - { - invlist = *invlistsvp; - } - else if (si && si != &PL_sv_undef) { + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { - /* If no swash, use the input nitialization string, if available */ + /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -6914,9 +7043,18 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (! utf8_target) Safefree(utf8_p); } } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + } } - return (flags & ANYOF_INVERT) ? !match : match; + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return cBOOL(flags & ANYOF_INVERT) ^ match; } STATIC U8 * @@ -7016,7 +7154,7 @@ restore_pos(pTHX_ void *arg) { dVAR; regexp * const rex = (regexp *)arg; - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { if (PL_reg_oldsaved) { rex->subbeg = PL_reg_oldsaved; rex->sublen = PL_reg_oldsavedlen; @@ -7026,7 +7164,7 @@ restore_pos(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } PL_reg_magic->mg_len = PL_reg_oldpos; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_curpm = PL_reg_oldcurpm; } }