X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6ae44cd27f5f18bfa72c8d5ea352083cea44b3bc..00a0fadd2e15c4914c9b422ddf6e4fa0baed8dfb:/regcomp.c?ds=sidebyside diff --git a/regcomp.c b/regcomp.c index 77a9401..acecaa9 100644 --- a/regcomp.c +++ b/regcomp.c @@ -109,15 +109,10 @@ #define STATIC static #endif -struct code_block { - STRLEN start; - STRLEN end; - OP *block; -} ; - typedef struct RExC_state_t { - U32 flags; /* are we folding, multilining? */ + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ @@ -138,7 +133,6 @@ typedef struct RExC_state_t { I32 nestroot; /* root parens we are in - used by accept */ I32 extralen; I32 seen_zerolen; - I32 seen_evals; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ @@ -156,7 +150,7 @@ typedef struct RExC_state_t { I32 in_lookbehind; I32 contains_locale; I32 override_recoding; - struct code_block *code_blocks; /* positions of literal (?{}) + struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ @@ -164,6 +158,7 @@ typedef struct RExC_state_t { char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ #ifdef DEBUGGING const char *lastparse; I32 lastnum; @@ -175,6 +170,7 @@ typedef struct RExC_state_t { } RExC_state_t; #define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) @@ -197,7 +193,6 @@ typedef struct RExC_state_t { #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_uni_semantics (pRExC_state->uni_semantics) #define RExC_orig_utf8 (pRExC_state->orig_utf8) @@ -4904,18 +4899,11 @@ Perl_reginitcolors(pTHX) * scope */ -#ifndef PERL_IN_XSUB_RE -#define RE_ENGINE_PTR &PL_core_reg_engine -#else -extern const struct regexp_engine my_reg_engine; -#define RE_ENGINE_PTR &my_reg_engine -#endif - #ifndef PERL_IN_XSUB_RE -/* return the currently in-scope regex engine (or NULL if none) */ +/* return the currently in-scope regex engine (or the default if none) */ -regexp_engine * +regexp_engine const * Perl_current_re_engine(pTHX) { dVAR; @@ -4925,19 +4913,19 @@ Perl_current_re_engine(pTHX) SV **ptr; if (!table) - return NULL; + return &PL_core_reg_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) - return NULL; + return &PL_core_reg_engine; return INT2PTR(regexp_engine*,SvIV(*ptr)); } else { SV *ptr; if (!PL_curcop->cop_hints_hash) - return NULL; + return &PL_core_reg_engine; ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) - return NULL; + return &PL_core_reg_engine; return INT2PTR(regexp_engine*,SvIV(ptr)); } } @@ -4947,87 +4935,286 @@ REGEXP * Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; - regexp_engine *eng = current_re_engine(); + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_PREGCOMP; /* Dispatch a request to compile a regexp to correct regexp engine. */ - if (eng) { - GET_RE_DEBUG_FLAGS_DECL; - DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", - PTR2UV(eng)); - }); - return CALLREGCOMP_ENG(eng, pattern, flags); - } - return Perl_re_compile(aTHX_ pattern, flags); + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); } #endif -/* public(ish) wrapper for Perl_op_re_compile that only takes an SV +/* public(ish) wrapper for Perl_re_op_compile that only takes an SV * pattern rather than a list of OPs */ REGEXP * -Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) { SV *pat = pattern; /* defeat constness! */ PERL_ARGS_ASSERT_RE_COMPILE; - return Perl_re_op_compile(aTHX_ &pat, 1, NULL, - NULL, NULL, NULL, orig_pm_flags); + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), + NULL, NULL, rx_flags, 0); +} + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, + U32 pm_flags, char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + /* avoid infinitely recursing when we recompile the pattern parcelled up + * as qr'...'. A single constant qr// string can't have have any + * run-time component in it, and thus, no runtime code. (A non-qr + * string, however, can, e.g. $x =~ '(?{})') */ + if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST) + return 0; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && pat[s+1] == '?' && + (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; } -/* given a list of CONSTs and DO blocks in expr, append all the CONSTs to - * pat, and record the start and end of each code block in code_blocks[] - * (each DO{} op is followed by an OP_CONST containing the corresponding - * literal '(?{...}) text) +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. */ -static void -S_get_pat_and_code_indices(pTHX_ RExC_state_t *pRExC_state, OP* expr, SV* pat) { - int i = -1; - bool is_code = 0; - OP *o; - for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { - if (o->op_type == OP_CONST) { - sv_catsv(pat, cSVOPo_sv); - if (is_code) { - pRExC_state->code_blocks[i].end = SvCUR(pat)-1; - is_code = 0; +bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = ' '; + s++; + } + s--; + n++; + continue; } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* this causes the toker to collapse \\ into \ when parsing + * qr''; normally only q'' does this. It also alters hints + * handling */ + PL_reg_state.re_reparsing = TRUE; + eval_sv(sv, G_SCALAR); + SvREFCNT_dec(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + if (SvTRUE(ERRSV)) + Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + return 1; + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; } - else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { - assert(i+1 < pRExC_state->num_code_blocks); - pRExC_state->code_blocks[++i].start = SvCUR(pat); - pRExC_state->code_blocks[i].block = o; - is_code = 1; - } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; } + + SvREFCNT_dec(qr); + return 1; } /* - * Perl_op_re_compile - the perl internal RE engine's function to compile a + * Perl_re_op_compile - the perl internal RE engine's function to compile a * regular expression into internal code. * The pattern may be passed either as: * a list of SVs (patternp plus pat_count) * a list of OPs (expr) * If both are passed, the SV list is used, but the OP list indicates - * which SVs are actually pre-compiled codeblocks + * which SVs are actually pre-compiled code blocks * - * The list of SVs have magic and qr overloading applied to them (and + * The SVs in the list have magic and qr overloading applied to them (and * the list may be modified in-place with replacement SVs in the latter * case). * - * If the pattern hasn't changed from old_re, then old-re will be + * If the pattern hasn't changed from old_re, then old_re will be * returned. * - * If eng is set (and not equal to PL_core_reg_engine), then - * just do the intial concatenation of arguments, then pass on to - * the external engine. + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. * - * If is_bare_re is not null, set it to a boolean indicating whether - * the the arg list reduced (after overloading) to a single bare - * regex which has been returned (i.e. /$qr/). + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a @@ -5045,7 +5232,7 @@ S_get_pat_and_code_indices(pTHX_ RExC_state_t *pRExC_state, OP* expr, SV* pat) { REGEXP * Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, - int *is_bare_re, U32 orig_pm_flags) + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) { dVAR; REGEXP *rx; @@ -5057,7 +5244,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, regnode *scan; I32 flags; I32 minlen = 0; - U32 pm_flags; + U32 rx_flags; SV * VOL pat; /* these are all flags - maybe they should be turned @@ -5066,9 +5253,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 sawplus = 0; I32 sawopen = 0; bool used_setjump = FALSE; - regex_charset initial_charset = get_regex_charset(orig_pm_flags); + regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool code_is_utf8 = 0; - + bool VOL recompile = 0; + bool runtime_code = 0; U8 jump_ret = 0; dJMPENV; scan_data_t data; @@ -5080,6 +5268,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #endif GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_RE_OP_COMPILE; + DEBUG_r(if (!PL_colorset) reginitcolors()); #ifndef PERL_IN_XSUB_RE @@ -5144,7 +5334,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->num_code_blocks = 0; if (is_bare_re) - *is_bare_re = 0; + *is_bare_re = FALSE; if (expr && (expr->op_type == OP_LIST || (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { @@ -5162,8 +5352,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } if (ncode) { pRExC_state->num_code_blocks = ncode; - Newx(pRExC_state->code_blocks, ncode, struct code_block); - SAVEFREEPV(pRExC_state->code_blocks); + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); } } @@ -5193,6 +5382,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *o = NULL; int n = 0; + bool utf8 = 0; + STRLEN orig_patlen = 0; if (pRExC_state->num_code_blocks) { o = cLISTOPx(expr)->op_first; @@ -5202,15 +5393,31 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pat = newSVpvn("", 0); SAVEFREESV(pat); + + /* determine if the pattern is going to be utf8 (needed + * in advance to align code block indices correctly). + * XXX This could fail to be detected for an arg with + * overloading but not concat overloading; but the main effect + * in this obscure case is to need a 'use re eval' for a + * literal code block */ + for (svp = patternp; svp < patternp + pat_count; svp++) { + if (SvUTF8(*svp)) + utf8 = 1; + } + if (utf8) + SvUTF8_on(pat); + for (svp = patternp; svp < patternp + pat_count; svp++) { SV *sv, *msv = *svp; + SV *rx; bool code = 0; if (o) { if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = SvCUR(pat); + pRExC_state->code_blocks[n].block = o; + pRExC_state->code_blocks[n].src_regex = NULL; n++; - assert(n <= pRExC_state->num_code_blocks); - pRExC_state->code_blocks[n-1].start = SvCUR(pat); - pRExC_state->code_blocks[n-1].block = o; code = 1; o = o->op_sibling; /* skip CONST */ assert(o); @@ -5226,18 +5433,75 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * code. Pretend we haven't seen it */ pRExC_state->num_code_blocks -= n; n = 0; + rx = NULL; } - else { - sv_catsv_nomg(pat, msv); - if (code) - pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + else { + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv) + { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + orig_patlen = SvCUR(pat); + sv_catsv_nomg(pat, msv); + rx = msv; + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + recompile = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ((struct regexp *)SvANY(rx))->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } } } SvSETMAGIC(pat); } - else + else { + SV *sv; pat = *patternp; + while (SvAMAGIC(pat) + && (sv = AMG_CALLunary(pat, string_amg)) + && sv != pat) + { + pat = sv; + SvGETMAGIC(pat); + } + } /* handle bare regex: foo =~ $re */ { @@ -5246,8 +5510,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, re = SvRV(re); if (SvTYPE(re) == SVt_REGEXP) { if (is_bare_re) - *is_bare_re = 1; + *is_bare_re = TRUE; SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); return (REGEXP*)re; } } @@ -5256,11 +5521,37 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* not a list of SVs, so must be a list of OPs */ assert(expr); if (expr->op_type == OP_LIST) { + int i = -1; + bool is_code = 0; + OP *o; + pat = newSVpvn("", 0); SAVEFREESV(pat); if (code_is_utf8) SvUTF8_on(pat); - S_get_pat_and_code_indices(aTHX_ pRExC_state, expr, pat); + + /* given a list of CONSTs and DO blocks in expr, append all + * the CONSTs to pat, and record the start and end of each + * code block in code_blocks[] (each DO{} op is followed by an + * OP_CONST containing the corresponding literal '(?{...}) + * text) + */ + for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + if (o->op_type == OP_CONST) { + sv_catsv(pat, cSVOPo_sv); + if (is_code) { + pRExC_state->code_blocks[i].end = SvCUR(pat)-1; + is_code = 0; + } + } + else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(i+1 < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[++i].start = SvCUR(pat); + pRExC_state->code_blocks[i].block = o; + pRExC_state->code_blocks[i].src_regex = NULL; + is_code = 1; + } + } } else { assert(expr->op_type == OP_CONST); @@ -5270,7 +5561,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, exp = SvPV_nomg(pat, plen); - if (eng && eng != &PL_core_reg_engine) { + if (!eng->op_comp) { if ((SvUTF8(pat) && IN_BYTES) || SvGMAGICAL(pat) || SvAMAGIC(pat)) { @@ -5279,22 +5570,15 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pat = newSVpvn_flags(exp, plen, SVs_TEMP | (IN_BYTES ? 0 : SvUTF8(pat))); } - return CALLREGCOMP_ENG(eng, pat, orig_pm_flags); - } - - if ( old_re - && !!RX_UTF8(old_re) == !!SvUTF8(pat) - && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen - && memEQ(RX_PRECOMP(old_re), exp, plen)) - { - ReREFCNT_inc(old_re); - return old_re; + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); } /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + pRExC_state->runtime_code_qr = NULL; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -5377,28 +5661,43 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, xend = exp + plen; SAVEFREEPV(exp); RExC_orig_utf8 = RExC_utf8 = 1; + } - /* we've changed the string; check again whether it matches - * the old pattern, to avoid recompilation */ - if ( old_re - && RX_UTF8(old_re) - && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen - && memEQ(RX_PRECOMP(old_re), exp, plen)) - { + /* return old regex if pattern hasn't changed */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen)) + { + /* with runtime code, always recompile */ + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + if (!runtime_code) { ReREFCNT_inc(old_re); if (used_setjump) { JMPENV_POP; } + Safefree(pRExC_state->code_blocks); return old_re; } - } + else if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME + && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); #ifdef TRIE_STUDY_OPT restudied = 0; #endif - pm_flags = orig_pm_flags; + rx_flags = orig_rx_flags; if (initial_charset == REGEX_LOCALE_CHARSET) { RExC_contains_locale = 1; @@ -5407,17 +5706,30 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } RExC_precomp = exp; - RExC_flags = pm_flags; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (PL_tainting && PL_tainted) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + JMPENV_JUMP(UTF8_LONGJMP); + } + } + assert(!pRExC_state->runtime_code_qr); + RExC_sawback = 0; RExC_seen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_seen_evals = 0; RExC_extralen = 0; RExC_override_recoding = 0; @@ -5446,9 +5758,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)RExC_emit); #endif - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; + Safefree(pRExC_state->code_blocks); return(NULL); } @@ -5468,9 +5785,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* The first pass could have found things that force Unicode semantics */ if ((RExC_utf8 || RExC_uni_semantics) - && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) { - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } /* Small enough for pointer-storage convention? @@ -5501,8 +5818,15 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* non-zero initialization begins here */ RXi_SET( r, ri ); - r->engine= RE_ENGINE_PTR; - r->extflags = pm_flags; + r->engine= eng; + r->extflags = rx_flags; + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + SAVEFREEPV(pRExC_state->code_blocks); + { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); @@ -5595,7 +5919,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm_flags; /* don't let top level (?i) bleed */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -5605,8 +5930,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_emit_bound = ri->program + RExC_size + 1; pRExC_state->code_index = 0; - /* Store the count of eval-groups for security checks: */ - RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); @@ -5759,7 +6082,7 @@ reStudy: else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) + !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -5772,7 +6095,7 @@ reStudy: goto again; } if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) - && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6045,7 +6368,7 @@ reStudy: r->extflags |= RXf_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) r->extflags |= RXf_LOOKBEHIND_SEEN; - if (RExC_seen & REG_SEEN_EVAL) + if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; @@ -6053,6 +6376,8 @@ reStudy: r->intflags |= PREGf_VERBARG_SEEN; if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; if (RExC_paren_names) RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else @@ -6123,8 +6448,6 @@ reStudy: return rx; } -#undef RE_ENGINE_PTR - SV* Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, @@ -6567,7 +6890,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", (unsigned long) flags); } - /* NOT REACHED */ + assert(0); /* NOT REACHED */ } return NULL; } @@ -8181,7 +8504,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } goto gen_recurse_regop; - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '+': if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse++; @@ -8251,7 +8574,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '?': /* (??...) */ is_logical = 1; @@ -8265,90 +8588,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH */ case '{': /* (?{...}) */ { - I32 count = 1; U32 n = 0; - char c; - char *s = RExC_parse; + struct reg_code_block *cb; RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_EVAL; - if ( pRExC_state->num_code_blocks - && pRExC_state->code_index < pRExC_state->num_code_blocks - && pRExC_state->code_blocks[pRExC_state->code_index].start - == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) - RExC_start) ) { - /* this is a pre-compiled literal (?{}) */ - RExC_parse = RExC_start + - pRExC_state->code_blocks[pRExC_state->code_index].end; - if (SIZE_ONLY) - RExC_seen_evals++; - else { - OP *o = - pRExC_state->code_blocks[pRExC_state->code_index].block; - n = add_data(pRExC_state, 1, - (RExC_flags & PMf_HAS_CV) ? "L" : "l"); - RExC_rxi->data->data[n] = (void*)o->op_next; - } - pRExC_state->code_index++; + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); } - else { - while (count && (c = *RExC_parse)) { - if (c == '\\') { - if (RExC_parse[1]) - RExC_parse++; - } - else if (c == '{') - count++; - else if (c == '}') - count--; - RExC_parse++; - } - if (*RExC_parse != ')') { - RExC_parse = s; - vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); - } - if (!SIZE_ONLY) { - PAD *pad; - OP_4tree *sop, *rop; - SV * const sv = newSVpvn(s, RExC_parse - 1 - s); - - ENTER; - Perl_save_re_context(aTHX); - rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); - sop->op_private |= OPpREFCOUNTED; - /* re_dup will OpREFCNT_inc */ - OpREFCNT_set(sop, 1); - LEAVE; - - n = add_data(pRExC_state, 3, "nop"); - RExC_rxi->data->data[n] = (void*)rop; - RExC_rxi->data->data[n+1] = (void*)sop; - RExC_rxi->data->data[n+2] = (void*)pad; - SvREFCNT_dec(sv); + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, 2, "rl"); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; } - else { /* First pass */ - if (PL_reginterp_cnt < ++RExC_seen_evals - && IN_PERL_RUNTIME) - /* No compiled RE interpolated, has runtime - components ===> unsafe. */ - FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainting && PL_tainted) - FAIL("Eval-group in insecure regular expression"); - #if PERL_VERSION > 8 - if (IN_PERL_COMPILETIME) - PL_cv_has_eval = 1; - #endif + else { + n = add_data(pRExC_state, 1, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + RExC_rxi->data->data[n] = (void*)o; } } + pRExC_state->code_index++; nextchar(pRExC_state); if (is_logical) { + regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - if (!SIZE_ONLY) + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { ret->flags = 2; - REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } @@ -8878,7 +9162,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else FAIL("Junk on end of regexp"); /* "Can't happen". */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } if (RExC_in_lookbehind) { @@ -9427,8 +9711,46 @@ S_reg_recode(pTHX_ const char value, SV **encp) sequence, we return. Note: we have to be careful with escapes, as they can be both literal - and special, and in the case of \10 and friends can either, depending - on context. Specifically there are two separate switches for handling + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling escape sequences, with the one for handling literal escapes requiring a dummy entry for all of the special escapes that are actually handled by the other. @@ -9897,6 +10219,7 @@ tryagain: vFAIL("Reference to nonexistent or unclosed group"); } if (!isg && num > 9 && num >= RExC_npar) + /* Probably a character specified in octal, e.g. \35 */ goto defchar; else { char * const parse_start = RExC_parse - 1; /* MJD */ @@ -10111,38 +10434,39 @@ tryagain: break; } case 'x': - if (*++p == '{') { - char* const e = strchr(p, '}'); + { + STRLEN brace_len = len; + UV result; + const char* error_msg; - if (!e) { - RExC_parse = p + 1; - vFAIL("Missing right brace on \\x{}"); + bool valid = grok_bslash_x(p, + &result, + &brace_len, + &error_msg, + 1); + p += brace_len; + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); } else { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = e - p - 1; - ender = grok_hex(p + 1, &numlen, &flags, NULL); - if (ender > 0xff) - REQUIRE_UTF8; - p = e + 1; + ender = result; } + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = 2; - ender = grok_hex(p, &numlen, &flags, NULL); - p += numlen; - } - if (PL_encoding && ender < 0x100) - goto recode_encoding; - break; case 'c': p++; ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); break; case '0': case '1': case '2': case '3':case '4': - case '5': case '6': case '7': case '8':case '9': + case '5': case '6': case '7': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) { @@ -10175,7 +10499,7 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p)) { + if (!SIZE_ONLY&& isALNUMC(*p)) { ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p); } goto normal_default; @@ -11219,22 +11543,18 @@ parseit: } break; case 'x': - if (*RExC_parse == '{') { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - char * const e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - - numlen = e - RExC_parse; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); - RExC_parse = e + 1; - } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - numlen = 2; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(RExC_parse, + &value, + &numlen, + &error_msg, + 1); RExC_parse += numlen; + if (! valid) { + vFAIL(error_msg); + } } if (PL_encoding && value < 0x100) goto recode_encoding; @@ -13260,16 +13580,21 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->u.offsets) Safefree(ri->u.offsets); /* 20010421 MJD */ #endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + if (ri->data) { int n = ri->data->count; - PAD* new_comppad = NULL; - PAD* old_comppad; - PADOFFSET refcnt; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ switch (ri->data->what[n]) { case 'a': + case 'r': case 's': case 'S': case 'u': @@ -13278,29 +13603,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'f': Safefree(ri->data->data[n]); break; - case 'p': - new_comppad = MUTABLE_AV(ri->data->data[n]); - break; - case 'o': - if (new_comppad == NULL) - Perl_croak(aTHX_ "panic: pregfree comppad"); - PAD_SAVE_LOCAL(old_comppad, - /* Watch out for global destruction's random ordering. */ - (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL - ); - OP_REFCNT_LOCK; - refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); - OP_REFCNT_UNLOCK; - if (!refcnt) - op_free((OP_4tree*)ri->data->data[n]); - - PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec(MUTABLE_SV(new_comppad)); - new_comppad = NULL; - break; case 'l': case 'L': - case 'n': break; case 'T': { /* Aho Corasick add-on structure for a trie node. @@ -13489,7 +13793,20 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); - + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; reti->regstclass = NULL; @@ -13506,12 +13823,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTua - see also regcomp.h and pregfree() */ + /* see also regcomp.h and regfree_internal() */ case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': case 's': case 'S': - case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; @@ -13522,13 +13838,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) struct regnode_charclass_class); reti->regstclass = (regnode*)d->data[i]; break; - case 'o': - /* Compiled op trees are readonly and in shared memory, - and can thus be shared without duplication. */ - OP_REFCNT_LOCK; - d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); - OP_REFCNT_UNLOCK; - break; case 'T': /* Trie stclasses are readonly and can thus be shared * without duplication. We free the stclass in pregfree @@ -13543,7 +13852,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) /* Fall through */ case 'l': case 'L': - case 'n': d->data[i] = ri->data->data[i]; break; default: @@ -13653,8 +13961,6 @@ Perl_save_re_context(pTHX) Copy(&PL_reg_state, state, 1, struct re_save_state); - PL_reg_start_tmp = 0; - PL_reg_start_tmpl = 0; PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; PL_reg_maxiter = 0;