From d24ca0c5f11250dcd2552c84a048bda5786ba8d1 Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Sun, 18 Mar 2012 15:53:40 +0000 Subject: [PATCH] Fix up runtime regex codeblocks. The previous commits in this branch have brought literal code blocks into the New World Order; now do the same for runtime blocks, i.e. those needing "use re 'eval'". The main user-visible changes from this commit are that: * the code is now fully parsed, rather than needing balanced {}'s; i.e. this now works: my $code = q[ (?{ $a = '{' }) ]; use re 'eval'; /$code/ * warnings and errors are now reported as coming from "(eval NNN)" rather than "(re_eval NNN)" (although see the next commit for some fixups to that). Indeed, the string "re_eval" has been expunged from the source and documentation. The big internal difference is that the sv_compile_2op() and sv_compile_2op_is_broken() functions are no longer used, and will be removed shorty. It works by the regex compiler detecting the presence of run-time code blocks, and feeding the whole pattern string back into the parser (where the run-time blocks are now seen as compile-time), then extracting out any compiled code blocks and adding them to the mix. For example, in the following: $c = '(?{"runtime"})d'; use re 'eval'; /a(?{"literal"})\b'$c/ At the point the regex compiler is called, the perl parser will already have compiled the literal code block and presented it to the regex engine. The engine examines the pattern string, sees two '(?{', but only one accounted for by the parser, and so constructs a short string to be evalled: based on the pattern, but with literal code-blocks blanked out, and \ and ' escaped. In the above example, the pattern string is a(?{"literal"})\b'(?{"runtime"})d and we call eval_sv() with an SV containing the text qr'a \\b\'(?{"runtime"})d' The returned qr will contain the new code-block (and associated CV and pad) which can be extracted and added to the list of compiled code blocks of the original pattern. Note that with this scheme, the requirement for "use re 'eval'" is easily determined, and no longer requires all the pp_regcreset / PL_reginterp_cnt machinery, which will be removed shortly. Two subtleties of this scheme are that normally, \\ isn't collapsed into \ for literal regexes (unlike literal strings), and hints aren't inherited when using eval_sv(). We get round both of these by adding and setting a new flag, PL_reg_state.re_reparsing, which indicates that we are refeeding a pattern into the perl parser. --- embed.fnc | 3 +- embed.h | 2 +- lib/perl5db.pl | 4 +- perl.c | 2 + pod/perldebguts.pod | 7 +- pod/perldiag.pod | 13 +- proto.h | 2 +- regcomp.c | 403 +++++++++++++++++++++++++++++++++++----------------- regcomp.h | 3 - regexec.c | 11 +- regexp.h | 1 + t/op/taint.t | 11 +- t/op/threads.t | 2 +- t/re/pat_re_eval.t | 106 +++++++++++++- t/re/recompile.t | 11 +- toke.c | 50 ++++--- 16 files changed, 449 insertions(+), 182 deletions(-) diff --git a/embed.fnc b/embed.fnc index e05af38..faf1f85 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2121,7 +2121,8 @@ s |char* |scan_ident |NN char *s|NN const char *send|NN char *dest \ |STRLEN destlen|I32 ck_uni sR |char* |scan_inputsymbol|NN char *start sR |char* |scan_pat |NN char *start|I32 type -sR |char* |scan_str |NN char *start|int keep_quoted|int keep_delims +sR |char* |scan_str |NN char *start|int keep_quoted \ + |int keep_delims|int re_reparse sR |char* |scan_subst |NN char *start sR |char* |scan_trans |NN char *start s |char* |scan_word |NN char *s|NN char *dest|STRLEN destlen \ diff --git a/embed.h b/embed.h index d6ea5b2..a18b723 100644 --- a/embed.h +++ b/embed.h @@ -1586,7 +1586,7 @@ #define scan_ident(a,b,c,d,e) S_scan_ident(aTHX_ a,b,c,d,e) #define scan_inputsymbol(a) S_scan_inputsymbol(aTHX_ a) #define scan_pat(a,b) S_scan_pat(aTHX_ a,b) -#define scan_str(a,b,c) S_scan_str(aTHX_ a,b,c) +#define scan_str(a,b,c,d) S_scan_str(aTHX_ a,b,c,d) #define scan_subst(a) S_scan_subst(aTHX_ a) #define scan_trans(a) S_scan_trans(aTHX_ a) #define scan_word(a,b,c,d,e) S_scan_word(aTHX_ a,b,c,d,e) diff --git a/lib/perl5db.pl b/lib/perl5db.pl index 889f305..d31c7f6 100644 --- a/lib/perl5db.pl +++ b/lib/perl5db.pl @@ -189,7 +189,7 @@ Values are magical in numeric context: 1 if the line is breakable, 0 if not. The scalar C<${"_<$filename"}> simply contains the string C<_<$filename>. This is also the case for evaluated strings that contain subroutines, or which are currently being executed. The $filename for Ced strings looks -like C<(eval 34)> or C<(re_eval 19)>. +like C<(eval 34). =head1 DEBUGGER STARTUP @@ -519,7 +519,7 @@ BEGIN { } # Debugger for Perl 5.00x; perl5db.pl patch level: -$VERSION = '1.37'; +$VERSION = '1.38'; $header = "perl5db.pl version $VERSION"; diff --git a/perl.c b/perl.c index d287872..c561799 100644 --- a/perl.c +++ b/perl.c @@ -2808,6 +2808,8 @@ Perl_eval_sv(pTHX_ SV *sv, I32 flags) myop.op_flags |= OP_GIMME_REVERSE(flags); if (flags & G_KEEPERR) myop.op_flags |= OPf_SPECIAL; + if (PL_reg_state.re_reparsing) + myop.op_private = OPpEVAL_COPHH; /* fail now; otherwise we could fail after the JMPENV_PUSH but * before a PUSHEVAL, which corrupts the stack after a croak */ diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 8ae6e7b..fdddf4a 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -38,7 +38,6 @@ Each array C<@{"_<$filename"}> holds the lines of $filename for a file compiled by Perl. The same is also true for Ced strings that contain subroutines, or which are currently being executed. The $filename for Ced strings looks like C<(eval 34)>. -Code assertions in regexes look like C<(re_eval 19)>. Values in this array are magical in numeric context: they compare equal to zero only if the line is not breakable. @@ -53,14 +52,14 @@ C<"$break_condition\0$action">. The same holds for evaluated strings that contain subroutines, or which are currently being executed. The $filename for Ced strings -looks like C<(eval 34)> or C<(re_eval 19)>. +looks like C<(eval 34)>. =item * Each scalar C<${"_<$filename"}> contains C<"_<$filename">. This is also the case for evaluated strings that contain subroutines, or which are currently being executed. The $filename for Ced -strings looks like C<(eval 34)> or C<(re_eval 19)>. +strings looks like C<(eval 34)>. =item * @@ -81,7 +80,7 @@ also exists. A hash C<%DB::sub> is maintained, whose keys are subroutine names and whose values have the form C. C has the form C<(eval 34)> for subroutines defined inside -Cs, or C<(re_eval 19)> for those within regex code assertions. +Cs. =item * diff --git a/pod/perldiag.pod b/pod/perldiag.pod index daca152..e547ceb 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3653,6 +3653,12 @@ then discovered it wasn't a subroutine or eval context. (P) scan_num() got called on something that wasn't a number. +=item panic: Sequence (?{...}): no code block found + +(P) while compiling a pattern that has embedded (?{}) or (??{}) code +blocks, perl couldn't locate the code block that should have already been +seen and compiled by perl before control passed to the regex compiler. + =item panic: sv_chop %s (P) The sv_chop() routine was passed a position that is not within the @@ -4329,13 +4335,6 @@ parenthesis. Embedded parentheses aren't allowed. The <-- HERE shows in the regular expression about where the problem was discovered. See L. -=item Sequence (?{...}) not terminated or not {}-balanced in regex; marked by <-- HERE in m/%s/ - -(F) If the contents of a (?{...}) clause contain braces, they -must balance for Perl to detect the end of the clause properly. -The <-- HERE shows in the regular expression about where the -problem was discovered. See L. - =item Sequence (?{...}) not terminated with ')' (F) The end of the perl code contained within the {...} must be diff --git a/proto.h b/proto.h index ba75b64..a80ee70 100644 --- a/proto.h +++ b/proto.h @@ -7081,7 +7081,7 @@ STATIC char* S_scan_pat(pTHX_ char *start, I32 type) #define PERL_ARGS_ASSERT_SCAN_PAT \ assert(start) -STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) +STATIC char* S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_SCAN_STR \ diff --git a/regcomp.c b/regcomp.c index d329a7f..f3052de 100644 --- a/regcomp.c +++ b/regcomp.c @@ -159,6 +159,7 @@ typedef struct RExC_state_t { char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ #ifdef DEBUGGING const char *lastparse; I32 lastnum; @@ -4962,6 +4963,229 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) NULL, NULL, rx_flags, 0); } +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, + U32 pm_flags, char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + /* avoid infinitely recursing when we recompile the pattern parcelled up + * as qr'...'. A single constant qr// string can't have have any + * run-time component in it, and thus, no runtime code. (A non-qr + * string, however, can, e.g. $x =~ '(?{})') */ + if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST) + return 0; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && pat[s+1] == '?' && + (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 5; /* allow for "qr''x" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = ' '; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* this causes the toker to collapse \\ into \ when parsing + * qr''; normally only q'' does this. It also alters hints + * handling */ + PL_reg_state.re_reparsing = TRUE; + eval_sv(sv, G_SCALAR); + SvREFCNT_dec(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + if (SvTRUE(ERRSV)) + Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + return 1; + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec(qr); + return 1; +} + /* * Perl_re_op_compile - the perl internal RE engine's function to compile a @@ -5034,6 +5258,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool code_is_utf8 = 0; bool VOL recompile = 0; + bool runtime_code = 0; U8 jump_ret = 0; dJMPENV; scan_data_t data; @@ -5341,6 +5566,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + pRExC_state->runtime_code_qr = NULL; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -5434,27 +5660,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && RX_PRELEN(old_re) == plen && memEQ(RX_PRECOMP(old_re), exp, plen)) { - /* see if there are any run-time code blocks */ - int n = 0; - STRLEN s; - bool runtime = 0; - for (s = 0; s < plen; s++) { - if (n < pRExC_state->num_code_blocks - && s == pRExC_state->code_blocks[n].start) - { - s = pRExC_state->code_blocks[n].end; - n++; - continue; - } - if (exp[s] == '(' && exp[s+1] == '?' && - (exp[s+2] == '{' || (exp[s+2] == '?' && exp[s+3] == '{'))) - { - runtime = 1; - break; - } - } /* with runtime code, always recompile */ - if (!runtime) { + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + if (!runtime_code) { ReREFCNT_inc(old_re); if (used_setjump) { JMPENV_POP; @@ -5463,6 +5672,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, return old_re; } } + else if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME + && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); #ifdef TRIE_STUDY_OPT restudied = 0; @@ -5483,6 +5700,19 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_precomp = exp; RExC_flags = rx_flags; RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (PL_tainting && PL_tainted) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + JMPENV_JUMP(UTF8_LONGJMP); + } + } + assert(!pRExC_state->runtime_code_qr); + RExC_sawback = 0; RExC_seen = 0; @@ -8347,91 +8577,42 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH */ case '{': /* (?{...}) */ { - I32 count = 1; U32 n = 0; - char c; - char *s = RExC_parse; + struct reg_code_block *cb; RExC_seen_zerolen++; RExC_seen |= REG_SEEN_EVAL; - if ( pRExC_state->num_code_blocks - && pRExC_state->code_index < pRExC_state->num_code_blocks - && pRExC_state->code_blocks[pRExC_state->code_index].start - == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) - RExC_start) ) { - /* this is a pre-compiled literal (?{}) */ - struct reg_code_block *cb = - &pRExC_state->code_blocks[pRExC_state->code_index]; - RExC_parse = RExC_start + cb->end; - if (SIZE_ONLY) - RExC_seen_evals++; - else { - OP *o = cb->block; - if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); - RExC_rxi->data->data[n] = - (void*)SvREFCNT_inc((SV*)cb->src_regex); - RExC_rxi->data->data[n+1] = (void*)o->op_next; - } - else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); - RExC_rxi->data->data[n] = (void*)o->op_next; - } - } - pRExC_state->code_index++; + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (SIZE_ONLY) + RExC_seen_evals++; else { - while (count && (c = *RExC_parse)) { - if (c == '\\') { - if (RExC_parse[1]) - RExC_parse++; - } - else if (c == '{') - count++; - else if (c == '}') - count--; - RExC_parse++; - } - if (*RExC_parse != ')') { - RExC_parse = s; - vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); - } - if (!SIZE_ONLY) { - PAD *pad; - OP_4tree *sop, *rop; - SV * const sv = newSVpvn(s, RExC_parse - 1 - s); - - ENTER; - Perl_save_re_context(aTHX); - rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); - sop->op_private |= OPpREFCOUNTED; - /* re_dup will OpREFCNT_inc */ - OpREFCNT_set(sop, 1); - LEAVE; - - n = add_data(pRExC_state, 3, "nop"); - RExC_rxi->data->data[n] = (void*)rop; - RExC_rxi->data->data[n+1] = (void*)sop; - RExC_rxi->data->data[n+2] = (void*)pad; - SvREFCNT_dec(sv); + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, 2, "rl"); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o->op_next; } - else { /* First pass */ - if (PL_reginterp_cnt < ++RExC_seen_evals - && IN_PERL_RUNTIME) - /* No compiled RE interpolated, has runtime - components ===> unsafe. */ - FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainting && PL_tainted) - FAIL("Eval-group in insecure regular expression"); - #if PERL_VERSION > 8 - if (IN_PERL_COMPILETIME) - PL_cv_has_eval = 1; - #endif + else { + n = add_data(pRExC_state, 1, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + RExC_rxi->data->data[n] = (void*)o->op_next; } } + pRExC_state->code_index++; nextchar(pRExC_state); if (is_logical) { @@ -13359,9 +13540,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->data) { int n = ri->data->count; - PAD* new_comppad = NULL; - PAD* old_comppad; - PADOFFSET refcnt; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ @@ -13376,29 +13554,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'f': Safefree(ri->data->data[n]); break; - case 'p': - new_comppad = MUTABLE_AV(ri->data->data[n]); - break; - case 'o': - if (new_comppad == NULL) - Perl_croak(aTHX_ "panic: pregfree comppad"); - PAD_SAVE_LOCAL(old_comppad, - /* Watch out for global destruction's random ordering. */ - (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL - ); - OP_REFCNT_LOCK; - refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); - OP_REFCNT_UNLOCK; - if (!refcnt) - op_free((OP_4tree*)ri->data->data[n]); - - PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec(MUTABLE_SV(new_comppad)); - new_comppad = NULL; - break; case 'l': case 'L': - case 'n': break; case 'T': { /* Aho Corasick add-on structure for a trie node. @@ -13617,13 +13774,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTua - see also regcomp.h and pregfree() */ + /* see also regcomp.h and regfree_internal() */ case 'a': /* actually an AV, but the dup function is identical. */ case 'r': case 's': case 'S': - case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; @@ -13634,13 +13789,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) struct regnode_charclass_class); reti->regstclass = (regnode*)d->data[i]; break; - case 'o': - /* Compiled op trees are readonly and in shared memory, - and can thus be shared without duplication. */ - OP_REFCNT_LOCK; - d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); - OP_REFCNT_UNLOCK; - break; case 'T': /* Trie stclasses are readonly and can thus be shared * without duplication. We free the stclass in pregfree @@ -13655,7 +13803,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) /* Fall through */ case 'l': case 'L': - case 'n': d->data[i] = ri->data->data[i]; break; default: diff --git a/regcomp.h b/regcomp.h index 6d037ef..29a4a6e 100644 --- a/regcomp.h +++ b/regcomp.h @@ -540,9 +540,6 @@ END_EXTERN_C * f - start-class data for regstclass optimization * l - start op for literal (?{EVAL}) item * L - start op for literal (?{EVAL}) item, with separate CV (qr//) - * n - Root of op tree for (?{EVAL}) item - * o - Start op for (?{EVAL}) item - * p - Pad for (?{EVAL}) item * r - pointer to an embedded code-containing qr, e.g. /ab$qr/ * s - swash for Unicode-style character class, and the multicharacter * strings resulting from casefolding the single-character entries diff --git a/regexec.c b/regexec.c index 642b48f..f94d15a 100644 --- a/regexec.c +++ b/regexec.c @@ -4267,6 +4267,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * variable. */ Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + PL_reg_state.re_reparsing = FALSE; n = ARG(scan); if (rexi->data->what[n] == 'r') { /* code from an external qr */ @@ -4283,16 +4284,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) new_comppad = initial_pad; /* the pad of the current sub */ PL_op = (OP_4tree*)rexi->data->data[n]; } - else if (rexi->data->what[n] == 'L') { /* literal with own CV */ - new_comppad = (PAD*)AvARRAY(CvPADLIST(rex->qr_anoncv))[1]; - PL_op = (OP_4tree*)rexi->data->data[n]; - } else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + new_comppad = (PAD*)AvARRAY(CvPADLIST(rex->qr_anoncv))[1]; PL_op = (OP_4tree*)rexi->data->data[n]; - new_comppad = (PAD*)rexi->data->data[n + 2]; } DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, - " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(PL_op)) ); /* wrap the call in two SAVECOMPPADs. This ensures that * when the save stack is eventually unwound, all the * accumulated SAVEt_CLEARSV's will be processed with diff --git a/regexp.h b/regexp.h index c79b281..8a77122 100644 --- a/regexp.h +++ b/regexp.h @@ -767,6 +767,7 @@ struct re_save_state { U32 re_state_reg_start_tmpl; /* from regexec.c */ I32 re_state_reg_eval_set; /* from regexec.c */ bool re_state_reg_match_utf8; /* from regexec.c */ + bool re_reparsing; /* runtime (?{}) fed back into parser */ char *re_state_bostr; char *re_state_reginput; /* String-input pointer. */ char *re_state_regeol; /* End of input, for $ check. */ diff --git a/t/op/taint.t b/t/op/taint.t index a0949d3..aec5556 100644 --- a/t/op/taint.t +++ b/t/op/taint.t @@ -17,7 +17,7 @@ BEGIN { use strict; use Config; -plan tests => 794; +plan tests => 795; $| = 1; @@ -2205,6 +2205,15 @@ pass("no death when TARG of ref is tainted"); like($@, qr/^Test\n\t\.\.\.propagated at /, "error should be propagated"); } +# tainted run-time (?{}) should die + +{ + my $code = '(?{})' . $TAINT; + use re 'eval'; + eval { "a" =~ /$code/ }; + like($@, qr/Eval-group in insecure regular expression/, "tainted (?{})"); +} + # This may bomb out with the alarm signal so keep it last SKIP: { skip "No alarm()" unless $Config{d_alarm}; diff --git a/t/op/threads.t b/t/op/threads.t index 1181a00..2a52efc 100644 --- a/t/op/threads.t +++ b/t/op/threads.t @@ -161,7 +161,7 @@ curr_test(curr_test() + 2); # the seen_evals field of a regexp was getting zeroed on clone, so # within a thread it didn't know that a regex object contained a 'safe' -# re_eval expression, so it later died with 'Eval-group not allowed' when +# code expression, so it later died with 'Eval-group not allowed' when # you tried to interpolate the object sub safe_re { diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t index 64d4c6a..0e116b1 100644 --- a/t/re/pat_re_eval.t +++ b/t/re/pat_re_eval.t @@ -23,7 +23,7 @@ BEGIN { } -plan tests => 255; # Update this when adding/deleting tests. +plan tests => 352; # Update this when adding/deleting tests. run_tests() unless caller; @@ -467,7 +467,6 @@ sub run_tests { no re 'eval'; { - local $::TODO = 're_eval not yet secure!!'; eval { "A$x-BC$x" =~ /^A$code4-B$r4/ }; like($@, qr/Eval-group not allowed/, "runtime code5"); } @@ -630,6 +629,109 @@ sub run_tests { forward; } + # test that run-time embedded code, when re-fed into toker, + # does all the right escapes + + { + use re 'eval'; + + my $enc = eval 'use Encode; find_encoding("ascii")'; + + my $x = 0; + my $y = 'bad'; + + # note that most of the strings below are single-quoted, and the + # things within them, like '$y', *aren't* intended to interpolate + + my $s1 = + 'a\\$y(?# (??{BEGIN{$x=1} "X1"})b(?# \Ux2\E)c\'d\\\\e\\\\Uf\\\\E'; + + ok(q{a$ybc'd\e\Uf\E} =~ /^$s1$/, "reparse"); + is($x, 0, "reparse no BEGIN"); + + my $s2 = 'g\\$y# (??{{BEGIN{$x=2} "X3"}) \Ux3\E' . "\nh"; + + ok(q{a$ybc'd\\e\\Uf\\Eg$yh} =~ /^$s1$s2$/x, "reparse /x"); + is($x, 0, "reparse /x no BEGIN"); + + my $b = '\\'; + my $q = '\''; + + # non-ascii in string as "<0xNNN>" + sub esc_str { + my $s = shift; + $s =~ s{(.)}{ + my $c = ord($1); + ($c< 32 || $c > 127) ? sprintf("<0x%x>", $c) : $1; + }ge; + $s; + } + sub fmt { sprintf "hairy backslashes %s [%s] =~ /^%s/", + $_[0], esc_str($_[1]), esc_str($_[2]); + } + + + for my $u ( + [ '', '', 'blank ' ], + [ "\x{100}", '\x{100}', 'single' ], + [ "\x{100}", "\x{100}", 'double' ]) + { + for my $pair ( + [ "$b", "$b$b" ], + [ "$q", "$q" ], + [ "$b$q", "$b$b$b$q" ], + [ "$b$b$q", "$b$b$b$b$q" ], + [ "$b$b$b$q", "$b$b$b$b$b$b$q" ], + [ "$b$b$b$b$q","$b$b$b$b$b$b$b$b$q" ], + ) { + my ($s, $r) = @$pair; + $s = "9$s"; + my $ss = "$u->[0]$s"; + + my $c = '9' . $r; + my $cc = "$u->[1]$c"; + ok($ss =~ /^$cc/, fmt("plain $u->[2]", $ss, $cc)); + + no strict; + my $chr41 = "\x41"; + $ss = "$u->[0]\t${q}$chr41${b}x42$s"; + $nine = $nine = "bad"; + for my $use_qr ('', 'qr') { + $c = qq[(??{my \$z='{';] + . qq[$use_qr"$b${b}t$b$q$b${b}x41$b$b$b${b}x42"] + . qq[. \$nine})]; + # (??{ qr/str/ }) goes through one less interpolation + # stage than (??{ qq/str/ }) + $c =~ s{\\\\}{\\}g if ($use_qr eq 'qr'); + $c .= $r; + $cc = "$u->[1]$c"; + my $nine = 9; + ok($ss =~ /^$cc/, fmt("code $u->[2]", $ss, $cc)); + { + # Poor man's "use encoding 'ascii'". + # This causes a different code path in S_const_str() + # to be used + local ${^ENCODING} = $enc; + ok($ss =~ /^$cc/, fmt("encode $u->[2]", $ss, $cc)); + } + } + } + } + + my $code1u = "(??{qw(\x{100})})"; + ok("\x{100}" =~ /^$code1u$/, "reparse embeded unicode"); + } + + # a non-pattern literal won't get code blocks parsed at compile time; + # but they must get parsed later on if 'use re eval' is in scope + # also check that unbalanced {}'s are parsed ok + + { + use re 'eval'; + ok("a{" =~ '^(??{"a{"})$', "non-pattern literal code"); + ok("a{" =~ /^${\'(??{"a{"})'}$/, "runtime code with unbalanced {}"); + } + } # End of sub run_tests 1; diff --git a/t/re/recompile.t b/t/re/recompile.t index aa6f7e9..785dcdb 100644 --- a/t/re/recompile.t +++ b/t/re/recompile.t @@ -149,13 +149,16 @@ comp_n(3, <<'CODE', 'mixed utf8 qr'); "a" =~ qr/$_/ for "\x{c4}\x{80}", "\x{100}", "\x{c4}\x{80}"; CODE -comp_n(3, <<'CODE', 'runtime code'); +# note that that for runtime code, each pattern is compiled twice; the +# second time to allow the parser to see the code. + +comp_n(6, <<'CODE', 'runtime code'); my $x = '(?{1})'; BEGIN { $^H |= 0x00200000 } # lightweight "use re 'eval'" "a" =~ /a$_/ for $x, $x, $x; CODE -comp_n(3, <<'CODE', 'runtime code qr'); +comp_n(6, <<'CODE', 'runtime code qr'); my $x = '(?{1})'; BEGIN { $^H |= 0x00200000 } # lightweight "use re 'eval'" "a" =~ qr/a$_/ for $x, $x, $x; @@ -171,14 +174,14 @@ my $x = qr/(?{1})/; "a" =~ qr/a$_/ for $x, $x, $x; CODE -comp_n(4, <<'CODE', 'mixed code'); +comp_n(7, <<'CODE', 'mixed code'); my $x = qr/(?{1})/; my $y = '(?{1})'; BEGIN { $^H |= 0x00200000 } # lightweight "use re 'eval'" "a" =~ /a$x$_/ for $y, $y, $y; CODE -comp_n(4, <<'CODE', 'mixed code qr'); +comp_n(7, <<'CODE', 'mixed code qr'); my $x = qr/(?{1})/; my $y = '(?{1})'; BEGIN { $^H |= 0x00200000 } # lightweight "use re 'eval'" diff --git a/toke.c b/toke.c index 6cde83c..4949fdf 100644 --- a/toke.c +++ b/toke.c @@ -5547,7 +5547,7 @@ Perl_yylex(pTHX) } sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0); if (*d == '(') { - d = scan_str(d,TRUE,TRUE); + d = scan_str(d,TRUE,TRUE,FALSE); if (!d) { /* MUST advance bufptr here to avoid bogus "at end of line" context messages from yyerror(). @@ -6443,7 +6443,7 @@ Perl_yylex(pTHX) TERM(THING); case '\'': - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6458,7 +6458,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '"': - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6481,7 +6481,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '`': - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); DEBUG_T( { printbuf("### Saw backtick string before %s\n", s); } ); if (PL_expect == XOPERATOR) no_op("Backticks",s); @@ -7847,7 +7847,7 @@ Perl_yylex(pTHX) LOP(OP_PIPE_OP,XTERM); case KEY_q: - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_CONST; @@ -7858,7 +7858,7 @@ Perl_yylex(pTHX) case KEY_qw: { OP *words = NULL; - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) missingterm(NULL); PL_expect = XOPERATOR; @@ -7908,7 +7908,7 @@ Perl_yylex(pTHX) } case KEY_qq: - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_STRINGIFY; @@ -7921,7 +7921,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case KEY_qx: - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) missingterm(NULL); readpipe_override(); @@ -8230,7 +8230,7 @@ Perl_yylex(pTHX) const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO); STRLEN tmplen; - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) Perl_croak(aTHX_ "Prototype not terminated"); /* strip spaces and check for bad characters */ @@ -9152,7 +9152,7 @@ S_scan_pat(pTHX_ char *start, I32 type) { dVAR; PMOP *pm; - char *s = scan_str(start,!!PL_madskills,FALSE); + char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing); const char * const valid_flags = (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS); char charset = '\0'; /* character set modifier */ @@ -9162,6 +9162,9 @@ S_scan_pat(pTHX_ char *start, I32 type) PERL_ARGS_ASSERT_SCAN_PAT; + /* this was only needed for the initial scan_str; set it to false + * so that any (?{}) code blocks etc are parsed normally */ + PL_reg_state.re_reparsing = FALSE; if (!s) { const char * const delimiter = skipspace(start); Perl_croak(aTHX_ @@ -9252,7 +9255,7 @@ S_scan_subst(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE); if (!s) Perl_croak(aTHX_ "Substitution pattern not terminated"); @@ -9270,7 +9273,7 @@ S_scan_subst(pTHX_ char *start) #endif first_start = PL_multi_start; - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -9360,7 +9363,7 @@ S_scan_trans(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE); if (!s) Perl_croak(aTHX_ "Transliteration pattern not terminated"); @@ -9376,7 +9379,7 @@ S_scan_trans(pTHX_ char *start) } #endif - s = scan_str(s,!!PL_madskills,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -9768,7 +9771,7 @@ S_scan_inputsymbol(pTHX_ char *start) if (d - PL_tokenbuf != len) { pl_yylval.ival = OP_GLOB; - s = scan_str(start,!!PL_madskills,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE); if (!s) Perl_croak(aTHX_ "Glob not terminated"); return s; @@ -9868,6 +9871,8 @@ intro_sym: takes: start position in buffer keep_quoted preserve \ on the embedded delimiter(s) keep_delims preserve the delimiters around the string + re_reparse compiling a run-time /(?{})/: + collapse // to /, and skip encoding src returns: position to continue reading from buffer side-effects: multi_start, multi_close, lex_repl or lex_stuff, and updates the read buffer. @@ -9908,7 +9913,7 @@ intro_sym: */ STATIC char * -S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) +S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) { dVAR; SV *sv; /* scalar value: string */ @@ -9987,7 +9992,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } #endif for (;;) { - if (PL_encoding && !UTF) { + if (PL_encoding && !UTF && !re_reparse) { bool cont = TRUE; while (cont) { @@ -10069,9 +10074,12 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) CopLINE_inc(PL_curcop); /* handle quoted delimiters */ if (*s == '\\' && s+1 < PL_bufend && term != '\\') { - if (!keep_quoted && s[1] == term) + if (!keep_quoted + && (s[1] == term + || (re_reparse && s[1] == '\\')) + ) s++; - /* any other quotes are simply copied straight through */ + /* any other quotes are simply copied straight through */ else *to++ = *s++; } @@ -10172,7 +10180,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* at this point, we have successfully read the delimited string */ - if (!PL_encoding || UTF) { + if (!PL_encoding || UTF || re_reparse) { #ifdef PERL_MAD if (PL_madskills) { char * const tstart = SvPVX(PL_linestr) + stuffstart; @@ -10204,7 +10212,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } } #endif - if (has_utf8 || PL_encoding) + if (has_utf8 || (PL_encoding && !re_reparse)) SvUTF8_on(sv); PL_multi_end = CopLINE(PL_curcop); -- 1.8.3.1