This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Handle literal code blocks in runtime regexes
authorDavid Mitchell <davem@iabyn.com>
Sat, 12 Nov 2011 20:51:27 +0000 (20:51 +0000)
committerDavid Mitchell <davem@iabyn.com>
Wed, 13 Jun 2012 12:25:52 +0000 (13:25 +0100)
In the following types of regex:

      /$runtime(?{...})/
    qr/$runtime(?{...})/

make it so that the code block is compiled at the same time that the
surrounding code is compiled, then is incorporated, rather than
re-compiled, when the regex source is assembled and compiled at runtime.

This fixes a bunch of closure-related TODO tests.

Note that this still doesn't yet handle the cases where $runtime contains:
    $runtime = qr/...(?{...})/; # block will be stringified and recompiled
    $runtime = '(?{...})';      # block compiled the old way, with
                                  matching nesting of {} required

It also doesn't yet handle the case where the pattern getting compiled is
upgraded to utf8 and so is restarted.

Note that this is rather complex, because in something like

    $str =~ qr/$a(?{...})$b[1]/

there are four separate phases

* perl compile time; we also compile the code block at the same time,
  but within a separate anon CV (with a separate pad)

* at run time, we execute the code that generates the list of SVs
  (i.e. $a, $b[1] etc), but have to execute them within the context of the
  anon sub, since that's what they were compiled in; we then have to
  concat the arguments, while remembering which were literal code blocks;

* then qr// clones the compiled regex, and clones the anon CV at the same
  time;

* finally, the pattern is executed.

Through all this we have to ensure that the code blocks and associated
anon CV and pad get preserved and incorporated into the right places
for eventual use.

The changes in this commit build upon the work in the previous few
commits, and work by:

* at (perl) compile time, in pmruntime(), the anon CV (if any) associated
  with a qr//, as well as being referred to by the op_targ of the
  anoncode op, is also made the targ of the regcomp op;

* at pattern assembly and compile time,
    * Perl_re_op_compile() takes the list of SVs gathered by pp_regcomp(),
      along with the op tree (from op_code_list) that was used to generate
      those SVs (as well as containing the individual DO blocks), and
      concatenates them to get a final pattern source string, while
      noting the start and end positions of any literal (?{..})'s,
      and which block they must correspond to.

    * after compilation, pp_regcomp() then uses op_targ to locate
      the anon CV and store a pointer to it in the regex.

qr// instantiation and execution work unchanged.

op.c
pp_ctl.c
regcomp.c
t/re/pat_re_eval.t

diff --git a/op.c b/op.c
index ab08830..db73470 100644 (file)
--- a/op.c
+++ b/op.c
@@ -4457,6 +4457,7 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg, I32 floor)
     else {
        /* runtime pattern: build chain of regcomp etc ops */
        bool reglist;
+       PADOFFSET cv_targ = 0;
 
        reglist = isreg && expr->op_type == OP_LIST;
        if (reglist)
@@ -4509,8 +4510,13 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg, I32 floor)
             */
 
            SvREFCNT_inc_simple_void(PL_compcv);
-           expr = list(force_list(newUNOP(OP_ENTERSUB, 0,
-               scalar(newANONATTRSUB(floor, NULL, NULL, expr)))));
+           /* these lines are just an unrolled newANONATTRSUB */
+           expr = newSVOP(OP_ANONCODE, 0,
+                   MUTABLE_SV(newATTRSUB(floor, 0, NULL, NULL, expr)));
+           cv_targ = expr->op_targ;
+           expr = newUNOP(OP_REFGEN, 0, expr);
+
+           expr = list(force_list(newUNOP(OP_ENTERSUB, 0, scalar(expr))));
        }
 
        NewOp(1101, rcop, 1, LOGOP);
@@ -4522,6 +4528,7 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg, I32 floor)
                            | (reglist ? OPf_STACKED : 0);
        rcop->op_private = 0;
        rcop->op_other = o;
+       rcop->op_targ = cv_targ;
 
        /* /$x/ may cause an eval, since $x might be qr/(?{..})/  */
        if (PL_hints & HINT_RE_EVAL) PL_cv_has_eval = 1;
index a82dcbb..b40733f 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -114,7 +114,10 @@ PP(pp_regcomp)
        PL_reginterp_cnt = (I32_MAX>>1); /* Mark as safe.  */
 
     new_re = re_op_compile(args, nargs, pm->op_code_list, eng, re,
-               &is_bare_re, (pm->op_pmflags & RXf_PMf_COMPILETIME));
+               &is_bare_re,
+               (pm->op_pmflags & (RXf_PMf_COMPILETIME|PMf_HAS_CV)));
+    if (pm->op_pmflags & PMf_HAS_CV)
+       ((struct regexp *)SvANY(new_re))->qr_anoncv = PAD_SV(PL_op->op_targ);
 
     if (is_bare_re) {
        REGEXP *tmp;
index 6c16e8e..25ffc00 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -5189,18 +5189,50 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
        }
 
        if (pat_count > 1) {
-           /* concat multiple args */
+           /* concat multiple args and find any code block indexes */
+
+           OP *o = NULL;
+           int n = 0;
+
+           if (pRExC_state->num_code_blocks) {
+               o = cLISTOPx(expr)->op_first;
+               assert(o->op_type == OP_PUSHMARK);
+               o = o->op_sibling;
+           }
 
-           pRExC_state->num_code_blocks = 0; /* XXX tmp */
            pat = newSVpvn("", 0);
            SAVEFREESV(pat);
            for (svp = patternp; svp < patternp + pat_count; svp++) {
                SV *sv, *msv = *svp;
+               bool code = 0;
+               if (o) {
+                   if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
+                       n++;
+                       assert(n <= pRExC_state->num_code_blocks);
+                       pRExC_state->code_blocks[n-1].start = SvCUR(pat);
+                       pRExC_state->code_blocks[n-1].block = o;
+                       code = 1;
+                       o = o->op_sibling; /* skip CONST */
+                       assert(o);
+                   }
+                   o = o->op_sibling;;
+               }
+
                if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
                        (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+               {
                    sv_setsv(pat, sv);
-               else
+                   /* overloading involved: all bets are off over literal
+                    * code. Pretend we haven't seen it */
+                   pRExC_state->num_code_blocks -= n;
+                   n = 0;
+
+               }
+               else {
                    sv_catsv_nomg(pat, msv);
+                   if (code)
+                       pRExC_state->code_blocks[n-1].end = SvCUR(pat);
+               }
            }
            SvSETMAGIC(pat);
        }
index 78b2e41..e9bb50e 100644 (file)
@@ -400,12 +400,12 @@ sub run_tests {
            # literal qr code only created once, embedded with text
 
            $cr2 //= qr/B(??{$x})$/;
-           tok(0,   "ABa" =~ /^A$cr2/, "[$x] literal qr once embedded text");
+           ok("ABa" =~ /^A$cr2/, "[$x] literal qr once embedded text");
 
            # literal qr code only created once, embedded with text + lit code
 
            $cr3 //= qr/C(??{$x})$/;
-           tok($bc, "A$x-BCa" =~ /^A(??{$x})-B$cr3/,
+           ok("A$x-BCa" =~ /^A(??{$x})-B$cr3/,
                            "[$x] literal qr once embedded text + lit code");
 
            # literal qr code only created once, embedded with text + run code
@@ -451,12 +451,12 @@ sub run_tests {
 
                my $rr5 = qr/^A(??{"$x$y"})-$r5/;
                push @rr5, $rr5;
-               tok("$x$y" ne "ad", "A$x$y-C$x" =~ $rr5,
+               tok($bc, "A$x$y-C$x" =~ $rr5,
                                "[$x-$y] literal qr + r5");
 
                my $rr6 = qr/^A(??{"$x$y"})-$r6/;
                push @rr6, $rr6;
-               tok("$x$y" ne "ad", "A$x$y-$x-C$x" =~ $rr6,
+               tok($bc, "A$x$y-$x-C$x" =~ $rr6,
                                "[$x-$y] literal qr + r6");
            }
 
@@ -464,14 +464,14 @@ sub run_tests {
                my $y = 'Y';
                my $yy = (qw(d e f))[$i];
                my $rr5 = $rr5[$i];
-               tok("$x$yy" ne "ad", "A$x$yy-C$x" =~ $rr5,
+               tok($bc, "A$x$yy-C$x" =~ $rr5,
                                "[$x-$yy] literal qr + r5, outside");
                tok(1,               "A$x$yy-C$x-D$x" =~ /$rr5-D(??{$x})/,
                                "[$x-$yy] literal qr + r5 + lit, outside");
 
                my $rr6 = $rr6[$i];
                push @rr6, $rr6;
-               tok("$x$yy" ne "ad", "A$x$yy-$x-C$x" =~ $rr6,
+               tok($bc, "A$x$yy-$x-C$x" =~ $rr6,
                                "[$x-$yy] literal qr + r6, outside");
                tok(1,               "A$x$yy-$x-C$x-D$x" =~ /$rr6-D(??{$x})/,
                                "[$x-$yy] literal qr + r6 +lit, outside");