This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perl5171delta: typo
[perl5.git] / regcomp.c
index b9f9e41..acecaa9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -133,7 +133,6 @@ typedef struct RExC_state_t {
     I32                nestroot;               /* root parens we are in - used by accept */
     I32                extralen;
     I32                seen_zerolen;
-    I32                seen_evals;
     regnode    **open_parens;          /* pointers to open parens */
     regnode    **close_parens;         /* pointers to close parens */
     regnode    *opend;                 /* END node in program */
@@ -194,7 +193,6 @@ typedef struct RExC_state_t {
 #define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen  (pRExC_state->extralen)
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
-#define RExC_seen_evals        (pRExC_state->seen_evals)
 #define RExC_utf8      (pRExC_state->utf8)
 #define RExC_uni_semantics     (pRExC_state->uni_semantics)
 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
@@ -5043,7 +5041,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
        int n = 0;
        STRLEN s;
        char *p, *newpat;
-       int newlen = plen + 5; /* allow for "qr''x" extra chars */
+       int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
        SV *sv, *qr_ref;
        dSP;
 
@@ -5234,7 +5232,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
 REGEXP *
 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                    OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
-                    int *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
+                    bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
 {
     dVAR;
     REGEXP *rx;
@@ -5336,7 +5334,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     pRExC_state->num_code_blocks = 0;
 
     if (is_bare_re)
-       *is_bare_re = 0;
+       *is_bare_re = FALSE;
 
     if (expr && (expr->op_type == OP_LIST ||
                (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
@@ -5385,6 +5383,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
            OP *o = NULL;
            int n = 0;
            bool utf8 = 0;
+            STRLEN orig_patlen = 0;
 
            if (pRExC_state->num_code_blocks) {
                o = cLISTOPx(expr)->op_first;
@@ -5426,11 +5425,36 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                    o = o->op_sibling;;
                }
 
+               if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+                       (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+               {
+                   sv_setsv(pat, sv);
+                   /* overloading involved: all bets are off over literal
+                    * code. Pretend we haven't seen it */
+                   pRExC_state->num_code_blocks -= n;
+                   n = 0;
+                    rx = NULL;
+
+               }
+               else  {
+                    while (SvAMAGIC(msv)
+                            && (sv = AMG_CALLunary(msv, string_amg))
+                            && sv != msv)
+                    {
+                        msv = sv;
+                        SvGETMAGIC(msv);
+                    }
+                    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+                        msv = SvRV(msv);
+                    orig_patlen = SvCUR(pat);
+                    sv_catsv_nomg(pat, msv);
+                    rx = msv;
+                    if (code)
+                        pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+                }
+
                /* extract any code blocks within any embedded qr//'s */
-               rx = msv;
-               if (SvROK(rx))
-                   rx = SvRV(rx);
-               if (SvTYPE(rx) == SVt_REGEXP
+               if (rx && SvTYPE(rx) == SVt_REGEXP
                    && RX_ENGINE((REGEXP*)rx)->op_comp)
                {
 
@@ -5448,7 +5472,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                        pRExC_state->num_code_blocks += ri->num_code_blocks;
                        for (i=0; i < ri->num_code_blocks; i++) {
                            struct reg_code_block *src, *dst;
-                           STRLEN offset =  SvCUR(pat)
+                           STRLEN offset =  orig_patlen
                                + ((struct regexp *)SvANY(rx))->pre_prefix;
                            assert(n < pRExC_state->num_code_blocks);
                            src = &ri->code_blocks[i];
@@ -5464,32 +5488,20 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                        }
                    }
                }
-
-               if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
-                       (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
-               {
-                   sv_setsv(pat, sv);
-                   /* overloading involved: all bets are off over literal
-                    * code. Pretend we haven't seen it */
-                   pRExC_state->num_code_blocks -= n;
-                   n = 0;
-
-               }
-               else {
-                   if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) {
-                       msv = SvRV(msv);
-                       PL_reginterp_cnt +=
-                           RX_SEEN_EVALS((REGEXP *)MUTABLE_PTR(msv));
-                   }
-                   sv_catsv_nomg(pat, msv);
-                   if (code)
-                       pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
-               }
            }
            SvSETMAGIC(pat);
        }
-       else
+       else {
+            SV *sv;
            pat = *patternp;
+            while (SvAMAGIC(pat)
+                    && (sv = AMG_CALLunary(pat, string_amg))
+                    && sv != pat)
+            {
+                pat = sv;
+                SvGETMAGIC(pat);
+            }
+        }
 
        /* handle bare regex: foo =~ $re */
        {
@@ -5498,7 +5510,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                re = SvRV(re);
            if (SvTYPE(re) == SVt_REGEXP) {
                if (is_bare_re)
-                   *is_bare_re = 1;
+                   *is_bare_re = TRUE;
                SvREFCNT_inc(re);
                Safefree(pRExC_state->code_blocks);
                return (REGEXP*)re;
@@ -5718,7 +5730,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_seen = 0;
     RExC_in_lookbehind = 0;
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-    RExC_seen_evals = 0;
     RExC_extralen = 0;
     RExC_override_recoding = 0;
 
@@ -5919,8 +5930,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_emit_bound = ri->program + RExC_size + 1;
     pRExC_state->code_index = 0;
 
-    /* Store the count of eval-groups for security checks: */
-    RExC_rx->seen_evals = RExC_seen_evals;
     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
     if (reg(pRExC_state, 0, &flags,1) == NULL) {
        ReREFCNT_dec(rx);   
@@ -6073,7 +6082,7 @@ reStudy:
        else if ((!sawopen || !RExC_sawback) &&
            (OP(first) == STAR &&
            PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
-           !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
+           !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
        {
            /* turn .* into ^.* with an implied $*=1 */
            const int type =
@@ -6086,7 +6095,7 @@ reStudy:
            goto again;
        }
        if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
-           && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
+           && !pRExC_state->num_code_blocks) /* May examine pos and $& */
            /* x+ must match at the 1st pos of run of x's */
            r->intflags |= PREGf_SKIP;
 
@@ -6359,7 +6368,7 @@ reStudy:
        r->extflags |= RXf_GPOS_SEEN;
     if (RExC_seen & REG_SEEN_LOOKBEHIND)
        r->extflags |= RXf_LOOKBEHIND_SEEN;
-    if (RExC_seen & REG_SEEN_EVAL)
+    if (pRExC_state->num_code_blocks)
        r->extflags |= RXf_EVAL_SEEN;
     if (RExC_seen & REG_SEEN_CANY)
        r->extflags |= RXf_CANY_SEEN;
@@ -6367,6 +6376,8 @@ reStudy:
        r->intflags |= PREGf_VERBARG_SEEN;
     if (RExC_seen & REG_SEEN_CUTGROUP)
        r->intflags |= PREGf_CUTGROUP_SEEN;
+    if (pm_flags & PMf_USE_RE_EVAL)
+       r->intflags |= PREGf_USE_RE_EVAL;
     if (RExC_paren_names)
         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
     else
@@ -6879,7 +6890,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
                       (unsigned long) flags);
         }
-        /* NOT REACHED */
+        assert(0); /* NOT REACHED */
     }
     return NULL;
 }
@@ -8493,7 +8504,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
                 }
                 goto gen_recurse_regop;
-                /* NOT REACHED */
+                assert(0); /* NOT REACHED */
             case '+':
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse++;
@@ -8563,7 +8574,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 nextchar(pRExC_state);
                 return ret;
             } /* named and numeric backreferences */
-            /* NOT REACHED */
+            assert(0); /* NOT REACHED */
 
            case '?':           /* (??...) */
                is_logical = 1;
@@ -8581,7 +8592,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                struct reg_code_block *cb;
 
                RExC_seen_zerolen++;
-               RExC_seen |= REG_SEEN_EVAL;
 
                if (   !pRExC_state->num_code_blocks
                    || pRExC_state->code_index >= pRExC_state->num_code_blocks
@@ -8596,9 +8606,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                /* this is a pre-compiled code block (?{...}) */
                cb = &pRExC_state->code_blocks[pRExC_state->code_index];
                RExC_parse = RExC_start + cb->end;
-               if (SIZE_ONLY)
-                   RExC_seen_evals++;
-               else {
+               if (!SIZE_ONLY) {
                    OP *o = cb->block;
                    if (cb->src_regex) {
                        n = add_data(pRExC_state, 2, "rl");
@@ -8616,10 +8624,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                nextchar(pRExC_state);
 
                if (is_logical) {
+                    regnode *eval;
                    ret = reg_node(pRExC_state, LOGICAL);
-                   if (!SIZE_ONLY)
+                    eval = reganode(pRExC_state, EVAL, n);
+                   if (!SIZE_ONLY) {
                        ret->flags = 2;
-                    REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+                        /* for later propagation into (??{}) return value */
+                        eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
+                    }
+                    REGTAIL(pRExC_state, ret, eval);
                     /* deal with the length of this later - MJD */
                    return ret;
                }
@@ -9149,7 +9162,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        }
        else
            FAIL("Junk on end of regexp");      /* "Can't happen". */
-       /* NOTREACHED */
+       assert(0); /* NOTREACHED */
     }
 
     if (RExC_in_lookbehind) {
@@ -9698,8 +9711,46 @@ S_reg_recode(pTHX_ const char value, SV **encp)
    sequence, we return.
 
    Note: we have to be careful with escapes, as they can be both literal
-   and special, and in the case of \10 and friends can either, depending
-   on context. Specifically there are two separate switches for handling
+   and special, and in the case of \10 and friends, context determines which.
+
+   A summary of the code structure is:
+
+   switch (first_byte) {
+       cases for each special:
+           handle this special;
+           break;
+       case '\\':
+           switch (2nd byte) {
+               cases for each unambiguous special:
+                   handle this special;
+                   break;
+               cases for each ambigous special/literal:
+                   disambiguate;
+                   if (special)  handle here
+                   else goto defchar;
+               default: // unambiguously literal:
+                   goto defchar;
+           }
+       default:  // is a literal char
+           // FALL THROUGH
+       defchar:
+           create EXACTish node for literal;
+           while (more input and node isn't full) {
+               switch (input_byte) {
+                  cases for each special;
+                       make sure parse pointer is set so that the next call to
+                           regatom will see this special first
+                       goto loopdone; // EXACTish node terminated by prev. char
+                  default:
+                      append char to EXACTISH node;
+               }
+               get next input byte;
+           }
+        loopdone:
+   }
+   return the generated node;
+
+   Specifically there are two separate switches for handling
    escape sequences, with the one for handling literal escapes requiring
    a dummy entry for all of the special escapes that are actually handled
    by the other.
@@ -10168,6 +10219,7 @@ tryagain:
                         vFAIL("Reference to nonexistent or unclosed group");
                 }
                if (!isg && num > 9 && num >= RExC_npar)
+                    /* Probably a character specified in octal, e.g. \35 */
                    goto defchar;
                else {
                    char * const parse_start = RExC_parse - 1; /* MJD */
@@ -10382,38 +10434,39 @@ tryagain:
                            break;
                        }
                    case 'x':
-                       if (*++p == '{') {
-                           char* const e = strchr(p, '}');
+                       {
+                           STRLEN brace_len = len;
+                           UV result;
+                           const char* error_msg;
 
-                           if (!e) {
-                               RExC_parse = p + 1;
-                               vFAIL("Missing right brace on \\x{}");
+                           bool valid = grok_bslash_x(p,
+                                                      &result,
+                                                      &brace_len,
+                                                      &error_msg,
+                                                      1);
+                           p += brace_len;
+                           if (! valid) {
+                               RExC_parse = p; /* going to die anyway; point
+                                                  to exact spot of failure */
+                               vFAIL(error_msg);
                            }
                            else {
-                                I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-                                    | PERL_SCAN_DISALLOW_PREFIX;
-                                STRLEN numlen = e - p - 1;
-                               ender = grok_hex(p + 1, &numlen, &flags, NULL);
-                               if (ender > 0xff)
-                                   REQUIRE_UTF8;
-                               p = e + 1;
+                               ender = result;
                            }
+                           if (PL_encoding && ender < 0x100) {
+                               goto recode_encoding;
+                           }
+                           if (ender > 0xff) {
+                               REQUIRE_UTF8;
+                           }
+                           break;
                        }
-                       else {
-                            I32 flags = PERL_SCAN_DISALLOW_PREFIX;
-                           STRLEN numlen = 2;
-                           ender = grok_hex(p, &numlen, &flags, NULL);
-                           p += numlen;
-                       }
-                       if (PL_encoding && ender < 0x100)
-                           goto recode_encoding;
-                       break;
                    case 'c':
                        p++;
                        ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
                        break;
                    case '0': case '1': case '2': case '3':case '4':
-                   case '5': case '6': case '7': case '8':case '9':
+                   case '5': case '6': case '7':
                        if (*p == '0' ||
                            (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
                        {
@@ -10446,7 +10499,7 @@ tryagain:
                            FAIL("Trailing \\");
                        /* FALL THROUGH */
                    default:
-                       if (!SIZE_ONLY&& isALPHA(*p)) {
+                       if (!SIZE_ONLY&& isALNUMC(*p)) {
                            ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
                        }
                        goto normal_default;
@@ -11490,22 +11543,18 @@ parseit:
                }
                break;
            case 'x':
-               if (*RExC_parse == '{') {
-                    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-                        | PERL_SCAN_DISALLOW_PREFIX;
-                   char * const e = strchr(RExC_parse++, '}');
-                    if (!e)
-                        vFAIL("Missing right brace on \\x{}");
-
-                   numlen = e - RExC_parse;
-                   value = grok_hex(RExC_parse, &numlen, &flags, NULL);
-                   RExC_parse = e + 1;
-               }
-               else {
-                    I32 flags = PERL_SCAN_DISALLOW_PREFIX;
-                   numlen = 2;
-                   value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+               RExC_parse--;   /* function expects to be pointed at the 'x' */
+               {
+                   const char* error_msg;
+                   bool valid = grok_bslash_x(RExC_parse,
+                                              &value,
+                                              &numlen,
+                                              &error_msg,
+                                              1);
                    RExC_parse += numlen;
+                   if (! valid) {
+                       vFAIL(error_msg);
+                   }
                }
                if (PL_encoding && value < 0x100)
                    goto recode_encoding;
@@ -13912,8 +13961,6 @@ Perl_save_re_context(pTHX)
 
     Copy(&PL_reg_state, state, 1, struct re_save_state);
 
-    PL_reg_start_tmp = 0;
-    PL_reg_start_tmpl = 0;
     PL_reg_oldsaved = NULL;
     PL_reg_oldsavedlen = 0;
     PL_reg_maxiter = 0;