This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Deparse: better handle BEGIN { use_ok() }
[perl5.git] / regcomp.c
index 322d230..97c5949 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -119,8 +119,7 @@ typedef struct scan_frame {
 
 /* Certain characters are output as a sequence with the first being a
  * backslash. */
-#define isBACKSLASHED_PUNCT(c)                                              \
-                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+#define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
 
 
 struct RExC_state_t {
@@ -818,13 +817,6 @@ static const scan_data_t zero_scan_data =
                                        REPORT_LOCATION_ARGS(loc));      \
 } STMT_END
 
-#define        vWARN4dep(loc, m, a1, a2, a3) STMT_START {                             \
-    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN2(WARN_REGEXP,WARN_DEPRECATED), \
-                                       m REPORT_LOCATION,                      \
-                                      a1, a2, a3,                             \
-                                       REPORT_LOCATION_ARGS(loc));             \
-} STMT_END
-
 #define        ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
     __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),        \
                                           m REPORT_LOCATION,            \
@@ -2362,8 +2354,9 @@ is the recommended Unicode-aware way of saying
 
 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
-       U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
+       U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
        Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
+        TRIE_LIST_LEN( state ) = ging;                          \
     }                                                           \
     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
@@ -5218,15 +5211,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                            However, this time it's not a subexpression
                            we care about, but the expression itself. */
                         && (maxcount == REG_INFTY)
-                        && data && ++data->whilem_c < 16) {
+                        && data) {
                    /* This stays as CURLYX, we can put the count/of pair. */
                    /* Find WHILEM (as in regexec.c) */
                    regnode *nxt = oscan + NEXT_OFF(oscan);
 
                    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
                        nxt += ARG(nxt);
-                   PREVOPER(nxt)->flags = (U8)(data->whilem_c
-                       | (RExC_whilem_seen << 4)); /* On WHILEM */
+                    nxt = PREVOPER(nxt);
+                    if (nxt->flags & 0xf) {
+                        /* we've already set whilem count on this node */
+                    } else if (++data->whilem_c < 16) {
+                        assert(data->whilem_c <= RExC_whilem_seen);
+                        nxt->flags = (U8)(data->whilem_c
+                            | (RExC_whilem_seen << 4)); /* On WHILEM */
+                    }
                }
                if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
                    pars++;
@@ -6133,10 +6132,13 @@ S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
 {
     int n;
 
-    if (cbs->attached)
+    if (--cbs->refcnt > 0)
         return;
-    for (n = 0; n < cbs->count; n++)
-        SvREFCNT_dec(cbs->cb[n].src_regex);
+    for (n = 0; n < cbs->count; n++) {
+        REGEXP *rx = cbs->cb[n].src_regex;
+        cbs->cb[n].src_regex = NULL;
+        SvREFCNT_dec(rx);
+    }
     Safefree(cbs->cb);
     Safefree(cbs);
 }
@@ -6148,7 +6150,7 @@ S_alloc_code_blocks(pTHX_  int ncode)
      struct reg_code_blocks *cbs;
     Newx(cbs, 1, struct reg_code_blocks);
     cbs->count = ncode;
-    cbs->attached = FALSE;
+    cbs->refcnt = 1;
     SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
     if (ncode)
         Newx(cbs->cb, ncode, struct reg_code_block);
@@ -6184,7 +6186,9 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
 
     while (s < *plen_p) {
         append_utf8_from_native_byte(src[s], &d);
+
         if (n < num_code_blocks) {
+            assert(pRExC_state->code_blocks);
             if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
                 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
                 assert(*(d - 1) == '(');
@@ -6412,10 +6416,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                  * different closure than last time */
                 *recompile_p = 1;
                 if (pRExC_state->code_blocks) {
-                    pRExC_state->code_blocks->count += ri->code_blocks->count;
+                    int new_count = pRExC_state->code_blocks->count
+                            + ri->code_blocks->count;
                     Renew(pRExC_state->code_blocks->cb,
-                            pRExC_state->code_blocks->count,
-                            struct reg_code_block);
+                            new_count, struct reg_code_block);
+                    pRExC_state->code_blocks->count = new_count;
                 }
                 else
                     pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
@@ -6526,7 +6531,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
        int n = 0;
        STRLEN s;
        char *p, *newpat;
-       int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
+       int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
        SV *sv, *qr_ref;
        dSP;
 
@@ -6592,7 +6597,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
            SV * const errsv = ERRSV;
            if (SvTRUE_NN(errsv))
                 /* use croak_sv ? */
-               Perl_croak_nocontext("%"SVf, SVfARG(errsv));
+               Perl_croak_nocontext("%" SVf, SVfARG(errsv));
        }
        assert(SvROK(qr_ref));
        qr = SvRV(qr_ref);
@@ -7168,8 +7173,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     if (pm_flags & PMf_IS_QR) {
        ri->code_blocks = pRExC_state->code_blocks;
        if (ri->code_blocks)
-            /* disarm earlier SAVEDESTRUCTOR_X */
-            ri->code_blocks->attached = TRUE;
+            ri->code_blocks->refcnt++;
     }
 
     {
@@ -7789,6 +7793,18 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     while ( RExC_recurse_count > 0 ) {
         const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
+        /*
+         * This data structure is set up in study_chunk() and is used
+         * to calculate the distance between a GOSUB regopcode and
+         * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
+         * it refers to.
+         *
+         * If for some reason someone writes code that optimises
+         * away a GOSUB opcode then the assert should be changed to
+         * an if(scan) to guard the ARG2L_SET() - Yves
+         *
+         */
+        assert(scan && OP(scan) == GOSUB);
         ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan );
     }
 
@@ -7879,21 +7895,18 @@ SV*
 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
                          const U32 flags)
 {
-    AV *retarray = NULL;
     SV *ret;
     struct regexp *const rx = ReANY(r);
 
     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
 
-    if (flags & RXapif_ALL)
-        retarray=newAV();
-
     if (rx && RXp_PAREN_NAMES(rx)) {
         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
         if (he_str) {
             IV i;
             SV* sv_dat=HeVAL(he_str);
             I32 *nums=(I32*)SvPVX(sv_dat);
+            AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
             for ( i=0; i<SvIVX(sv_dat); i++ ) {
                 if ((I32)(rx->nparens) >= nums[i]
                     && rx->offs[nums[i]].start != -1
@@ -10241,7 +10254,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
             {
                 AV* list = (AV*) *listp;
                 IV k;
-                for (k = 0; k <= av_tindex_nomg(list); k++) {
+                for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
                     SV** c_p = av_fetch(list, k, FALSE);
                     UV c;
                     assert(c_p);
@@ -11709,11 +11722,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            nextchar(pRExC_state);
             if (max < min) {    /* If can't match, warn and optimize to fail
                                    unconditionally */
+                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
                 if (PASS2) {
                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+                    NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
                 }
-                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
-                NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
                 return ret;
             }
             else if (min == max && *RExC_parse == '?')
@@ -12004,7 +12017,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
 
     RExC_parse++;      /* Skip past the '{' */
 
-    if (! (endbrace = strchr(RExC_parse, '}'))) { /* no trailing brace */
+    endbrace = strchr(RExC_parse, '}');
+    if (! endbrace) { /* no trailing brace */
         vFAIL2("Missing right brace on \\%c{}", 'N');
     }
     else if(!(endbrace == RExC_parse           /* nothing between the {} */
@@ -12385,6 +12399,52 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
     }
 }
 
+STATIC bool
+S_new_regcurly(const char *s, const char *e)
+{
+    /* This is a temporary function designed to match the most lenient form of
+     * a {m,n} quantifier we ever envision, with either number omitted, and
+     * spaces anywhere between/before/after them.
+     *
+     * If this function fails, then the string it matches is very unlikely to
+     * ever be considered a valid quantifier, so we can allow the '{' that
+     * begins it to be considered as a literal */
+
+    bool has_min = FALSE;
+    bool has_max = FALSE;
+
+    PERL_ARGS_ASSERT_NEW_REGCURLY;
+
+    if (s >= e || *s++ != '{')
+       return FALSE;
+
+    while (s < e && isSPACE(*s)) {
+        s++;
+    }
+    while (s < e && isDIGIT(*s)) {
+        has_min = TRUE;
+        s++;
+    }
+    while (s < e && isSPACE(*s)) {
+        s++;
+    }
+
+    if (*s == ',') {
+       s++;
+        while (s < e && isSPACE(*s)) {
+            s++;
+        }
+        while (s < e && isDIGIT(*s)) {
+            has_max = TRUE;
+            s++;
+        }
+        while (s < e && isSPACE(*s)) {
+            s++;
+        }
+    }
+
+    return s < e && *s == '}' && (has_min || has_max);
+}
 
 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
@@ -12819,6 +12879,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* FALLTHROUGH */
 
           finish_meta_pat:
+            if (   UCHARAT(RExC_parse + 1) == '{'
+                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+            {
+                RExC_parse += 2;
+                vFAIL("Unescaped left brace in regex is illegal here");
+            }
            nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
            break;
@@ -13368,13 +13434,36 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    } /* End of switch on '\' */
                    break;
                case '{':
-                   /* Currently we don't care if the lbrace is at the start
-                    * of a construct.  This catches it in the middle of a
-                    * literal string, or when it's the first thing after
-                    * something like "\b" */
-                   if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
-                        RExC_parse = p + 1;
-                       vFAIL("Unescaped left brace in regex is illegal here");
+                    /* Currently we allow an lbrace at the start of a construct
+                     * without raising a warning.  This is because we think we
+                     * will never want such a brace to be meant to be other
+                     * than taken literally. */
+                   if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
+
+                        /* But, we raise a fatal warning otherwise, as the
+                         * deprecation cycle has come and gone.  Except that it
+                         * turns out that some heavily-relied on upstream
+                         * software, notably GNU Autoconf, have failed to fix
+                         * their uses.  For these, don't make it fatal unless
+                         * we anticipate using the '{' for something else.
+                         * This happens after any alpha, and for a looser {m,n}
+                         * quantifier specification */
+                        if (      RExC_strict
+                            || (  p > parse_start + 1
+                                && isALPHA_A(*(p - 1))
+                                && *(p - 2) == '\\')
+                            || new_regcurly(p, RExC_end))
+                        {
+                            RExC_parse = p + 1;
+                            vFAIL("Unescaped left brace in regex is "
+                                  "illegal here");
+                        }
+                        if (PASS2) {
+                            ckWARNregdep(p + 1,
+                                        "Unescaped left brace in regex is "
+                                        "deprecated here (and will be fatal "
+                                        "in Perl 5.30), passed through");
+                        }
                    }
                    goto normal_default;
                 case '}':
@@ -13413,10 +13502,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * this character again next time through, when it will be the
                  * only thing in its new node */
 
-                if ((next_is_quantifier = (   LIKELY(p < RExC_end)
-                                           && UNLIKELY(ISMULT2(p))))
-                    && LIKELY(len))
-               {
+                next_is_quantifier =    LIKELY(p < RExC_end)
+                                     && UNLIKELY(ISMULT2(p));
+
+                if (next_is_quantifier && LIKELY(len)) {
                     p = oldp;
                     goto loopdone;
                 }
@@ -14858,7 +14947,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
       no_close:
         /* We output the messages even if warnings are off, because we'll fail
          * the very next thing, and these give a likely diagnosis for that */
-        if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+        if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
         }
 
@@ -14973,7 +15062,7 @@ redo_curchar:
                                            stack, fence, fence_stack));
 #endif
 
-        top_index = av_tindex_nomg(stack);
+        top_index = av_tindex_skip_len_mg(stack);
 
         switch (curchar) {
             SV** stacked_ptr;       /* Ptr to something already on 'stack' */
@@ -15151,7 +15240,7 @@ redo_curchar:
                 goto done;
 
             case ')':
-                if (av_tindex_nomg(fence_stack) < 0) {
+                if (av_tindex_skip_len_mg(fence_stack) < 0) {
                     RExC_parse++;
                     vFAIL("Unexpected ')'");
                 }
@@ -15347,7 +15436,7 @@ redo_curchar:
              * may have altered the stack in the time since we earlier set
              * 'top_index'.  */
 
-            top_index = av_tindex_nomg(stack);
+            top_index = av_tindex_skip_len_mg(stack);
             if (top_index - fence >= 0) {
                 /* If the top entry on the stack is an operator, it had better
                  * be a '!', otherwise the entry below the top operand should
@@ -15398,15 +15487,15 @@ redo_curchar:
     } /* End of loop parsing through the construct */
 
   done:
-    if (av_tindex_nomg(fence_stack) >= 0) {
+    if (av_tindex_skip_len_mg(fence_stack) >= 0) {
         vFAIL("Unmatched (");
     }
 
-    if (av_tindex_nomg(stack) < 0   /* Was empty */
+    if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
         || ((final = av_pop(stack)) == NULL)
         || ! IS_OPERAND(final)
         || SvTYPE(final) != SVt_INVLIST
-        || av_tindex_nomg(stack) >= 0)  /* More left on stack */
+        || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
     {
       bad_syntax:
         SvREFCNT_dec(final);
@@ -15509,8 +15598,8 @@ S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
                              AV * stack, const IV fence, AV * fence_stack)
 {   /* Dumps the stacks in handle_regex_sets() */
 
-    const SSize_t stack_top = av_tindex_nomg(stack);
-    const SSize_t fence_stack_top = av_tindex_nomg(fence_stack);
+    const SSize_t stack_top = av_tindex_skip_len_mg(stack);
+    const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
     SSize_t i;
 
     PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
@@ -15964,7 +16053,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     while (1) {
 
         if (   posix_warnings
-            && av_tindex_nomg(posix_warnings) >= 0
+            && av_tindex_skip_len_mg(posix_warnings) >= 0
             && RExC_parse > not_posix_region_end)
         {
             /* Warnings about posix class issues are considered tentative until
@@ -16020,7 +16109,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  * posix class, and it failed, it was a false alarm, as this
                  * successful one proves */
                 if (   posix_warnings
-                    && av_tindex_nomg(posix_warnings) >= 0
+                    && av_tindex_skip_len_mg(posix_warnings) >= 0
                     && not_posix_region_end >= RExC_parse
                     && not_posix_region_end <= posix_class_end)
                 {
@@ -16911,22 +17000,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     literal[d++] = (char) value;
                     literal[d++] = '\0';
 
-                    vWARN4dep(RExC_parse,
-                           "\"%.*s\" is more clearly written simply as \"%s\". "
-                           "This will be a fatal error in Perl 5.28",
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
                            (int) (RExC_parse - rangebegin),
                            rangebegin,
                            literal
-                    );
+                        );
                 }
                 else if isMNEMONIC_CNTRL(value) {
-                    vWARN4dep(RExC_parse,
-                           "\"%.*s\" is more clearly written simply as \"%s\". "
-                           "This will be a fatal error in Perl 5.28",
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
                            (int) (RExC_parse - rangebegin),
                            rangebegin,
                            cntrl_to_mnemonic((U8) value)
-                    );
+                        );
                 }
             }
         }
@@ -16979,7 +17066,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     } /* End of loop through all the text within the brackets */
 
 
-    if (   posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
+    if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
         output_or_return_posix_warnings(pRExC_state, posix_warnings,
                                         return_posix_warnings);
     }
@@ -17012,7 +17099,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 #endif
 
         /* Look at the longest folds first */
-        for (cp_count = av_tindex_nomg(multi_char_matches);
+        for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
                         cp_count > 0;
                         cp_count--)
         {
@@ -17394,7 +17481,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     {
                         AV* list = (AV*) *listp;
                         IV k;
-                        for (k = 0; k <= av_tindex_nomg(list); k++) {
+                        for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
                             SV** c_p = av_fetch(list, k, FALSE);
                             UV c;
                             assert(c_p);
@@ -18085,7 +18172,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
 
            si = *ary;  /* ary[0] = the string to initialize the swash with */
 
-            if (av_tindex_nomg(av) >= 2) {
+            if (av_tindex_skip_len_mg(av) >= 2) {
                 if (only_utf8_locale_ptr
                     && ary[2]
                     && ary[2] != &PL_sv_undef)
@@ -18101,7 +18188,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                  * is any inversion list generated at compile time; [4]
                  * indicates if that inversion list has any user-defined
                  * properties in it. */
-                if (av_tindex_nomg(av) >= 3) {
+                if (av_tindex_skip_len_mg(av) >= 3) {
                     invlist = ary[3];
                     if (SvUV(ary[4])) {
                         swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
@@ -18496,7 +18583,8 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const
 * set up NEXT_OFF() of the inserted node if needed. Something like this:
 *
 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
-* NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE;
+* if (PASS2)
+*     NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
 *
 */
 STATIC void
@@ -19520,10 +19608,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
     if (ri->u.offsets)
         Safefree(ri->u.offsets);             /* 20010421 MJD */
 #endif
-    if (ri->code_blocks) {
-        ri->code_blocks->attached = FALSE;
+    if (ri->code_blocks)
         S_free_codeblocks(aTHX_ ri->code_blocks);
-    }
 
     if (ri->data) {
        int n = ri->data->count;
@@ -19751,7 +19837,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
             reti->code_blocks->cb[n].src_regex = (REGEXP*)
                    sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
         reti->code_blocks->count = ri->code_blocks->count;
-        reti->code_blocks->attached = TRUE;
+        reti->code_blocks->refcnt = 1;
     }
     else
        reti->code_blocks = NULL;