This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
revert a trailing whitespace removal
[perl5.git] / regcomp.c
index eefc2cc..acd6e9b 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1555,7 +1555,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     if (!SvIOK(re_trie_maxbuff)) {
         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
     }
-    DEBUG_OPTIMISE_r({
+    DEBUG_TRIE_COMPILE_r({
                 PerlIO_printf( Perl_debug_log,
                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
                   (int)depth * 2 + 2, "", 
@@ -1597,9 +1597,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
      */
 
     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
-        regnode * const noper = NEXTOPER( cur );
+        regnode *noper = NEXTOPER( cur );
         const U8 *uc = (U8*)STRING( noper );
-        const U8 * const e  = uc + STR_LEN( noper );
+        const U8 *e  = uc + STR_LEN( noper );
         STRLEN foldlen = 0;
         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
         STRLEN skiplen = 0;
@@ -1609,9 +1609,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
 
         if (OP(noper) == NOTHING) {
-            trie->minlen= 0;
-            continue;
+            regnode *noper_next= regnext(noper);
+            if (noper_next != tail && OP(noper_next) == flags) {
+                noper = noper_next;
+                uc= (U8*)STRING(noper);
+                e= uc + STR_LEN(noper);
+               trie->minlen= STR_LEN(noper);
+            } else {
+               trie->minlen= 0;
+               continue;
+           }
         }
+
         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
                                           regardless of encoding */
@@ -1750,9 +1759,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
 
-           regnode * const noper = NEXTOPER( cur );
+            regnode *noper   = NEXTOPER( cur );
            U8 *uc           = (U8*)STRING( noper );
-           const U8 * const e = uc + STR_LEN( noper );
+            const U8 *e      = uc + STR_LEN( noper );
            U32 state        = 1;         /* required init */
            U16 charid       = 0;         /* sanity init */
            U8 *scan         = (U8*)NULL; /* sanity init */
@@ -1761,6 +1770,15 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
             STRLEN skiplen   = 0;
 
+            if (OP(noper) == NOTHING) {
+                regnode *noper_next= regnext(noper);
+                if (noper_next != tail && OP(noper_next) == flags) {
+                    noper = noper_next;
+                    uc= (U8*)STRING(noper);
+                    e= uc + STR_LEN(noper);
+                }
+            }
+
             if (OP(noper) != NOTHING) {
                 for ( ; uc < e ; uc += len ) {
 
@@ -1948,9 +1966,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
 
-           regnode * const noper   = NEXTOPER( cur );
+            regnode *noper   = NEXTOPER( cur );
            const U8 *uc     = (U8*)STRING( noper );
-           const U8 * const e = uc + STR_LEN( noper );
+            const U8 *e      = uc + STR_LEN( noper );
 
             U32 state        = 1;         /* required init */
 
@@ -1963,6 +1981,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             STRLEN skiplen   = 0;
             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
 
+            if (OP(noper) == NOTHING) {
+                regnode *noper_next= regnext(noper);
+                if (noper_next != tail && OP(noper_next) == flags) {
+                    noper = noper_next;
+                    uc= (U8*)STRING(noper);
+                    e= uc + STR_LEN(noper);
+                }
+            }
 
             if ( OP(noper) != NOTHING ) {
                 for ( ; uc < e ; uc += len ) {
@@ -3237,7 +3263,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         }
 
                         
-                        DEBUG_OPTIMISE_r({
+                        DEBUG_TRIE_COMPILE_r({
                             regprop(RExC_rx, mysv, tail );
                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
                                 (int)depth * 2 + 2, "", 
@@ -3303,9 +3329,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                             U8 noper_trietype = TRIE_TYPE( noper_type );
 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
                             regnode * const noper_next = regnext( noper );
+                           U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
+                           U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
 #endif
 
-                            DEBUG_OPTIMISE_r({
+                            DEBUG_TRIE_COMPILE_r({
                                 regprop(RExC_rx, mysv, cur);
                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
@@ -3319,8 +3347,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
                                     SvPV_nolen_const(mysv));
                                 }
-                                PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
-                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
+                                PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
+                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                                  PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] 
+                               );
                             });
 
                             /* Is noper a trieable nodetype that can be merged with the
@@ -3328,22 +3358,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                             if ( noper_trietype
                                   &&
                                   (
-                                        /* XXX: Currently we cannot allow a NOTHING node to be the first element
-                                         * of a TRIEABLE sequence, Otherwise we will overwrite the regop following
-                                         * the NOTHING with the TRIE regop later on. This is because a NOTHING node
-                                         * is only one regnode wide, and a TRIE is two regnodes. An example of a
-                                         * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
-                                         * At a later point of time we can somewhat workaround this by handling
-                                         * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
-                                         * as we can effectively ignore the NOTHING regop in that case.
-                                         * This clause, which allows NOTHING to start a sequence is left commented
-                                         * out as a reference.
-                                         * - Yves
-
-                                           ( noper_trietype == NOTHING)
-                                           || ( trietype == NOTHING )
-                                        */
-                                        ( noper_trietype == NOTHING && trietype )
+                                        ( noper_trietype == NOTHING)
+                                        || ( trietype == NOTHING )
                                         || ( trietype == noper_trietype )
                                   )
 #ifdef NOJUMPTRIE
@@ -3355,15 +3371,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  * Either we are the first node in a new trieable sequence,
                                  * in which case we do some bookkeeping, otherwise we update
                                  * the end pointer. */
-                                count++;
                                 if ( !first ) {
                                     first = cur;
                                     trietype = noper_trietype;
+                                   if ( noper_trietype == NOTHING ) {
+#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
+                                       regnode * const noper_next = regnext( noper );
+                                        U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
+                                       U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
+#endif
+
+                                        if ( noper_next_trietype )
+                                           trietype = noper_next_trietype;
+                                    }
                                 } else {
                                     if ( trietype == NOTHING )
                                         trietype = noper_trietype;
                                     last = cur;
                                 }
+                               if (first)
+                                   count++;
                             } /* end handle mergable triable node */
                             else {
                                 /* handle unmergable node -
@@ -3399,31 +3426,53 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 }
                             } /* end handle unmergable node */
                         } /* loop over branches */
-                        DEBUG_OPTIMISE_r({
+                        DEBUG_TRIE_COMPILE_r({
                             regprop(RExC_rx, mysv, cur);
                             PerlIO_printf( Perl_debug_log,
                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        if ( last && trietype != NOTHING ) {
-                            /* the last branch of the sequence was part of a trie,
-                             * so we have to construct it here outside of the loop
-                             */
-                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
+                        if ( last ) {
+                            if ( trietype != NOTHING ) {
+                                /* the last branch of the sequence was part of a trie,
+                                 * so we have to construct it here outside of the loop
+                                 */
+                                made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
 #ifdef TRIE_STUDY_OPT
-                            if ( ((made == MADE_EXACT_TRIE && 
-                                 startbranch == first) 
-                                 || ( first_non_open == first )) && 
-                                 depth==0 ) {
-                                flags |= SCF_TRIE_RESTUDY;
-                                if ( startbranch == first 
-                                     && scan == tail ) 
-                                {
-                                    RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+                                if ( ((made == MADE_EXACT_TRIE &&
+                                     startbranch == first)
+                                     || ( first_non_open == first )) &&
+                                     depth==0 ) {
+                                    flags |= SCF_TRIE_RESTUDY;
+                                    if ( startbranch == first
+                                         && scan == tail )
+                                    {
+                                        RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+                                    }
                                 }
-                            }
 #endif
+                            } else {
+                                /* at this point we know whatever we have is a NOTHING sequence/branch
+                                 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
+                                 */
+                                if ( startbranch == first ) {
+                                    regnode *opt;
+                                    /* the entire thing is a NOTHING sequence, something like this:
+                                     * (?:|) So we can turn it into a plain NOTHING op. */
+                                    DEBUG_TRIE_COMPILE_r({
+                                        regprop(RExC_rx, mysv, cur);
+                                        PerlIO_printf( Perl_debug_log,
+                                          "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
+                                          "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
+
+                                    });
+                                    OP(startbranch)= NOTHING;
+                                    NEXT_OFF(startbranch)= tail - startbranch;
+                                    for ( opt= startbranch + 1; opt < tail ; opt++ )
+                                        OP(opt)= OPTIMIZED;
+                                }
+                            }
                         } /* end if ( last) */
                     } /* TRIE_MAXBUF is non zero */
                     
@@ -4348,6 +4397,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 /* Lookbehind, or need to calculate parens/evals/stclass: */
                   && (scan->flags || data || (flags & SCF_DO_STCLASS))
                   && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+            if ( OP(scan) == UNLESSM &&
+                 scan->flags == 0 &&
+                 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
+                 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
+            ) {
+                regnode *opt;
+                regnode *upto= regnext(scan);
+                DEBUG_PARSE_r({
+                    SV * const mysv_val=sv_newmortal();
+                    DEBUG_STUDYDATA("OPFAIL",data,depth);
+
+                    /*DEBUG_PARSE_MSG("opfail");*/
+                    regprop(RExC_rx, mysv_val, upto);
+                    PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
+                                  SvPV_nolen_const(mysv_val),
+                                  (IV)REG_NODE_NUM(upto),
+                                  (IV)(upto - scan)
+                    );
+                });
+                OP(scan) = OPFAIL;
+                NEXT_OFF(scan) = upto - scan;
+                for (opt= scan + 1; opt < upto ; opt++)
+                    OP(opt) = OPTIMIZED;
+                scan= upto;
+                continue;
+            }
             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY 
                 || OP(scan) == UNLESSM )
             {
@@ -4512,8 +4587,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         }
                     }
                 }
-
-
            }
 #endif
        }
@@ -5071,11 +5144,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
        * Clever compilers notice this and complain. --jhi */
     REGC((U8)REG_MAGIC, (char*)RExC_emit);
 #endif
-    DEBUG_PARSE_r(
-       PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
-        RExC_lastnum=0;
-        RExC_lastparse=NULL;
-    );
+    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
     if (reg(pRExC_state, 0, &flags,1) == NULL) {
        RExC_precomp = NULL;
        return(NULL);
@@ -7911,7 +7980,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    RExC_parse++;
                }
                if (*RExC_parse != ')') {
-                   RExC_parse = s;
+                   RExC_parse = s;             
                    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
                }
                if (!SIZE_ONLY) {
@@ -8382,9 +8451,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             }
            break;
        }
+        DEBUG_PARSE_r(if (!SIZE_ONLY) {
+            SV * const mysv_val1=sv_newmortal();
+            SV * const mysv_val2=sv_newmortal();
+            DEBUG_PARSE_MSG("lsbr");
+            regprop(RExC_rx, mysv_val1, lastbr);
+            regprop(RExC_rx, mysv_val2, ender);
+            PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+                          SvPV_nolen_const(mysv_val1),
+                          (IV)REG_NODE_NUM(lastbr),
+                          SvPV_nolen_const(mysv_val2),
+                          (IV)REG_NODE_NUM(ender),
+                          (IV)(ender - lastbr)
+            );
+        });
         REGTAIL(pRExC_state, lastbr, ender);
 
        if (have_branch && !SIZE_ONLY) {
+            char is_nothing= 1;
            if (depth==1)
                RExC_seen |= REG_TOP_LEVEL_BRANCHES;
 
@@ -8393,11 +8477,44 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                const U8 op = PL_regkind[OP(br)];
                if (op == BRANCH) {
                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
+                    if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
+                        is_nothing= 0;
                }
                else if (op == BRANCHJ) {
                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
+                    /* for now we always disable this optimisation * /
+                    if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
+                    */
+                        is_nothing= 0;
                }
            }
+            if (is_nothing) {
+                br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
+                DEBUG_PARSE_r(if (!SIZE_ONLY) {
+                    SV * const mysv_val1=sv_newmortal();
+                    SV * const mysv_val2=sv_newmortal();
+                    DEBUG_PARSE_MSG("NADA");
+                    regprop(RExC_rx, mysv_val1, ret);
+                    regprop(RExC_rx, mysv_val2, ender);
+                    PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+                                  SvPV_nolen_const(mysv_val1),
+                                  (IV)REG_NODE_NUM(ret),
+                                  SvPV_nolen_const(mysv_val2),
+                                  (IV)REG_NODE_NUM(ender),
+                                  (IV)(ender - ret)
+                    );
+                });
+                OP(br)= NOTHING;
+                if (OP(ender) == TAIL) {
+                    NEXT_OFF(br)= 0;
+                    RExC_emit= br + 1;
+                } else {
+                    regnode *opt;
+                    for ( opt= br + 1; opt < ender ; opt++ )
+                        OP(opt)= OPTIMIZED;
+                    NEXT_OFF(br)= ender - br;
+                }
+            }
        }
     }
 
@@ -9077,12 +9194,6 @@ tryagain:
        vFAIL("Internal urp");
                                /* Supposed to be caught earlier. */
        break;
-    case '{':
-       if (!regcurly(RExC_parse)) {
-           RExC_parse++;
-           goto defchar;
-       }
-       /* FALL THROUGH */
     case '?':
     case '+':
     case '*':
@@ -9208,9 +9319,6 @@ tryagain:
            ret = reg_node(pRExC_state, op);
            FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
-               ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
-           }
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
@@ -9235,9 +9343,6 @@ tryagain:
            ret = reg_node(pRExC_state, op);
            FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
-               ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
-           }
            goto finish_meta_pat;
        case 's':
            switch (get_regex_charset(RExC_flags)) {
@@ -9744,15 +9849,22 @@ tryagain:
                        /* FALL THROUGH */
                    default:
                        if (!SIZE_ONLY&& isALPHA(*p)) {
-                           /* Include any { following the alpha to emphasize
-                            * that it could be part of an escape at some point
-                            * in the future */
-                           int len = (*(p + 1) == '{') ? 2 : 1;
-                           ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+                           ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
                        }
                        goto normal_default;
                    }
                    break;
+               case '{':
+                   /* Currently we don't warn when the lbrace is at the start
+                    * of a construct.  This catches it in the middle of a
+                    * literal string, or when its the first thing after
+                    * something like "\b" */
+                   if (! SIZE_ONLY
+                       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+                   {
+                       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+                   }
+                   /*FALLTHROUGH*/
                default:
                  normal_default:
                    if (UTF8_IS_START(*p) && UTF) {
@@ -13450,8 +13562,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
  * Local variables:
  * c-indentation-style: bsd
  * c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
  * End:
  *
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
  */