This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add RXf_UNBOUNDED_QUANTIFIER and regexp->maxlen
authorYves Orton <yves.orton@booking.com>
Sun, 2 Feb 2014 15:37:37 +0000 (23:37 +0800)
committerYves Orton <yves.orton@booking.com>
Mon, 3 Feb 2014 14:44:29 +0000 (22:44 +0800)
The flag tells us that a pattern may match an infinitely long string.

The new member in the regexp struct tells us how long the string might
be.

With these two items we can implement regexp based $/

regcomp.c
regcomp.h
regexp.h
regnodes.h

index 884d9cc..8246497 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -159,6 +159,7 @@ struct RExC_state_t {
                                            within pattern */
     int                num_code_blocks;        /* size of code_blocks[] */
     int                code_index;             /* next code_blocks[] slot */
+    SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 #if ADD_TO_REGEXEC
     char       *starttry;              /* -Dr: where regtry was called. */
 #define RExC_starttry  (pRExC_state->starttry)
@@ -196,6 +197,7 @@ struct RExC_state_t {
 #define RExC_sawback   (pRExC_state->sawback)
 #define RExC_seen      (pRExC_state->seen)
 #define RExC_size      (pRExC_state->size)
+#define RExC_maxlen        (pRExC_state->maxlen)
 #define RExC_npar      (pRExC_state->npar)
 #define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen  (pRExC_state->extralen)
@@ -774,6 +776,9 @@ static const scan_data_t zero_scan_data =
             if (RExC_seen & REG_GOSTART_SEEN)                               \
                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
                                                                             \
+            if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
+                PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
+                                                                            \
             PerlIO_printf(Perl_debug_log,"\n");                             \
         });
 
@@ -3671,7 +3676,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge
                                                                 strings after
                                                             this. */
-            if (flags & SCF_DO_STCLASS)
+                if (flags & SCF_DO_STCLASS)
                    ssc_init_zero(pRExC_state, &accum);
 
                while (OP(scan) == code) {
@@ -4494,12 +4499,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                is_inf_internal |= deltanext == SSize_t_MAX
                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
                is_inf |= is_inf_internal;
-               if (is_inf)
+                if (is_inf) {
                    delta = SSize_t_MAX;
-               else
+                } else {
                    delta += (minnext + deltanext) * maxcount
                              - minnext * mincount;
-
+                }
                /* Try powerful optimization CURLYX => CURLYN. */
                if (  OP(oscan) == CURLYX && data
                      && data->flags & SF_IN_PAR
@@ -5366,6 +5371,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 
     *scanp = scan;
     *deltap = is_inf_internal ? SSize_t_MAX : delta;
+
     if (flags & SCF_DO_SUBSTR && is_inf)
        data->pos_delta = SSize_t_MAX - data->pos_min;
     if (is_par > (I32)U8_MAX)
@@ -5385,7 +5391,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
 
     DEBUG_STUDYDATA("post-fin:",data,depth);
 
-    return min < stopmin ? min : stopmin;
+    {
+        SSize_t final_minlen= min < stopmin ? min : stopmin;
+
+        if (RExC_maxlen < final_minlen + delta) {
+            RExC_maxlen = final_minlen + delta;
+        }
+
+        return final_minlen;
+    }
+    /* not-reached */
 }
 
 STATIC U32
@@ -6376,6 +6391,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_sawback = 0;
 
     RExC_seen = 0;
+    RExC_maxlen = 0;
     RExC_in_lookbehind = 0;
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_extralen = 0;
@@ -7017,13 +7033,15 @@ reStudy:
     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
        the "real" pattern. */
     DEBUG_OPTIMISE_r({
-       PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
-                     (IV)minlen, (IV)r->minlen);
+        PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
+                      (IV)minlen, (IV)r->minlen, RExC_maxlen);
     });
     r->minlenret = minlen;
     if (r->minlen < minlen)
         r->minlen = minlen;
 
+
+
     if (RExC_seen & REG_GPOS_SEEN)
         r->intflags |= PREGf_GPOS_SEEN;
     if (RExC_seen & REG_LOOKBEHIND_SEEN)
@@ -7047,6 +7065,9 @@ reStudy:
     else
         RXp_PAREN_NAMES(r) = NULL;
 
+    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)
+        r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
+
     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
      * so it can be used in pp.c */
     if (r->intflags & PREGf_ANCH)
@@ -10466,6 +10487,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                ARG1_SET(ret, (U16)min);
                ARG2_SET(ret, (U16)max);
            }
+            if (max == REG_INFTY)
+                RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
 
            goto nest_check;
        }
@@ -10503,6 +10526,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        reginsert(pRExC_state, STAR, ret, depth+1);
        ret->flags = 0;
        RExC_naughty += 4;
+        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
     }
     else if (op == '*') {
        min = 0;
@@ -10512,6 +10536,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        reginsert(pRExC_state, PLUS, ret, depth+1);
        ret->flags = 0;
        RExC_naughty += 3;
+        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
     }
     else if (op == '+') {
        min = 1;
index 551f00d..832ed3e 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -582,6 +582,8 @@ struct regnode_ssc {
 #define REG_RUN_ON_COMMENT_SEEN             0x00000200
 #define REG_UNFOLDED_MULTI_SEEN             0x00000400
 #define REG_GOSTART_SEEN                    0x00000800
+#define REG_UNBOUNDED_QUANTIFIER_SEEN       0x00001000
+
 
 START_EXTERN_C
 
index a26c4f2..45ba87a 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -128,6 +128,7 @@ struct reg_code_block {
        SSize_t suboffset; /* byte offset of subbeg from logical start of str */ \
        SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */ \
        /* Information about the match that isn't often used */         \
+        SSize_t maxlen;        /* mininum possible number of chars in string to match */\
        /* offset from wrapped to the start of precomp */               \
        PERL_BITFIELD32 pre_prefix:4;                                   \
         /* original flags used to compile the pattern, may differ */    \
@@ -398,7 +399,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp)
 #define RXf_UNUSED8             (1<<(RXf_BASE_SHIFT+8))
 
 /* Special */
-#define RXf_UNUSED9             (1<<(RXf_BASE_SHIFT+9))
+#define RXf_UNBOUNDED_QUANTIFIER_SEEN   (1<<(RXf_BASE_SHIFT+9))
 #define RXf_CHECK_ALL          (1<<(RXf_BASE_SHIFT+10))
 
 /* UTF8 related */
index 088e5a6..f9d4fc0 100644 (file)
@@ -659,7 +659,7 @@ EXTCONST char * const PL_reg_extflags_name[] = {
        "NO_INPLACE_SUBST", /* 0x00008000 */
        "EVAL_SEEN",        /* 0x00010000 */
        "UNUSED8",          /* 0x00020000 */
-       "UNUSED9",          /* 0x00040000 */
+       "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00040000 */
        "CHECK_ALL",        /* 0x00080000 */
        "MATCH_UTF8",       /* 0x00100000 */
        "USE_INTUIT_NOML",  /* 0x00200000 */