This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fwd: CPAN Upload: J/JP/JPEACOCK/version-0.70.tar.gz
[perl5.git] / regcomp.c
index db25fb2..5f1efdc 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -662,7 +662,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
     }
     data->last_end = -1;
     data->flags &= ~SF_BEFORE_EOL;
-    DEBUG_STUDYDATA("cl_anything: ",data,0);
+    DEBUG_STUDYDATA("commit: ",data,0);
 }
 
 /* Can match anything (initialization) */
@@ -2050,6 +2050,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         /* needed for dumping*/
         DEBUG_r(if (optimize) {
             regnode *opt = convert;
+
             while ( ++opt < optimize) {
                 Set_Node_Offset_Length(opt,0,0);
             }
@@ -4139,8 +4140,50 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
     r->engine= RE_ENGINE_PTR;
     r->refcnt = 1;
     r->prelen = xend - exp;
-    r->precomp = savepvn(RExC_precomp, r->prelen);
     r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
+    {
+        bool has_k     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+       bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+       bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
+       U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
+       const char *fptr = STD_PAT_MODS;        /*"msix"*/
+       char *p;
+        r->wraplen = r->prelen + has_minus + has_k + has_runon
+            + (sizeof(STD_PAT_MODS) - 1)
+            + (sizeof("(?:)") - 1);
+
+        Newx(r->wrapped, r->wraplen, char );
+        p = r->wrapped;
+        *p++='('; *p++='?';
+        if (has_k)
+            *p++ = KEEPCOPY_PAT_MOD; /*'k'*/
+        {
+            char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
+            char *colon = r + 1;
+            char ch;
+
+            while((ch = *fptr++)) {
+                if(reganch & 1)
+                    *p++ = ch;
+                else
+                    *r-- = ch;
+                reganch >>= 1;
+            }
+            if(has_minus) {
+                *r = '-';
+                p = colon;
+            }
+        }
+
+        *p++=':';
+        Copy(RExC_precomp, p, r->prelen, char);
+        r->precomp = p;
+        p += r->prelen;
+        if (has_runon)
+            *p++='\n';
+        *p=')';
+    }
+
     r->intflags = 0;
     r->nparens = RExC_npar - 1;        /* set early to validate backrefs */
     
@@ -4669,8 +4712,9 @@ Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flag
                 SV* sv_dat=HeVAL(he_str);
                 I32 *nums=(I32*)SvPVX(sv_dat);
                 for ( i=0; i<SvIVX(sv_dat); i++ ) {
-                    if ((I32)(rx->lastparen) >= nums[i] &&
-                        rx->endp[nums[i]] != -1) 
+                    if ((I32)(rx->nparens) >= nums[i]
+                        && rx->startp[nums[i]] != -1
+                        && rx->endp[nums[i]] != -1)
                     {
                         ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
                         if (!retarray) 
@@ -4904,6 +4948,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     const I32 oregflags = RExC_flags;
     bool have_branch = 0;
     bool is_open = 0;
+    I32 freeze_paren = 0;
+    I32 after_freeze = 0;
 
     /* for (?g), (?gc), and (?o) warnings; warning
        about (?c) will warn about (?g) -- japhy    */
@@ -5074,8 +5120,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     nextchar(pRExC_state);
                     return ret;
                 }
-                goto unknown;
-           case '<':           /* (?<...) */
+                RExC_parse++;
+               vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+               /*NOTREACHED*/
+            case '<':           /* (?<...) */
                if (*RExC_parse == '!')
                    paren = ',';
                else if (*RExC_parse != '=') 
@@ -5090,8 +5138,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        SIZE_ONLY ?  /* reverse test from the others */
                        REG_RSN_RETURN_NAME : 
                        REG_RSN_RETURN_NULL);
-                   if (RExC_parse == name_start)
-                       goto unknown;
+                   if (RExC_parse == name_start) {
+                       RExC_parse++;
+                       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+                       /*NOTREACHED*/
+                    }
                    if (*RExC_parse != paren)
                        vFAIL2("Sequence (?%c... not terminated",
                            paren=='>' ? '<' : paren);
@@ -5117,11 +5168,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                             Perl_croak(aTHX_
                                 "panic: paren_name hash element allocation failed");
                         } else if ( SvPOK(sv_dat) ) {
-                            IV count=SvIV(sv_dat);
-                            I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
-                            SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
-                            pv[count]=RExC_npar;
-                            SvIVX(sv_dat)++;
+                            /* (?|...) can mean we have dupes so scan to check
+                               its already been stored. Maybe a flag indicating
+                               we are inside such a construct would be useful,
+                               but the arrays are likely to be quite small, so
+                               for now we punt -- dmq */
+                            IV count = SvIV(sv_dat);
+                            I32 *pv = (I32*)SvPVX(sv_dat);
+                            IV i;
+                            for ( i = 0 ; i < count ; i++ ) {
+                                if ( pv[i] == RExC_npar ) {
+                                    count = 0;
+                                    break;
+                                }
+                            }
+                            if ( count ) {
+                                pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
+                                SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
+                                pv[count] = RExC_npar;
+                                SvIVX(sv_dat)++;
+                            }
                         } else {
                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
@@ -5149,6 +5215,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    nextchar(pRExC_state);
                    return ret;
                }
+               break;
+           case '|':           /* (?|...) */
+               /* branch reset, behave like a (?:...) except that
+                  buffers in alternations share the same numbers */
+               paren = ':'; 
+               after_freeze = freeze_paren = RExC_npar;
+               break;
            case ':':           /* (?:...) */
            case '>':           /* (?>...) */
                break;
@@ -5260,8 +5333,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                /* FALL THROUGH*/
            case '?':           /* (??...) */
                is_logical = 1;
-               if (*RExC_parse != '{')
-                   goto unknown;
+               if (*RExC_parse != '{') {
+                   RExC_parse++;
+                   vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+                   /*NOTREACHED*/
+               }
                paren = *RExC_parse++;
                /* FALL THROUGH */
            case '{':           /* (?{...}) */
@@ -5507,8 +5583,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         }
                        break;
                     case '-':
-                        if (flagsp == &negflags)
-                            goto unknown;
+                        if (flagsp == &negflags) {
+                            RExC_parse++;
+                           vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+                           /*NOTREACHED*/
+                       }
                        flagsp = &negflags;
                        wastedflags = 0;  /* reset so (?g-c) warns twice */
                        break;
@@ -5528,7 +5607,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        }
                        /*NOTREACHED*/
                     default:
-                    unknown:
                        RExC_parse++;
                        vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
                        /*NOTREACHED*/
@@ -5546,7 +5624,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            if (!SIZE_ONLY ){
                if (!RExC_nestroot) 
                    RExC_nestroot = parno;
-               if (RExC_seen & REG_SEEN_RECURSE) {
+               if (RExC_seen & REG_SEEN_RECURSE
+                   && !RExC_open_parens[parno-1])
+               {
                    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                        "Setting open paren #%"IVdf" to %d\n", 
                        (IV)parno, REG_NODE_NUM(ret)));
@@ -5600,6 +5680,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
        if (SIZE_ONLY)
            RExC_extralen += 2;         /* Account for LONGJMP. */
        nextchar(pRExC_state);
+       if (freeze_paren) {
+           if (RExC_npar > after_freeze)
+               after_freeze = RExC_npar;
+            RExC_npar = freeze_paren;      
+        }
         br = regbranch(pRExC_state, &flags, 0, depth+1);
 
        if (br == NULL)
@@ -5701,7 +5786,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            FAIL("Junk on end of regexp");      /* "Can't happen". */
        /* NOTREACHED */
     }
-
+    if (after_freeze)
+        RExC_npar = after_freeze;
     return(ret);
 }
 
@@ -6643,9 +6729,7 @@ tryagain:
 
     case '#':
        if (RExC_flags & RXf_PMf_EXTENDED) {
-           while (RExC_parse < RExC_end && *RExC_parse != '\n')
-               RExC_parse++;
-           if (RExC_parse < RExC_end)
+           if ( reg_skipcomment( pRExC_state ) )
                goto tryagain;
        }
        /* FALL THROUGH */
@@ -6674,7 +6758,7 @@ tryagain:
                char * const oldp = p;
 
                if (RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite(p, RExC_end);
+                   p = regwhite( pRExC_state, p );
                switch (*p) {
                case '^':
                case '$':
@@ -6822,13 +6906,13 @@ tryagain:
                        ender = *p++;
                    break;
                }
-               if (RExC_flags & RXf_PMf_EXTENDED)
-                   p = regwhite(p, RExC_end);
+               if ( RExC_flags & RXf_PMf_EXTENDED)
+                   p = regwhite( pRExC_state, p );
                if (UTF && FOLD) {
                    /* Prime the casefolded buffer. */
                    ender = toFOLD_uni(ender, tmpbuf, &foldlen);
                }
-               if (ISMULT2(p)) { /* Back off on ?+*. */
+               if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
                    else if (UTF) {
@@ -6930,15 +7014,22 @@ tryagain:
 }
 
 STATIC char *
-S_regwhite(char *p, const char *e)
+S_regwhite( RExC_state_t *pRExC_state, char *p )
 {
+    const char *e = RExC_end;
     while (p < e) {
        if (isSPACE(*p))
            ++p;
        else if (*p == '#') {
+            bool ended = 0;
            do {
-               p++;
-           } while (p < e && *p != '\n');
+               if (*p++ == '\n') {
+                   ended = 1;
+                   break;
+               }
+           } while (p < e);
+           if (!ended)
+               RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
        }
        else
            break;
@@ -7720,6 +7811,49 @@ parseit:
 #undef _C_C_T_
 
 
+/* reg_skipcomment()
+
+   Absorbs an /x style # comments from the input stream.
+   Returns true if there is more text remaining in the stream.
+   Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
+   terminates the pattern without including a newline.
+
+   Note its the callers responsibility to ensure that we are
+   actually in /x mode
+
+*/
+
+STATIC bool
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+{
+    bool ended = 0;
+    while (RExC_parse < RExC_end)
+        if (*RExC_parse++ == '\n') {
+            ended = 1;
+            break;
+        }
+    if (!ended) {
+        /* we ran off the end of the pattern without ending
+           the comment, so we have to add an \n when wrapping */
+        RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
+        return 0;
+    } else
+        return 1;
+}
+
+/* nextchar()
+
+   Advance that parse position, and optionall absorbs
+   "whitespace" from the inputstream.
+
+   Without /x "whitespace" means (?#...) style comments only,
+   with /x this means (?#...) and # comments and whitespace proper.
+
+   Returns the RExC_parse point from BEFORE the scan occurs.
+
+   This is the /x friendly way of saying RExC_parse++.
+*/
+
 STATIC char*
 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 {
@@ -7742,9 +7876,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
                continue;
            }
            else if (*RExC_parse == '#') {
-               while (RExC_parse < RExC_end)
-                   if (*RExC_parse++ == '\n') break;
-               continue;
+               if ( reg_skipcomment( pRExC_state ) )
+                   continue;
            }
        }
        return retval;
@@ -8513,10 +8646,6 @@ Perl_pregfree(pTHX_ struct regexp *r)
        return;
        
     CALLREGFREE_PVT(r); /* free the private data */
-    
-    /* gcov results gave these as non-null 100% of the time, so there's no
-       optimisation in checking them before calling Safefree  */
-    Safefree(r->precomp);
     RX_MATCH_COPY_FREE(r);
 #ifdef PERL_OLD_COPY_ON_WRITE
     if (r->saved_copy)
@@ -8534,8 +8663,8 @@ Perl_pregfree(pTHX_ struct regexp *r)
        Safefree(r->substrs);
     }
     if (r->paren_names)
-            SvREFCNT_dec(r->paren_names);
-    
+        SvREFCNT_dec(r->paren_names);
+    Safefree(r->wrapped);
     Safefree(r->startp);
     Safefree(r->endp);
     Safefree(r);
@@ -8727,11 +8856,14 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
     } else 
         ret->substrs = NULL;    
 
-    ret->precomp        = SAVEPVN(r->precomp, r->prelen);
+    ret->wrapped        = SAVEPVN(r->wrapped, r->wraplen);
+    ret->precomp        = ret->wrapped + (r->precomp - r->wrapped);
+    ret->prelen         = r->prelen;
+    ret->wraplen        = r->wraplen;
+
     ret->refcnt         = r->refcnt;
     ret->minlen         = r->minlen;
     ret->minlenret      = r->minlenret;
-    ret->prelen         = r->prelen;
     ret->nparens        = r->nparens;
     ret->lastparen      = r->lastparen;
     ret->lastcloseparen = r->lastcloseparen;
@@ -8798,8 +8930,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
         reti->swap = NULL;
     }
 
-
     reti->regstclass = NULL;
+
     if (ri->data) {
        struct reg_data *d;
         const int count = ri->data->count;
@@ -8904,83 +9036,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
     
  */
 #ifndef PERL_IN_XSUB_RE
+
 char *
 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
     dVAR;
     const regexp * const re = (regexp *)mg->mg_obj;
-
-    if (!mg->mg_ptr) {
-       const char *fptr = STD_PAT_MODS;        /*"msix"*/
-       char reflags[7];
-       char ch;
-       bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
-       bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
-        U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
-        bool need_newline = 0;
-        int left = 0;
-       int right = 4 + hask;
-        if (hask) 
-            reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
-       while((ch = *fptr++)) {
-           if(reganch & 1) {
-               reflags[left++] = ch;
-           }
-           else {
-               reflags[right--] = ch;
-           }
-           reganch >>= 1;
-       }
-       if(hasm) {
-           reflags[left] = '-';
-           left = 5 + hask;
-       }
-        /* printf("[%*.7s]\n",left,reflags); */
-       mg->mg_len = re->prelen + 4 + left;
-       /*
-        * If /x was used, we have to worry about a regex ending with a
-        * comment later being embedded within another regex. If so, we don't
-        * want this regex's "commentization" to leak out to the right part of
-        * the enclosing regex, we must cap it with a newline.
-        *
-        * So, if /x was used, we scan backwards from the end of the regex. If
-        * we find a '#' before we find a newline, we need to add a newline
-        * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
-        * we don't need to add anything.  -jfriedl
-        */
-       if (PMf_EXTENDED & re->extflags) {
-           const char *endptr = re->precomp + re->prelen;
-           while (endptr >= re->precomp) {
-               const char c = *(endptr--);
-               if (c == '\n')
-                   break; /* don't need another */
-               if (c == '#') {
-                   /* we end while in a comment, so we need a newline */
-                   mg->mg_len++; /* save space for it */
-                   need_newline = 1; /* note to add it */
-                   break;
-               }
-           }
-       }
-
-       Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
-       mg->mg_ptr[0] = '(';
-       mg->mg_ptr[1] = '?';
-       Copy(reflags, mg->mg_ptr+2, left, char);
-       *(mg->mg_ptr+left+2) = ':';
-       Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
-       if (need_newline)
-           mg->mg_ptr[mg->mg_len - 2] = '\n';
-       mg->mg_ptr[mg->mg_len - 1] = ')';
-       mg->mg_ptr[mg->mg_len] = 0;
-    }
     if (haseval) 
         *haseval = re->seen_evals;
     if (flags)    
        *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
-    
     if (lp)
-       *lp = mg->mg_len;
-    return mg->mg_ptr;
+       *lp = re->wraplen;
+    return re->wrapped;
 }
 
 /*