This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
New Test for CGI::Push
[perl5.git] / regexec.c
index b5f8f47..7b9f1c4 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #define PERL_IN_REGEXEC_C
 #include "perl.h"
 
-#ifdef PERL_IN_XSUB_RE
-#  if defined(PERL_CAPI) || defined(PERL_OBJECT)
-#    include "XSUB.h"
-#  endif
-#endif
-
 #include "regcomp.h"
 
 #define RF_tainted     1               /* tainted information used? */
  */
 
 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
+#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 
 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
-#define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
-#define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
+#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
+#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
 #define HOPc(pos,off) ((char*)HOP(pos,off))
 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
 
 #define HOPBACK(pos, off) (            \
-    (UTF && DO_UTF8(PL_reg_sv))                \
+    (UTF && PL_reg_match_utf8)         \
        ? reghopmaybe((U8*)pos, -off)   \
     : (pos - off >= PL_bostr)          \
        ? (U8*)(pos - off)              \
 
 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
-#define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
-#define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
 
 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END
 
-static void restore_pos(pTHXo_ void *arg);
+/* for use after a quantifier and before an EXACT-like node -- japhy */
+#define NEXT_IMPT(to_rn) STMT_START { \
+    while (OP(to_rn) == OPEN || OP(to_rn) == CLOSE || OP(to_rn) == EVAL) \
+       to_rn += NEXT_OFF(to_rn); \
+} STMT_END 
+
+static void restore_pos(pTHX_ void *arg);
 
 STATIC CHECKPOINT
 S_regcppush(pTHX_ I32 parenfloor)
@@ -414,7 +414,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
              goto fail;
          }
          if (prog->check_offset_min == prog->check_offset_max &&
-             !(prog->reganch & ROPT_SANY_SEEN)) {
+             !(prog->reganch & ROPT_CANY_SEEN)) {
            /* Substring at constant offset from beg-of-str... */
            I32 slen;
 
@@ -490,7 +490,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        if (data)
            *data->scream_olds = s;
     }
-    else if (prog->reganch & ROPT_SANY_SEEN)
+    else if (prog->reganch & ROPT_CANY_SEEN)
        s = fbm_instr((U8*)(s + start_shift),
                      (U8*)(strend - end_shift),
                      check, PL_multiline ? FBMrf_MULTILINE : 0);
@@ -776,7 +776,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            PL_regdata = prog->data;
            PL_bostr = startpos;
        }
-        s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
+       s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
        if (!s) {
 #ifdef DEBUGGING
            char *what = 0;
@@ -878,7 +878,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
        unsigned int c2;
        char *e;
        register I32 tmp = 1;   /* Scratch variable? */
-       register bool do_utf8 = DO_UTF8(PL_reg_sv);
+       register bool do_utf8 = PL_reg_match_utf8;
 
        /* We know what class it must start with. */
        switch (OP(c)) {
@@ -895,6 +895,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                s += do_utf8 ? UTF8SKIP(s) : 1;
            }
            break;
+       case CANY:
+           while (s < strend) {
+               if (tmp && (norun || regtry(prog, s)))
+                   goto got_it;
+               else
+                   tmp = doevery;
+               s++;
+           }
+           break;
        case EXACTF:
            m = STRING(c);
            ln = STR_LEN(c);
@@ -1440,12 +1449,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     minlen = prog->minlen;
-    if (do_utf8) {
-      if (!(prog->reganch & ROPT_SANY_SEEN))
+    if (do_utf8 && !(prog->reganch & ROPT_CANY_SEEN)) {
         if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
     }
     else {
-      if (strend - startpos < minlen) goto phooey;
+        if (strend - startpos < minlen) goto phooey;
     }
 
     /* Check validity of program. */
@@ -1488,7 +1496,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                  && mg->mg_len >= 0) {
            PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
            if (prog->reganch & ROPT_ANCH_GPOS) {
-               if (s > PL_reg_ganch)
+               if (s > PL_reg_ganch)
                    goto phooey;
                s = PL_reg_ganch;
            }
@@ -1767,7 +1775,7 @@ got_it:
            sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
                                                  restored, the value remains
                                                  the same. */
-       restore_pos(aTHXo_ 0);
+       restore_pos(aTHX_ 0);
     }
 
     /* make sure $`, $&, $', and $digit will work later */
@@ -1796,7 +1804,7 @@ phooey:
     DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
                          PL_colors[4],PL_colors[5]));
     if (PL_reg_eval_set)
-       restore_pos(aTHXo_ 0);
+       restore_pos(aTHX_ 0);
     return 0;
 }
 
@@ -1833,7 +1841,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos)
        if (PL_reg_sv) {
            /* Make $_ available to executed code. */
            if (PL_reg_sv != DEFSV) {
-               /* SAVE_DEFSV does *not* suffice here for USE_THREADS */
+               /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
                SAVESPTR(DEFSV);
                DEFSV = PL_reg_sv;
            }
@@ -1850,8 +1858,19 @@ S_regtry(pTHX_ regexp *prog, char *startpos)
            PL_reg_oldpos   = mg->mg_len;
            SAVEDESTRUCTOR_X(restore_pos, 0);
         }
-       if (!PL_reg_curpm)
+        if (!PL_reg_curpm) {
            Newz(22,PL_reg_curpm, 1, PMOP);
+#ifdef USE_ITHREADS
+            {
+                SV* repointer = newSViv(0);
+                /* so we know which PL_regex_padav element is PL_reg_curpm */
+                SvFLAGS(repointer) |= SVf_BREAK;
+                av_push(PL_regex_padav,repointer);
+                PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
+                PL_regex_pad = AvARRAY(PL_regex_padav);
+            }
+#endif      
+        }
        PM_SETRE(PL_reg_curpm, prog);
        PL_reg_oldcurpm = PL_curpm;
        PL_curpm = PL_reg_curpm;
@@ -1948,6 +1967,16 @@ typedef union re_unwind_t {
     re_unwind_branch_t branch;
 } re_unwind_t;
 
+#define sayYES goto yes
+#define sayNO goto no
+#define sayYES_FINAL goto yes_final
+#define sayYES_LOUD  goto yes_loud
+#define sayNO_FINAL  goto no_final
+#define sayNO_SILENT goto do_no
+#define saySAME(x) if (x) goto yes; else goto no
+
+#define REPORT_CODE_OFF 24
+
 /*
  - regmatch - main matching routine
  *
@@ -1980,7 +2009,7 @@ S_regmatch(pTHX_ regnode *prog)
 #if 0
     I32 firstcp = PL_savestack_ix;
 #endif
-    register bool do_utf8 = DO_UTF8(PL_reg_sv);
+    register bool do_utf8 = PL_reg_match_utf8;
 
 #ifdef DEBUGGING
     PL_regindent++;
@@ -1990,25 +2019,7 @@ S_regmatch(pTHX_ regnode *prog)
     nextchr = UCHARAT(locinput);
     scan = prog;
     while (scan != NULL) {
-#define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO)
-#if 1
-#  define sayYES goto yes
-#  define sayNO goto no
-#  define sayYES_FINAL goto yes_final
-#  define sayYES_LOUD  goto yes_loud
-#  define sayNO_FINAL  goto no_final
-#  define sayNO_SILENT goto do_no
-#  define saySAME(x) if (x) goto yes; else goto no
-#  define REPORT_CODE_OFF 24
-#else
-#  define sayYES return 1
-#  define sayNO return 0
-#  define sayYES_FINAL return 1
-#  define sayYES_LOUD  return 1
-#  define sayNO_FINAL  return 0
-#  define sayNO_SILENT return 0
-#  define saySAME(x) return x
-#endif
+
        DEBUG_r( {
            SV *prop = sv_newmortal();
            int docolor = *PL_colors[0];
@@ -2104,6 +2115,18 @@ S_regmatch(pTHX_ regnode *prog)
        case SANY:
            if (!nextchr && locinput >= PL_regeol)
                sayNO;
+           if (do_utf8) {
+               locinput += PL_utf8skip[nextchr];
+               if (locinput > PL_regeol)
+                   sayNO;
+               nextchr = UCHARAT(locinput);
+           }
+           else
+               nextchr = UCHARAT(++locinput);
+           break;
+       case CANY:
+           if (!nextchr && locinput >= PL_regeol)
+               sayNO;
            nextchr = UCHARAT(++locinput);
            break;
        case REG_ANY:
@@ -2602,6 +2625,7 @@ S_regmatch(pTHX_ regnode *prog)
                    /* XXXX This is too dramatic a measure... */
                    PL_reg_maxiter = 0;
 
+                   logical = 0;
                    sayNO;
                }
                sw = SvTRUE(ret);
@@ -3009,21 +3033,39 @@ S_regmatch(pTHX_ regnode *prog)
                minmod = 0;
                if (ln && regrepeat_hard(scan, ln, &l) < ln)
                    sayNO;
-               if (ln && l == 0 && n >= ln
-                   /* In fact, this is tricky.  If paren, then the
-                      fact that we did/didnot match may influence
-                      future execution. */
-                   && !(paren && ln == 0))
-                   ln = n;
+               /* if we matched something zero-length we don't need to
+                  backtrack - capturing parens are already defined, so
+                  the caveat in the maximal case doesn't apply
+
+                  XXXX if ln == 0, we can redo this check first time
+                  through the following loop
+               */
+               if (ln && l == 0)
+                   n = ln;     /* don't backtrack */
                locinput = PL_reginput;
-               if (PL_regkind[(U8)OP(next)] == EXACT) {
-                   c1 = (U8)*STRING(next);
-                   if (OP(next) == EXACTF)
-                       c2 = PL_fold[c1];
-                   else if (OP(next) == EXACTFL)
-                       c2 = PL_fold_locale[c1];
-                   else
-                       c2 = c1;
+               if (
+                   PL_regkind[(U8)OP(next)] == EXACT ||
+                   OP(next) == OPEN ||
+                   OP(next) == CLOSE ||
+                   OP(next) == EVAL
+               ) {
+                   regnode *text_node = next;
+
+                   if (PL_regkind[(U8)OP(next)] != EXACT)
+                       NEXT_IMPT(text_node);
+
+                   if (PL_regkind[(U8)OP(text_node)] != EXACT) {
+                       c1 = c2 = -1000;
+                   }
+                   else {
+                       c1 = (U8)*STRING(text_node);
+                       if (OP(next) == EXACTF)
+                           c2 = PL_fold[c1];
+                       else if (OP(text_node) == EXACTFL)
+                           c2 = PL_fold_locale[c1];
+                       else
+                           c2 = c1;
+                   }
                }
                else
                    c1 = c2 = -1000;
@@ -3036,7 +3078,7 @@ S_regmatch(pTHX_ regnode *prog)
                        UCHARAT(PL_reginput) == c2)
                    {
                        if (paren) {
-                           if (n) {
+                           if (ln) {
                                PL_regstartp[paren] =
                                    HOPc(PL_reginput, -l) - PL_bostr;
                                PL_regendp[paren] = PL_reginput - PL_bostr;
@@ -3060,12 +3102,13 @@ S_regmatch(pTHX_ regnode *prog)
            }
            else {
                n = regrepeat_hard(scan, n, &l);
-               if (n != 0 && l == 0
-                   /* In fact, this is tricky.  If paren, then the
-                      fact that we did/didnot match may influence
-                      future execution. */
-                   && !(paren && ln == 0))
-                   ln = n;
+               /* if we matched something zero-length we don't need to
+                  backtrack, unless the minimum count is zero and we
+                  are capturing the result - in that case the capture
+                  being defined or not may affect later execution
+               */
+               if (n != 0 && l == 0 && !(paren && ln == 0))
+                   ln = n;     /* don't backtrack */
                locinput = PL_reginput;
                DEBUG_r(
                    PerlIO_printf(Perl_debug_log,
@@ -3074,14 +3117,29 @@ S_regmatch(pTHX_ regnode *prog)
                                  (IV) n, (IV)l)
                    );
                if (n >= ln) {
-                   if (PL_regkind[(U8)OP(next)] == EXACT) {
-                       c1 = (U8)*STRING(next);
-                       if (OP(next) == EXACTF)
-                           c2 = PL_fold[c1];
-                       else if (OP(next) == EXACTFL)
-                           c2 = PL_fold_locale[c1];
-                       else
-                           c2 = c1;
+                   if (
+                       PL_regkind[(U8)OP(next)] == EXACT ||
+                       OP(next) == OPEN ||
+                       OP(next) == CLOSE ||
+                       OP(next) == EVAL
+                   ) {
+                       regnode *text_node = next;
+
+                       if (PL_regkind[(U8)OP(next)] != EXACT)
+                           NEXT_IMPT(text_node);
+
+                       if (PL_regkind[(U8)OP(text_node)] != EXACT) {
+                           c1 = c2 = -1000;
+                       }
+                       else {
+                           c1 = (U8)*STRING(text_node);
+                           if (OP(text_node) == EXACTF)
+                               c2 = PL_fold[c1];
+                           else if (OP(text_node) == EXACTFL)
+                               c2 = PL_fold_locale[c1];
+                           else
+                               c2 = c1;
+                       }
                    }
                    else
                        c1 = c2 = -1000;
@@ -3151,22 +3209,46 @@ S_regmatch(pTHX_ regnode *prog)
            * Lookahead to avoid useless match attempts
            * when we know what character comes next.
            */
-           if (PL_regkind[(U8)OP(next)] == EXACT) {
-               U8 *s = (U8*)STRING(next);
-               if (!UTF) {
-                   c2 = c1 = *s;
-                   if (OP(next) == EXACTF)
-                       c2 = PL_fold[c1];
-                   else if (OP(next) == EXACTFL)
-                       c2 = PL_fold_locale[c1];
-               }
-               else { /* UTF */
-                   if (OP(next) == EXACTF) {
-                       c1 = to_utf8_lower(s);
-                       c2 = to_utf8_upper(s);
+
+           /*
+           * Used to only do .*x and .*?x, but now it allows
+           * for )'s, ('s and (?{ ... })'s to be in the way
+           * of the quantifier and the EXACT-like node.  -- japhy
+           */
+
+           if (
+               PL_regkind[(U8)OP(next)] == EXACT ||
+               OP(next) == OPEN ||
+               OP(next) == CLOSE ||
+               OP(next) == EVAL
+           ) {
+               U8 *s;
+               regnode *text_node = next;
+
+               if (PL_regkind[(U8)OP(next)] != EXACT)
+                   NEXT_IMPT(text_node);
+
+               if (PL_regkind[(U8)OP(text_node)] != EXACT) {
+                   c1 = c2 = -1000;
+               }
+               else {
+                   s = (U8*)STRING(text_node);
+
+                   if (!UTF) {
+                       c2 = c1 = *s;
+                       if (OP(text_node) == EXACTF)
+                           c2 = PL_fold[c1];
+                       else if (OP(text_node) == EXACTFL)
+                           c2 = PL_fold_locale[c1];
                    }
-                   else {
-                       c2 = c1 = utf8_to_uvchr(s, NULL);
+                   else { /* UTF */
+                       if (OP(text_node) == EXACTF) {
+                           c1 = to_utf8_lower(s);
+                           c2 = to_utf8_upper(s);
+                       }
+                       else {
+                           c2 = c1 = utf8_to_uvchr(s, NULL);
+                       }
                    }
                }
            }
@@ -3562,7 +3644,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     register I32 c;
     register char *loceol = PL_regeol;
     register I32 hardcount = 0;
-    register bool do_utf8 = DO_UTF8(PL_reg_sv);
+    register bool do_utf8 = PL_reg_match_utf8;
 
     scan = PL_reginput;
     if (max != REG_INFTY && max < loceol - scan)
@@ -3583,6 +3665,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case SANY:
        scan = loceol;
        break;
+    case CANY:
+       scan = loceol;
+       break;
     case EXACT:                /* length of string is 1 */
        c = (U8)*STRING(p);
        while (scan < loceol && UCHARAT(scan) == c)
@@ -3798,7 +3883,7 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
        return 0;
 
     start = PL_reginput;
-    if (DO_UTF8(PL_reg_sv)) {
+    if (PL_reg_match_utf8) {
        while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
            if (!count++) {
                l = 0;
@@ -4034,12 +4119,8 @@ S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
     return s;
 }
 
-#ifdef PERL_OBJECT
-#include "XSUB.h"
-#endif
-
 static void
-restore_pos(pTHXo_ void *arg)
+restore_pos(pTHX_ void *arg)
 {
     if (PL_reg_eval_set) {
        if (PL_reg_oldsaved) {