Integrate:

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 1127933..b761752 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -5,6 +5,17 @@
   * "One Ring to rule them all, One Ring to find them..."
   */
  
+/* This file contains functions for executing a regular expression.  See
+ * also regcomp.c which, funnily enough, contains functions for compiling
+ * a regular expression.
+ *
+ * This file is also copied at build time to ext/re/re_exec.c, where
+ * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
+ * This causes the main functions to be compiled under new names and with
+ * debugging support added, which makes "use re 'debug'" work.
+ 
+ */
+
  /* NOTE: this is derived from Henry Spencer's regexp code, and should not
   * confused with the original package (see point 3 below).  Thanks, Henry!
   */
@@ -68,7 +79,7 @@
   ****    Alterations to Henry's code are...
   ****
   ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
- ****    2000, 2001, 2002, 2003, by Larry Wall and others
+ ****    2000, 2001, 2002, 2003, 2004, by Larry Wall and others
   ****
   ****    You may distribute under the terms of either the GNU General Public
   ****    License or the Artistic License, as specified in the README file.
@@ -953,6 +964,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
         char *m;
         STRLEN ln;
+       STRLEN lnc;
+       register STRLEN uskip;
         unsigned int c1;
         unsigned int c2;
         char *e;
@@ -963,7 +976,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         switch (OP(c)) {
         case ANYOF:
             if (do_utf8) {
-                while (s < strend) {
+                while (s + (uskip = UTF8SKIP(s)) <= strend) {
                       if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
                           !UTF8_IS_INVARIANT((U8)s[0]) ?
                           reginclass(c, (U8*)s, 0, do_utf8) :
@@ -975,7 +988,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                       }
                       else 
                            tmp = 1;
-                     s += UTF8SKIP(s);
+                     s += uskip;
                  }
             }
             else {
@@ -1008,10 +1021,12 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             }
             break;
         case EXACTF:
-           m = STRING(c);
-           ln = STR_LEN(c);
+           m   = STRING(c);
+           ln  = STR_LEN(c);   /* length to match in octets/bytes */
+           lnc = (I32) ln;     /* length to match in characters */
             if (UTF) {
                 STRLEN ulen1, ulen2;
+               U8 *sm = (U8 *) m;
                 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
                 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
  
@@ -1022,6 +1037,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                                     0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
                                     0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+               lnc = 0;
+               while (sm < ((U8 *) m + ln)) {
+                   lnc++;
+                   sm += UTF8SKIP(sm);
+               }
             }
             else {
                 c1 = *(U8*)m;
@@ -1029,12 +1049,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             }
             goto do_exactf;
         case EXACTFL:
-           m = STRING(c);
-           ln = STR_LEN(c);
+           m   = STRING(c);
+           ln  = STR_LEN(c);
+           lnc = (I32) ln;
             c1 = *(U8*)m;
             c2 = PL_fold_locale[c1];
           do_exactf:
-           e = HOP3c(strend, -(I32)ln, s);
+           e = HOP3c(strend, -((I32)lnc), s);
  
             if (norun && e < s)
                 e = s;                  /* Due to minlen logic of intuit() */
@@ -1057,6 +1078,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 STRLEN len, foldlen;
                 
                 if (c1 == c2) {
+                   /* Upper and lower of 1st char are equal -
+                    * probably not a "letter". */
                     while (s <= e) {
                         c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
                                            ckWARN(WARN_UTF8) ?
@@ -1161,7 +1184,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 tmp = ((OP(c) == BOUND ?
                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
                 LOAD_UTF8_CHARCLASS(alnum,"a");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (tmp == !(OP(c) == BOUND ?
                                  swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
                                  isALNUM_LC_utf8((U8*)s)))
@@ -1170,7 +1193,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                         if ((norun || regtry(prog, s)))
                             goto got_it;
                     }
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1204,14 +1227,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 tmp = ((OP(c) == NBOUND ?
                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
                 LOAD_UTF8_CHARCLASS(alnum,"a");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (tmp == !(OP(c) == NBOUND ?
                                  swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
                                  isALNUM_LC_utf8((U8*)s)))
                         tmp = !tmp;
                     else if ((norun || regtry(prog, s)))
                         goto got_it;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1233,7 +1256,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case ALNUM:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(alnum,"a");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1242,7 +1265,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1262,7 +1285,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case ALNUML:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (isALNUM_LC_utf8((U8*)s)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1271,7 +1294,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1291,7 +1314,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NALNUM:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(alnum,"a");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1300,7 +1323,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1320,7 +1343,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NALNUML:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!isALNUM_LC_utf8((U8*)s)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1329,7 +1352,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1349,7 +1372,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case SPACE:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(space," ");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1358,7 +1381,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1378,7 +1401,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case SPACEL:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1387,7 +1410,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1407,7 +1430,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NSPACE:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(space," ");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1416,7 +1439,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1436,7 +1459,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NSPACEL:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1445,7 +1468,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1465,7 +1488,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case DIGIT:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(digit,"0");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1474,7 +1497,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1494,7 +1517,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case DIGITL:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (isDIGIT_LC_utf8((U8*)s)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1503,7 +1526,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1523,7 +1546,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NDIGIT:
             if (do_utf8) {
                 LOAD_UTF8_CHARCLASS(digit,"0");
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1532,7 +1555,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -1552,7 +1575,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         case NDIGITL:
             PL_reg_flags |= RF_tainted;
             if (do_utf8) {
-               while (s < strend) {
+               while (s + (uskip = UTF8SKIP(s)) <= strend) {
                     if (!isDIGIT_LC_utf8((U8*)s)) {
                         if (tmp && (norun || regtry(prog, s)))
                             goto got_it;
@@ -1561,7 +1584,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                     }
                     else
                         tmp = 1;
-                   s += UTF8SKIP(s);
+                   s += uskip;
                 }
             }
             else {
@@ -2021,28 +2044,17 @@ got_it:
  
      /* make sure $`, $&, $', and $digit will work later */
      if ( !(flags & REXEC_NOT_FIRST) ) {
-       RX_MATCH_COPY_FREE(prog);
+       if (RX_MATCH_COPIED(prog)) {
+           Safefree(prog->subbeg);
+           RX_MATCH_COPIED_off(prog);
+       }
         if (flags & REXEC_COPY_STR) {
             I32 i = PL_regeol - startpos + (stringarg - strbeg);
-#ifdef PERL_COPY_ON_WRITE
-           if ((SvIsCOW(sv)
-                || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
-               if (DEBUG_C_TEST) {
-                   PerlIO_printf(Perl_debug_log,
-                                 "Copy on write: regexp capture, type %d\n",
-                                 (int) SvTYPE(sv));
-               }
-               prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
-               prog->subbeg = SvPVX(prog->saved_copy);
-               assert (SvPOKp(prog->saved_copy));
-           } else
-#endif
-           {
-               RX_MATCH_COPIED_on(prog);
-               s = savepvn(strbeg, i);
-               prog->subbeg = s;
-           }
+
+           s = savepvn(strbeg, i);
+           prog->subbeg = s;
             prog->sublen = i;
+           RX_MATCH_COPIED_on(prog);
         }
         else {
             prog->subbeg = strbeg;
@@ -2132,9 +2144,6 @@ S_regtry(pTHX_ regexp *prog, char *startpos)
                 $` inside (?{}) could fail... */
             PL_reg_oldsaved = prog->subbeg;
             PL_reg_oldsavedlen = prog->sublen;
-#ifdef PERL_COPY_ON_WRITE
-           PL_nrs = prog->saved_copy;
-#endif
             RX_MATCH_COPIED_off(prog);
         }
         else
@@ -3357,7 +3366,7 @@ S_regmatch(pTHX_ regnode *prog)
             CHECKPOINT lastcp;
         
             /* We suppose that the next guy does not need
-              backtracking: in particular, it is of constant length,
+              backtracking: in particular, it is of constant non-zero length,
                and has no parenths to influence future backrefs. */
             ln = ARG1(scan);  /* min to match */
             n  = ARG2(scan);  /* max to match */
@@ -3376,15 +3385,6 @@ S_regmatch(pTHX_ regnode *prog)
                 minmod = 0;
                 if (ln && regrepeat_hard(scan, ln, &l) < ln)
                     sayNO;
-               /* if we matched something zero-length we don't need to
-                  backtrack - capturing parens are already defined, so
-                  the caveat in the maximal case doesn't apply
-
-                  XXXX if ln == 0, we can redo this check first time
-                  through the following loop
-               */
-               if (ln && l == 0)
-                   n = ln;     /* don't backtrack */
                 locinput = PL_reginput;
                 if (HAS_TEXT(next) || JUMPABLE(next)) {
                     regnode *text_node = next;
@@ -3410,8 +3410,7 @@ S_regmatch(pTHX_ regnode *prog)
                     c1 = c2 = -1000;
             assume_ok_MM:
                 REGCP_SET(lastcp);
-               /* This may be improved if l == 0.  */
-               while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
+               while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
                     /* If it could work, try it. */
                     if (c1 == -1000 ||
                         UCHARAT(PL_reginput) == c1 ||
@@ -3442,13 +3441,6 @@ S_regmatch(pTHX_ regnode *prog)
             }
             else {
                 n = regrepeat_hard(scan, n, &l);
-               /* if we matched something zero-length we don't need to
-                  backtrack, unless the minimum count is zero and we
-                  are capturing the result - in that case the capture
-                  being defined or not may affect later execution
-               */
-               if (n != 0 && l == 0 && !(paren && ln == 0))
-                   ln = n;     /* don't backtrack */
                 locinput = PL_reginput;
                 DEBUG_r(
                     PerlIO_printf(Perl_debug_log,
@@ -4253,7 +4245,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
  /*
   - regrepeat_hard - repeatedly match something, report total lenth and length
   *
- * The repeater is supposed to have constant length.
+ * The repeater is supposed to have constant non-zero length.
   */
  
  STATIC I32
@@ -4550,9 +4542,6 @@ restore_pos(pTHX_ void *arg)
         if (PL_reg_oldsaved) {
             PL_reg_re->subbeg = PL_reg_oldsaved;
             PL_reg_re->sublen = PL_reg_oldsavedlen;
-#ifdef PERL_COPY_ON_WRITE
-           PL_reg_re->saved_copy = PL_nrs;
-#endif
             RX_MATCH_COPIED_on(PL_reg_re);
         }
         PL_reg_magic->mg_len = PL_reg_oldpos;