Remove deprecated flag from sv_nosharing.

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 39a0b6b..97ea458 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1787,7 +1787,9 @@ STMT_START {
      STMT_START {                                            \
          while (s < strend) {                                \
              CODE                                            \
-            s += ((UTF8) ? UTF8SKIP(s) : 1);                \
+            s += ((UTF8)                                    \
+                  ? UTF8_SAFE_SKIP(s, reginfo->strend)      \
+                  : 1);                                     \
          }                                                   \
      } STMT_END
  
@@ -1801,7 +1803,7 @@ STMT_START {
  #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND)                  \
      if (COND) {                                                \
          FBC_CHECK_AND_TRY                                      \
-        s += ((UTF8) ? UTF8SKIP(s) : 1);                       \
+        s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
          previous_occurrence_end = s;                           \
      }                                                          \
      else {                                                     \
@@ -1820,12 +1822,13 @@ STMT_START {
   * of the one we're looking for.  Knowing that, we can see right away if the
   * next occurrence is adjacent to the previous.  When 'doevery' is FALSE, we
   * don't accept the 2nd and succeeding adjacent occurrences */
-#define FBC_CHECK_AND_TRY                                      \
-        if (   (   doevery                                     \
-                || s != previous_occurrence_end)               \
-            && (reginfo->intuit || regtry(reginfo, &s)))       \
-        {                                                      \
-            goto got_it;                                       \
+#define FBC_CHECK_AND_TRY                                           \
+        if (   (   doevery                                          \
+                || s != previous_occurrence_end)                    \
+            && (   reginfo->intuit                                  \
+                || (s <= reginfo->strend && regtry(reginfo, &s))))  \
+        {                                                           \
+            goto got_it;                                            \
          }
  
  
@@ -1858,7 +1861,7 @@ STMT_START {
                                                              \
          if (COND) {                                         \
              FBC_CHECK_AND_TRY                               \
-            s += UTF8SKIP(s);                               \
+            s += UTF8_SAFE_SKIP(s, reginfo->strend);        \
              previous_occurrence_end = s;                    \
          }                                                   \
          else {                                              \
@@ -1977,7 +1980,7 @@ STMT_START {
   * string (which should be zero length without having to look at the string
   * contents) */
  #define REXEC_FBC_TRYIT                                                     \
-    if ((reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))) \
+    if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))   \
          goto got_it
  
  /* The only difference between the BOUND and NBOUND cases is that
@@ -2175,17 +2178,30 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
  
      case ANYOFH:
          if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+
+            /* We know what the first byte of any matched string should be */
              U8 first_byte = FLAGS(c);
  
-            if (first_byte) {   /* We know what the first byte of any matched
-                                   string should be */
-                REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
+            REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
                        reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
-            }
-            else {
-                REXEC_FBC_CLASS_SCAN(TRUE,
-                      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
-            }
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* Can't possibly match a non-UTF-8 target */
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+                   && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
          }
          break;
  
@@ -2398,7 +2414,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
              {
                  goto got_it;
              }
-            s += (utf8_target) ? UTF8SKIP(s) : 1;
+            s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
          }
          break;
      }
@@ -2482,7 +2498,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      }
  
                      /* Didn't match.  Try at the next position (if there is one) */
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                      if (UNLIKELY(s >= reginfo->strend)) {
                          break;
                      }
@@ -2506,7 +2522,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                              goto got_it;
                          }
                          before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                      }
                  }
                  else {  /* Not utf8.  Everything is a GCB except between CR and
@@ -2524,7 +2540,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
  
                  /* And, since this is a bound, it can match after the final
                   * character in the string */
-                if ((reginfo->intuit || regtry(reginfo, &s))) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                      goto got_it;
                  }
                  break;
@@ -2534,7 +2552,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      if (reginfo->intuit || regtry(reginfo, &s)) {
                          goto got_it;
                      }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                      if (UNLIKELY(s >= reginfo->strend)) {
                          break;
                      }
@@ -2558,7 +2576,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                              goto got_it;
                          }
                          before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                      }
                  }
                  else {  /* Not utf8. */
@@ -2580,7 +2598,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      }
                  }
  
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                      goto got_it;
                  }
  
@@ -2591,7 +2611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      if (reginfo->intuit || regtry(reginfo, &s)) {
                          goto got_it;
                      }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                      if (UNLIKELY(s >= reginfo->strend)) {
                          break;
                      }
@@ -2616,7 +2636,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                              goto got_it;
                          }
                          before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                      }
                  }
                  else {  /* Not utf8. */
@@ -2641,7 +2661,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                  /* Here we are at the final position in the target string.  The
                   * SB value is always true here, so matches, depending on other
                   * constraints */
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                      goto got_it;
                  }
  
@@ -2652,7 +2674,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      if (reginfo->intuit || regtry(reginfo, &s)) {
                          goto got_it;
                      }
-                    s += (utf8_target) ? UTF8SKIP(s) : 1;
+                    s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
                      if (UNLIKELY(s >= reginfo->strend)) {
                          break;
                      }
@@ -2686,7 +2708,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                          }
                          previous = before;
                          before = after;
-                        s += UTF8SKIP(s);
+                        s += UTF8_SAFE_SKIP(s, reginfo->strend);
                      }
                  }
                  else {  /* Not utf8. */
@@ -2711,7 +2733,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                      }
                  }
  
-                if (reginfo->intuit || regtry(reginfo, &s)) {
+                if (   reginfo->intuit
+                    || (s <= reginfo->strend && regtry(reginfo, &s)))
+                {
                      goto got_it;
                  }
          }
@@ -3028,7 +3052,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                          LEAVE;
                          goto got_it;
                      }
-                    s = HOPc(s,1);
+                    if (s < reginfo->strend) {
+                        s = HOPc(s,1);
+                    }
                      DEBUG_TRIE_EXECUTE_r({
                          Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
                      });
@@ -3547,7 +3573,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
                 if (*s == ch) {
                     DEBUG_EXECUTE_r( did_match = 1 );
                     if (regtry(reginfo, &s)) goto got_it;
-                   s += UTF8SKIP(s);
+                   s += UTF8_SAFE_SKIP(s, strend);
                     while (s < strend && *s == ch)
                         s += UTF8SKIP(s);
                 }
@@ -6790,8 +6816,33 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
          case ANYOFH:
              if (   ! utf8_target
                  ||   NEXTCHR_IS_EOS
-                ||  (   ANYOF_FLAGS(scan) != 0
-                     && ANYOF_FLAGS(scan) != (U8) *locinput)
+                ||   ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                   utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHb:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                ||   ANYOF_FLAGS(scan) != (U8) *locinput
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+                                                                  utf8_target))
+            {
+                sayNO;
+            }
+            goto increment_locinput;
+            break;
+
+        case ANYOFHr:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+                || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+                             LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+                             HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
                 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                     utf8_target))
              {
@@ -7021,7 +7072,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             break;
              
-       case NREFFL:  /*  /\g{name}/il  */
+       case REFFLN:  /*  /\g{name}/il  */
         {   /* The capture buffer cases.  The ones ending with N for the
                named buffers just convert to the equivalent numbered and
                pretend they were called as the corresponding numbered buffer
@@ -7041,28 +7092,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             utf8_fold_flags = FOLDEQ_LOCALE;
             goto do_nref;
  
-       case NREFFA:  /*  /\g{name}/iaa  */
+       case REFFAN:  /*  /\g{name}/iaa  */
             folder = foldEQ_latin1;
             fold_array = PL_fold_latin1;
             type = REFFA;
             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
             goto do_nref;
  
-       case NREFFU:  /*  /\g{name}/iu  */
+       case REFFUN:  /*  /\g{name}/iu  */
             folder = foldEQ_latin1;
             fold_array = PL_fold_latin1;
             type = REFFU;
             utf8_fold_flags = 0;
             goto do_nref;
  
-       case NREFF:  /*  /\g{name}/i  */
+       case REFFN:  /*  /\g{name}/i  */
             folder = foldEQ;
             fold_array = PL_fold;
             type = REFF;
             utf8_fold_flags = 0;
             goto do_nref;
  
-       case NREF:  /*  /\g{name}/   */
+       case REFN:  /*  /\g{name}/   */
             type = REF;
             folder = NULL;
             fold_array = NULL;
@@ -7716,7 +7767,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
             break;
  
-       case NGROUPP:  /*  (?(<name>))  */
+       case GROUPPN:  /*  (?(<name>))  */
             /* reg_check_named_buff_matched returns 0 for no match */
             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
             break;
@@ -9565,22 +9616,42 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
  
      case ANYOFH:
          if (utf8_target) {  /* ANYOFH only can match UTF-8 targets */
-            if (ANYOF_FLAGS(p)) {   /* If we know the first byte of what
-                                       matches, we can avoid calling reginclass
-                                     */
-                while (   hardcount < max
-                       && scan < this_eol
-                       && (U8) *scan == ANYOF_FLAGS(p)
-                       && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
-                                                                  TRUE))
-                {
-                    scan += UTF8SKIP(scan);
-                    hardcount++;
-                }
+            while (  hardcount < max
+                   && scan < this_eol
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        break;
+
+    case ANYOFHb:
+        if (utf8_target) {  /* ANYOFHb only can match UTF-8 targets */
+
+            /* we know the first byte must be the FLAGS field */
+            while (   hardcount < max
+                   && scan < this_eol
+                   && (U8) *scan == ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+                                                              TRUE))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
              }
-            else while (  hardcount < max
-                        && scan < this_eol
-                        && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+        }
+        break;
+
+    case ANYOFHr:
+        if (utf8_target) {  /* ANYOFHr only can match UTF-8 targets */
+            while (  hardcount < max
+                   && scan < this_eol
+                   && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+                              LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+                              HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+                   && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+                   && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
              {
                  scan += UTF8SKIP(scan);
                  hardcount++;
@@ -9831,7 +9902,9 @@ STATIC bool
  S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
  {
      dVAR;
-    const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
+    const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
+                        ? 0
+                        : ANYOF_FLAGS(n);
      bool match = FALSE;
      UV c = *p;
  
@@ -9858,7 +9931,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
      }
  
      /* If this character is potentially in the bitmap, check it */
-    if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
+    if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
         if (ANYOF_BITMAP_TEST(n, c))
             match = TRUE;
         else if ((flags
@@ -10160,6 +10233,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
      regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
  
      eval_state->rex = rex;
+    eval_state->sv  = reginfo->sv;
  
      if (reginfo->sv) {
          /* Make $_ available to executed code. */
@@ -10167,6 +10241,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
              SAVE_DEFSV;
              DEFSV_set(reginfo->sv);
          }
+        /* will be dec'd by S_cleanup_regmatch_info_aux */
+        SvREFCNT_inc_NN(reginfo->sv);
  
          if (!(mg = mg_find_mglob(reginfo->sv))) {
              /* prepare for quick setting of pos */
@@ -10258,6 +10334,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg)
          }
  
          PL_curpm = eval_state->curpm;
+        SvREFCNT_dec(eval_state->sv);
      }
  
      PL_regmatch_state = aux->old_regmatch_state;
@@ -10328,6 +10405,7 @@ S_to_byte_substr(pTHX_ regexp *prog)
             && !prog->substrs->data[i].substr) {
             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
             if (! sv_utf8_downgrade(sv, TRUE)) {
+                SvREFCNT_dec_NN(sv);
                  return FALSE;
              }
              if (SvVALID(prog->substrs->data[i].utf8_substr)) {