regexec.c: restrict match to substring in regmatch()

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 223eaf2..bca2c4c 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -5645,6 +5645,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
      SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
      SSize_t endref = 0; /* offset of end of backref when ln is start */
      char *locinput = startpos;
      SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
      SSize_t endref = 0; /* offset of end of backref when ln is start */
      char *locinput = startpos;
+    char *loceol = reginfo->strend;
      char *pushinput; /* where to continue after a PUSH */
      I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
  
      char *pushinput; /* where to continue after a PUSH */
      I32 nextchr;   /* is always set to UCHARAT(locinput), or -1 at EOS */
  
@@ -5809,13 +5810,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             break;
  
         case SANY: /*  /./s  */
             break;
  
         case SANY: /*  /./s  */
-           if (NEXTCHR_IS_EOS)
+           if (NEXTCHR_IS_EOS || locinput >= loceol)
                 sayNO;
              goto increment_locinput;
  
         case REG_ANY: /*  /./  */
                 sayNO;
              goto increment_locinput;
  
         case REG_ANY: /*  /./  */
-           if ((NEXTCHR_IS_EOS) || nextchr == '\n')
+           if (   NEXTCHR_IS_EOS
+                || locinput >= loceol
+                || nextchr == '\n')
+            {
                 sayNO;
                 sayNO;
+            }
              goto increment_locinput;
  
  
              goto increment_locinput;
  
  
@@ -5825,7 +5830,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              /* In this case the charclass data is available inline so
                 we can fail fast without a lot of extra overhead. 
               */
              /* In this case the charclass data is available inline so
                 we can fail fast without a lot of extra overhead. 
               */
-            if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
+            if ( !   NEXTCHR_IS_EOS
+                &&   locinput < loceol
+                && ! ANYOF_BITMAP_TEST(scan, nextchr))
+            {
                  DEBUG_EXECUTE_r(
                      Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
                                depth, PL_colors[4], PL_colors[5])
                  DEBUG_EXECUTE_r(
                      Perl_re_exec_indentf( aTHX_  "%sTRIE: failed to match trie start class...%s\n",
                                depth, PL_colors[4], PL_colors[5])
@@ -5904,7 +5912,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                      }
                  }
                  if (   trie->bitmap
                      }
                  }
                  if (   trie->bitmap
-                    && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+                    && (     NEXTCHR_IS_EOS
+                        ||   locinput >= loceol
+                        || ! TRIE_BITMAP_TEST(trie, nextchr)))
                  {
                     if (trie->states[ state ].wordnum) {
                          DEBUG_EXECUTE_r(
                  {
                     if (trie->states[ state ].wordnum) {
                          DEBUG_EXECUTE_r(
@@ -5942,7 +5952,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                    shortest accept state and the wordnum of the longest
                    accept state */
  
                    shortest accept state and the wordnum of the longest
                    accept state */
  
-               while ( state && uc <= (U8*)(reginfo->strend) ) {
+               while ( state && uc <= (U8*)(loceol) ) {
                      U32 base = trie->states[ state ].trans.base;
                      UV uvc = 0;
                      U16 charid = 0;
                      U32 base = trie->states[ state ].trans.base;
                      UV uvc = 0;
                      U16 charid = 0;
@@ -5977,10 +5987,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     });
  
                     /* read a char and goto next state */
                     });
  
                     /* read a char and goto next state */
-                   if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
+                   if ( base && (foldlen || uc < (U8*)(loceol))) {
                         I32 offset;
                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
                         I32 offset;
                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
-                                             (U8 *) reginfo->strend, uscan,
+                                             (U8 *) loceol, uscan,
                                               len, uvc, charid, foldlen,
                                               foldbuf, uniflags);
                         charcount++;
                                               len, uvc, charid, foldlen,
                                               foldbuf, uniflags);
                         charcount++;
@@ -6214,7 +6224,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                       * is an invariant, but there are tests in the test suite
                       * dealing with (??{...}) which violate this) */
                     while (s < e) {
                       * is an invariant, but there are tests in the test suite
                       * dealing with (??{...}) which violate this) */
                     while (s < e) {
-                       if (l >= reginfo->strend
+                       if (   l >= loceol
                              || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
                          {
                              sayNO;
                              || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
                          {
                              sayNO;
@@ -6238,7 +6248,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 else {
                     /* The target is not utf8, the pattern is utf8. */
                     while (s < e) {
                 else {
                     /* The target is not utf8, the pattern is utf8. */
                     while (s < e) {
-                        if (l >= reginfo->strend
+                        if (   l >= loceol
                              || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
                          {
                              sayNO;
                              || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
                          {
                              sayNO;
@@ -6264,7 +6274,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              else {
                  /* The target and the pattern have the same utf8ness. */
                  /* Inline the first character, for speed. */
              else {
                  /* The target and the pattern have the same utf8ness. */
                  /* Inline the first character, for speed. */
-                if (reginfo->strend - locinput < ln
+                if (   loceol - locinput < ln
                      || UCHARAT(s) != nextchr
                      || (ln > 1 && memNE(s, locinput, ln)))
                  {
                      || UCHARAT(s) != nextchr
                      || (ln > 1 && memNE(s, locinput, ln)))
                  {
@@ -6360,7 +6370,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
               /* Either target or the pattern are utf8, or has the issue where
                * the fold lengths may differ. */
                 const char * const l = locinput;
               /* Either target or the pattern are utf8, or has the issue where
                * the fold lengths may differ. */
                 const char * const l = locinput;
-               char *e = reginfo->strend;
+               char *e = loceol;
  
                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
                                          s, 0,  ln, is_utf8_pat,fold_utf8_flags))
  
                 if (! foldEQ_utf8_flags(l, &e, 0,  utf8_target,
                                          s, 0,  ln, is_utf8_pat,fold_utf8_flags))
@@ -6378,7 +6388,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             {
                 sayNO;
             }
             {
                 sayNO;
             }
-           if (reginfo->strend - locinput < ln)
+           if (loceol - locinput < ln)
                 sayNO;
             if (ln > 1 && ! folder(locinput, s, ln))
                 sayNO;
                 sayNO;
             if (ln > 1 && ! folder(locinput, s, ln))
                 sayNO;
@@ -6674,7 +6684,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
              /* FALLTHROUGH */
         case ANYOFD:  /*   /[abc]/d       */
         case ANYOF:  /*   /[abc]/       */
              /* FALLTHROUGH */
         case ANYOFD:  /*   /[abc]/d       */
         case ANYOF:  /*   /[abc]/       */
-            if (NEXTCHR_IS_EOS)
+            if (NEXTCHR_IS_EOS || locinput >= loceol)
                  sayNO;
             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
                  sayNO;
             if (  (! utf8_target || UTF8_IS_INVARIANT(*locinput))
                 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
@@ -6685,7 +6695,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 locinput++;
              }
              else {
                 locinput++;
              }
              else {
-               if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+               if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                     utf8_target))
                  {
                     sayNO;
                                                                     utf8_target))
                  {
                     sayNO;
@@ -6695,14 +6705,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             break;
  
          case ANYOFM:
             break;
  
          case ANYOFM:
-            if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
+            if (   NEXTCHR_IS_EOS
+                || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
+                || locinput >= loceol)
+            {
                  sayNO;
              }
              locinput++; /* ANYOFM is always single byte */
              break;
  
          case NANYOFM:
                  sayNO;
              }
              locinput++; /* ANYOFM is always single byte */
              break;
  
          case NANYOFM:
-            if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) {
+            if (   NEXTCHR_IS_EOS
+                || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
+                || locinput >= loceol)
+            {
                  sayNO;
              }
              goto increment_locinput;
                  sayNO;
              }
              goto increment_locinput;
@@ -6711,7 +6727,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
          case ANYOFH:
              if (   ! utf8_target
                  ||   NEXTCHR_IS_EOS
          case ANYOFH:
              if (   ! utf8_target
                  ||   NEXTCHR_IS_EOS
-               || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
                                                                     utf8_target))
              {
                  sayNO;
                                                                     utf8_target))
              {
                  sayNO;
@@ -6728,7 +6744,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
          case POSIXL:    /* \w or [:punct:] etc. under /l */
              _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
  
          case POSIXL:    /* \w or [:punct:] etc. under /l */
              _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
-            if (NEXTCHR_IS_EOS)
+            if (NEXTCHR_IS_EOS || locinput >= loceol)
                  sayNO;
  
              /* Use isFOO_lc() for characters within Latin1.  (Note that
                  sayNO;
  
              /* Use isFOO_lc() for characters within Latin1.  (Note that
@@ -6773,7 +6789,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
          case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
  
  
          case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
  
-            if (NEXTCHR_IS_EOS) {
+            if (NEXTCHR_IS_EOS || locinput >= loceol) {
                  sayNO;
              }
  
                  sayNO;
              }
  
@@ -6792,7 +6808,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
               * UTF-8, and also from NPOSIXA even in UTF-8 when the current
               * character is a single byte */
  
               * UTF-8, and also from NPOSIXA even in UTF-8 when the current
               * character is a single byte */
  
-            if (NEXTCHR_IS_EOS) {
+            if (NEXTCHR_IS_EOS || locinput >= loceol) {
                  sayNO;
              }
  
                  sayNO;
              }
  
@@ -6815,7 +6831,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
          case POSIXU:    /* \w or [:punct:] etc. under /u */
            utf8_posix:
  
          case POSIXU:    /* \w or [:punct:] etc. under /u */
            utf8_posix:
-            if (NEXTCHR_IS_EOS) {
+            if (NEXTCHR_IS_EOS || locinput >= loceol) {
                  sayNO;
              }
  
                  sayNO;
              }
  
@@ -6896,7 +6912,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
  
         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
                        a Unicode extended Grapheme Cluster */
  
         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
                        a Unicode extended Grapheme Cluster */
-           if (NEXTCHR_IS_EOS)
+           if (NEXTCHR_IS_EOS || locinput >= loceol)
                 sayNO;
             if  (! utf8_target) {
  
                 sayNO;
             if  (! utf8_target) {
  
@@ -6905,7 +6921,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 locinput++;         /* Match the . or CR */
                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
                                        match the LF */
                 locinput++;         /* Match the . or CR */
                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
                                        match the LF */
-                   && locinput < reginfo->strend
+                   && locinput <  loceol
                     && UCHARAT(locinput) == '\n')
                  {
                      locinput++;
                     && UCHARAT(locinput) == '\n')
                  {
                      locinput++;
@@ -6922,7 +6938,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                   * current character.  (There is always a break at the
                   * end-of-input) */
                  locinput += UTF8SKIP(locinput);
                   * current character.  (There is always a break at the
                   * end-of-input) */
                  locinput += UTF8SKIP(locinput);
-                while (locinput < reginfo->strend) {
+                while (locinput < loceol) {
                      GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
                                                           (U8*) reginfo->strend);
                      if (isGCB(prev_gcb, cur_gcb,
                      GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
                                                           (U8*) reginfo->strend);
                      if (isGCB(prev_gcb, cur_gcb,
@@ -7044,11 +7060,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             if (type != REF     /* REF can do byte comparison */
                 && (utf8_target || type == REFFU || type == REFFL))
             {
             if (type != REF     /* REF can do byte comparison */
                 && (utf8_target || type == REFFU || type == REFFL))
             {
-               char * limit = reginfo->strend;
+               char * limit = loceol;
  
                 /* This call case insensitively compares the entire buffer
                     * at s, with the current input starting at locinput, but
  
                 /* This call case insensitively compares the entire buffer
                     * at s, with the current input starting at locinput, but
-                    * not going off the end given by reginfo->strend, and
+                    * not going off the end given by loceol, and
                      * returns in <limit> upon success, how much of the
                      * current input was matched */
                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
                      * returns in <limit> upon success, how much of the
                      * current input was matched */
                 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
@@ -7061,13 +7077,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
  
             /* Not utf8:  Inline the first character, for speed. */
             }
  
             /* Not utf8:  Inline the first character, for speed. */
-           if (!NEXTCHR_IS_EOS &&
-                UCHARAT(s) != nextchr &&
-               (type == REF ||
-                UCHARAT(s) != fold_array[nextchr]))
+           if ( ! NEXTCHR_IS_EOS
+                && locinput < loceol
+                && UCHARAT(s) != nextchr
+                && (   type == REF
+                    || UCHARAT(s) != fold_array[nextchr]))
+            {
                 sayNO;
                 sayNO;
+            }
             ln = endref - ln;
             ln = endref - ln;
-           if (locinput + ln > reginfo->strend)
+           if (locinput + ln > loceol)
                 sayNO;
             if (ln > 1 && (type == REF
                            ? memNE(s, locinput, ln)
                 sayNO;
             if (ln > 1 && (type == REF
                            ? memNE(s, locinput, ln)
@@ -8313,7 +8332,7 @@ NULL
              if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
              {
                  char *li = locinput;
              if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
              {
                  char *li = locinput;
-                if (!regrepeat(rex, &li, scan, reginfo->strend, reginfo, 1))
+                if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
                     sayNO;
                  SET_locinput(li);
                  goto fake_end;
                     sayNO;
                  SET_locinput(li);
                  goto fake_end;
@@ -8369,7 +8388,7 @@ NULL
                  char *li = locinput;
                 minmod = 0;
                 if (ST.min &&
                  char *li = locinput;
                 minmod = 0;
                 if (ST.min &&
-                        regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, ST.min)
+                        regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
                              < ST.min)
                     sayNO;
                  SET_locinput(li);
                              < ST.min)
                     sayNO;
                  SET_locinput(li);
@@ -8383,7 +8402,7 @@ NULL
                 /* set ST.maxpos to the furthest point along the
                  * string that could possibly match */
                 if  (ST.max == REG_INFTY) {
                 /* set ST.maxpos to the furthest point along the
                  * string that could possibly match */
                 if  (ST.max == REG_INFTY) {
-                   ST.maxpos = reginfo->strend - 1;
+                   ST.maxpos = loceol - 1;
                     if (utf8_target)
                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
                             ST.maxpos--;
                     if (utf8_target)
                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
                             ST.maxpos--;
@@ -8391,13 +8410,13 @@ NULL
                 else if (utf8_target) {
                     int m = ST.max - ST.min;
                     for (ST.maxpos = locinput;
                 else if (utf8_target) {
                     int m = ST.max - ST.min;
                     for (ST.maxpos = locinput;
-                        m >0 && ST.maxpos < reginfo->strend; m--)
+                        m >0 && ST.maxpos <  loceol; m--)
                         ST.maxpos += UTF8SKIP(ST.maxpos);
                 }
                 else {
                     ST.maxpos = locinput + ST.max - ST.min;
                         ST.maxpos += UTF8SKIP(ST.maxpos);
                 }
                 else {
                     ST.maxpos = locinput + ST.max - ST.min;
-                   if (ST.maxpos >= reginfo->strend)
-                       ST.maxpos = reginfo->strend - 1;
+                   if (ST.maxpos >=  loceol)
+                       ST.maxpos =  loceol - 1;
                 }
                 goto curly_try_B_min_known;
  
                 }
                 goto curly_try_B_min_known;
  
@@ -8406,7 +8425,7 @@ NULL
                  /* avoid taking address of locinput, so it can remain
                   * a register var */
                  char *li = locinput;
                  /* avoid taking address of locinput, so it can remain
                   * a register var */
                  char *li = locinput;
-                ST.count = regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, ST.max);
+                ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
                 if (ST.count < ST.min)
                     sayNO;
                  SET_locinput(li);
                 if (ST.count < ST.min)
                     sayNO;
                  SET_locinput(li);
@@ -8439,7 +8458,7 @@ NULL
              if (ST.c1 == CHRTEST_VOID) {
                  /* failed -- move forward one */
                  char *li = locinput;
              if (ST.c1 == CHRTEST_VOID) {
                  /* failed -- move forward one */
                  char *li = locinput;
-                if (!regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, 1)) {
+                if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
                      sayNO;
                  }
                  locinput = li;
                      sayNO;
                  }
                  locinput = li;
@@ -8536,7 +8555,7 @@ NULL
                       * locinput matches */
                      char *li = ST.oldloc;
                     ST.count += n;
                       * locinput matches */
                      char *li = ST.oldloc;
                     ST.count += n;
-                    if (regrepeat(rex, &li, ST.A, reginfo->strend, reginfo, n) < n)
+                    if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
                         sayNO;
                      assert(n == REG_INFTY || locinput == li);
                 }
                         sayNO;
                      assert(n == REG_INFTY || locinput == li);
                 }
@@ -8551,7 +8570,7 @@ NULL
            curly_try_B_max:
             /* a successful greedy match: now try to match B */
             {
            curly_try_B_max:
             /* a successful greedy match: now try to match B */
             {
-               bool could_match = locinput < reginfo->strend;
+               bool could_match = locinput <  loceol;
  
                 /* If it could work, try it. */
                  if (ST.c1 != CHRTEST_VOID && could_match) {
  
                 /* If it could work, try it. */
                  if (ST.c1 != CHRTEST_VOID && could_match) {
@@ -8726,7 +8745,7 @@ NULL
             break;
  
         case COMMIT:  /*  (*COMMIT)  */
             break;
  
         case COMMIT:  /*  (*COMMIT)  */
-           reginfo->cutpoint = reginfo->strend;
+           reginfo->cutpoint = loceol;
             /* FALLTHROUGH */
  
         case PRUNE:   /*  (*PRUNE)   */
             /* FALLTHROUGH */
  
         case PRUNE:   /*  (*PRUNE)   */
@@ -8837,7 +8856,7 @@ NULL
  #undef ST
  
          case LNBREAK: /* \R */
  #undef ST
  
          case LNBREAK: /* \R */
-            if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
+            if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
                  locinput += n;
              } else
                  sayNO;
                  locinput += n;
              } else
                  sayNO;
@@ -8856,7 +8875,7 @@ NULL
                  locinput += PL_utf8skip[nextchr];
                  /* locinput is allowed to go 1 char off the end (signifying
                   * EOS), but not 2+ */
                  locinput += PL_utf8skip[nextchr];
                  /* locinput is allowed to go 1 char off the end (signifying
                   * EOS), but not 2+ */
-                if (locinput > reginfo->strend)
+                if (locinput >  loceol)
                      sayNO;
              }
              else
                      sayNO;
              }
              else