Bump Data::Dumper version

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index 533c0df..75d58ce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -425,10 +425,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
  
  #define regcpblow(cp) LEAVE_SCOPE(cp)  /* Ignores regcppush()ed data. */
  
-#ifndef PERL_IN_XSUB_RE
-
-bool
-Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
+STATIC bool
+S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
  {
      /* Returns a boolean as to whether or not 'character' is a member of the
       * Posix character class given by 'classnum' that should be equivalent to a
@@ -468,8 +466,6 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
      return FALSE;
  }
  
-#endif
-
  PERL_STATIC_INLINE I32
  S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
  {
@@ -1412,7 +1408,7 @@ Perl_re_intuit_start(pTHX_
           * On the one hand you'd expect rare substrings to appear less
           * often than \n's. On the other hand, searching for \n means
           * we're effectively flipping between check_substr and "\n" on each
-         * iteration as the current "rarest" string candidate, which
+         * iteration as the current "rarest" candidate string, which
           * means for example that we'll quickly reject the whole string if
           * hasn't got a \n, rather than trying every substr position
           * first
@@ -4515,6 +4511,9 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
       * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
       * situation occurs if the first character in <text_node> requires UTF-8 to
       * represent, and the target string isn't in UTF-8.)
+     *
+     * Some analysis is in GH #18414, located at the time of this writing at:
+     * https://github.com/Perl/perl5/issues/18414
       */
  
      const bool utf8_target = reginfo->is_utf8_target;
@@ -4524,7 +4523,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
  
      /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
       */
-    U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { 0 };
+    U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
      U8 lengths[MAX_MATCHES] = { 0 };
  
      U8 index_of_longest = 0;
@@ -4691,23 +4690,36 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
       *
       * Everything generally matches at least itself.  But if there is a
       * UTF8ness mismatch, we have to convert to that of the target string. */
-    if (utf8_pat == utf8_target || UTF8_IS_INVARIANT(*pat)) {
-        lengths[0] = MIN(pat_len, C_ARRAY_LENGTH(matches[0]));
-        Copy(pat, matches[0], lengths[0], U8);
+    if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
+        matches[0][0] = pat[0];
+        lengths[0] = 1;
          m->count++;
      }
-    else if (utf8_target) { /* target is UTF-8; pattern isn't */
-        matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
-        matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
-        lengths[0] = 2;
-        m->count++;
-    }
-    else { /* pattern is UTF-8, target isn't */
-        if (UTF8_IS_DOWNGRADEABLE_START(*pat)) {
-            matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
-            lengths[0] = 1;
+    else if (utf8_target) {
+        if (utf8_pat) {
+            lengths[0] = UTF8SKIP(pat);
+            Copy(pat, matches[0], lengths[0], U8);
              m->count++;
          }
+        else {  /* target is UTF-8, pattern isn't */
+            matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
+            matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
+            lengths[0] = 2;
+            m->count++;
+        }
+    }
+    else if (! utf8_pat) {  /* Neither is UTF-8 */
+        matches[0][0] = pat[0];
+        lengths[0] = 1;
+        m->count++;
+    }
+    else     /* target isn't UTF-8; pattern is.  No match possible unless the
+                pattern's first character can fit in a byte */
+         if (UTF8_IS_DOWNGRADEABLE_START(*pat))
+    {
+        matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
+        lengths[0] = 1;
+        m->count++;
      }
  
      /* Here we have taken care of any necessary node-type changes */
@@ -4808,7 +4820,8 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
                  fold_from = remaining_fold_froms[i-1];
              }
  
-            if (folded == fold_from) {  /* We already added the character itself */
+            if (folded == fold_from) {  /* We already added the character
+                                           itself */
                  continue;
              }
  
@@ -4845,8 +4858,8 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
                  lengths[m->count] = UVCHR_SKIP(fold_from);
                  m->count++;
              }
-            else { /* Non-UTF8 target: any code point above 255
-                      can't appear in it */
+            else { /* Non-UTF8 target: no code point above 255 can appear in it
+                    */
                  if (fold_from > 255) {
                      continue;
                  }
@@ -4969,7 +4982,10 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
          if (m->count > 1) { /* No need to sort a single entry */
              for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
  
-                /* Keep the same order for all but the longest */
+                /* Keep the same order for all but the longest.  (If the
+                 * asserts fail, it could be because m->matches is declared too
+                 * short, either because of a new Unicode release, or an
+                 * overlooked test case, or it could be a bug.) */
                  if (i != index_of_longest) {
                      assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
                      Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
@@ -4987,6 +5003,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
          m->lengths[output_index] = lengths[index_of_longest];
      }
  
+
      return TRUE;
  }
  
@@ -9218,60 +9235,60 @@ NULL
                curly_try_B_min_known:
                  /* find the next place where 'B' could work, then call B */
                  if (locinput + ST.Binfo.initial_exact < loceol) {
-                if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
-
-                    /* Here, the mask is all 1's for the entire length of
-                        * any possible match.  (That actually means that there
-                        * is only one possible match.)  Look for the next
-                        * occurrence */
-                    locinput = ninstr(locinput, loceol,
-                                    (char *) ST.Binfo.matches,
-                                    (char *) ST.Binfo.matches
-                                                + ST.Binfo.initial_exact);
-                    if (locinput == NULL) {
-                        sayNO;
-                    }
-                }
-                else do {
-                    /* If the first byte(s) of the mask are all ones, it
-                        * means those bytes must match identically, so can use
-                        * ninstr() to find the next possible matchpoint */
-                    if (ST.Binfo.initial_exact > 0) {
+                    if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
+
+                        /* Here, the mask is all 1's for the entire length of
+                         * any possible match.  (That actually means that there
+                         * is only one possible match.)  Look for the next
+                         * occurrence */
                          locinput = ninstr(locinput, loceol,
-                                            (char *) ST.Binfo.matches,
-                                            (char *) ST.Binfo.matches
+                                        (char *) ST.Binfo.matches,
+                                        (char *) ST.Binfo.matches
                                                      + ST.Binfo.initial_exact);
+                        if (locinput == NULL) {
+                            sayNO;
+                        }
                      }
-                    else { /* Otherwise find the next byte that matches,
-                                masked */
-                        locinput = (char *) find_next_masked(
-                                            (U8 *) locinput, (U8 *) loceol,
-                                            ST.Binfo.first_byte_anded,
-                                            ST.Binfo.first_byte_mask);
-                        /* Advance to the end of a multi-byte character */
-                        if (utf8_target) {
-                            while (   locinput < loceol
-                                && UTF8_IS_CONTINUATION(*locinput))
-                            {
-                                locinput++;
+                    else do {
+                        /* If the first byte(s) of the mask are all ones, it
+                         * means those bytes must match identically, so we can
+                         * use ninstr() to find the next possible matchpoint */
+                        if (ST.Binfo.initial_exact > 0) {
+                            locinput = ninstr(locinput, loceol,
+                                              (char *) ST.Binfo.matches,
+                                              (char *) ST.Binfo.matches
+                                                     + ST.Binfo.initial_exact);
+                        }
+                        else { /* Otherwise find the next byte that matches,
+                                  masked */
+                            locinput = (char *) find_next_masked(
+                                                (U8 *) locinput, (U8 *) loceol,
+                                                ST.Binfo.first_byte_anded,
+                                                ST.Binfo.first_byte_mask);
+                            /* Advance to the end of a multi-byte character */
+                            if (utf8_target) {
+                                while (   locinput < loceol
+                                    && UTF8_IS_CONTINUATION(*locinput))
+                                {
+                                    locinput++;
+                                }
                              }
                          }
-                    }
-                    if (   locinput == NULL
-                        || locinput + ST.Binfo.min_length > loceol)
-                    {
-                        sayNO;
-                    }
+                        if (   locinput == NULL
+                            || locinput + ST.Binfo.min_length > loceol)
+                        {
+                            sayNO;
+                        }
  
-                    /* Here, we have found a possible match point; if can't
-                        * rule it out, quit the loop so can check fully */
-                    if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
-                        break;
-                    }
+                        /* Here, we have found a possible match point; if we
+                         * can't rule it out, quit the loop to check it fully */
+                        if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
+                            break;
+                        }
  
-                    locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
+                        locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
  
-                } while (locinput <= ST.maxpos);
+                    } while (locinput <= ST.maxpos);
                  }
  
                 if (locinput > ST.maxpos)
@@ -9279,7 +9296,7 @@ NULL
  
                  n = (utf8_target)
                      ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
-                    : locinput - ST.oldloc;
+                    : (STRLEN) (locinput - ST.oldloc);
  
  
                  /* Here is at the beginning of a character that meets the mask
@@ -10014,7 +10031,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      if (definitive_len == 1) {
                          const char * orig_scan = scan;
  
-                        this_eol = MIN(this_eol, scan + max - hardcount);
+                        if (this_eol - (scan - hardcount) > max) {
+                            this_eol = scan - hardcount + max;
+                        }
  
                          /* Use different routines depending on whether it's an
                           * exact match or matches with a mask */