This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Bump Data::Dumper version
[perl5.git] / regexec.c
index 533c0df..75d58ce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -425,10 +425,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
 
 #define regcpblow(cp) LEAVE_SCOPE(cp)  /* Ignores regcppush()ed data. */
 
-#ifndef PERL_IN_XSUB_RE
-
-bool
-Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
+STATIC bool
+S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 {
     /* Returns a boolean as to whether or not 'character' is a member of the
      * Posix character class given by 'classnum' that should be equivalent to a
@@ -468,8 +466,6 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
     return FALSE;
 }
 
-#endif
-
 PERL_STATIC_INLINE I32
 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
 {
@@ -1412,7 +1408,7 @@ Perl_re_intuit_start(pTHX_
          * On the one hand you'd expect rare substrings to appear less
          * often than \n's. On the other hand, searching for \n means
          * we're effectively flipping between check_substr and "\n" on each
-         * iteration as the current "rarest" string candidate, which
+         * iteration as the current "rarest" candidate string, which
          * means for example that we'll quickly reject the whole string if
          * hasn't got a \n, rather than trying every substr position
          * first
@@ -4515,6 +4511,9 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
      * string can match, it returns FALSE; otherwise TRUE.  (The FALSE
      * situation occurs if the first character in <text_node> requires UTF-8 to
      * represent, and the target string isn't in UTF-8.)
+     *
+     * Some analysis is in GH #18414, located at the time of this writing at:
+     * https://github.com/Perl/perl5/issues/18414
      */
 
     const bool utf8_target = reginfo->is_utf8_target;
@@ -4524,7 +4523,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
 
     /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
      */
-    U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { 0 };
+    U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
     U8 lengths[MAX_MATCHES] = { 0 };
 
     U8 index_of_longest = 0;
@@ -4691,23 +4690,36 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
      *
      * Everything generally matches at least itself.  But if there is a
      * UTF8ness mismatch, we have to convert to that of the target string. */
-    if (utf8_pat == utf8_target || UTF8_IS_INVARIANT(*pat)) {
-        lengths[0] = MIN(pat_len, C_ARRAY_LENGTH(matches[0]));
-        Copy(pat, matches[0], lengths[0], U8);
+    if (UTF8_IS_INVARIANT(*pat)) {  /* Immaterial if either is in UTF-8 */
+        matches[0][0] = pat[0];
+        lengths[0] = 1;
         m->count++;
     }
-    else if (utf8_target) { /* target is UTF-8; pattern isn't */
-        matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
-        matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
-        lengths[0] = 2;
-        m->count++;
-    }
-    else { /* pattern is UTF-8, target isn't */
-        if (UTF8_IS_DOWNGRADEABLE_START(*pat)) {
-            matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
-            lengths[0] = 1;
+    else if (utf8_target) {
+        if (utf8_pat) {
+            lengths[0] = UTF8SKIP(pat);
+            Copy(pat, matches[0], lengths[0], U8);
             m->count++;
         }
+        else {  /* target is UTF-8, pattern isn't */
+            matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
+            matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
+            lengths[0] = 2;
+            m->count++;
+        }
+    }
+    else if (! utf8_pat) {  /* Neither is UTF-8 */
+        matches[0][0] = pat[0];
+        lengths[0] = 1;
+        m->count++;
+    }
+    else     /* target isn't UTF-8; pattern is.  No match possible unless the
+                pattern's first character can fit in a byte */
+         if (UTF8_IS_DOWNGRADEABLE_START(*pat))
+    {
+        matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
+        lengths[0] = 1;
+        m->count++;
     }
 
     /* Here we have taken care of any necessary node-type changes */
@@ -4808,7 +4820,8 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
                 fold_from = remaining_fold_froms[i-1];
             }
 
-            if (folded == fold_from) {  /* We already added the character itself */
+            if (folded == fold_from) {  /* We already added the character
+                                           itself */
                 continue;
             }
 
@@ -4845,8 +4858,8 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
                 lengths[m->count] = UVCHR_SKIP(fold_from);
                 m->count++;
             }
-            else { /* Non-UTF8 target: any code point above 255
-                      can't appear in it */
+            else { /* Non-UTF8 target: no code point above 255 can appear in it
+                    */
                 if (fold_from > 255) {
                     continue;
                 }
@@ -4969,7 +4982,10 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
         if (m->count > 1) { /* No need to sort a single entry */
             for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
 
-                /* Keep the same order for all but the longest */
+                /* Keep the same order for all but the longest.  (If the
+                 * asserts fail, it could be because m->matches is declared too
+                 * short, either because of a new Unicode release, or an
+                 * overlooked test case, or it could be a bug.) */
                 if (i != index_of_longest) {
                     assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
                     Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
@@ -4987,6 +5003,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
         m->lengths[output_index] = lengths[index_of_longest];
     }
 
+
     return TRUE;
 }
 
@@ -9218,60 +9235,60 @@ NULL
               curly_try_B_min_known:
                 /* find the next place where 'B' could work, then call B */
                 if (locinput + ST.Binfo.initial_exact < loceol) {
-                if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
-
-                    /* Here, the mask is all 1's for the entire length of
-                        * any possible match.  (That actually means that there
-                        * is only one possible match.)  Look for the next
-                        * occurrence */
-                    locinput = ninstr(locinput, loceol,
-                                    (char *) ST.Binfo.matches,
-                                    (char *) ST.Binfo.matches
-                                                + ST.Binfo.initial_exact);
-                    if (locinput == NULL) {
-                        sayNO;
-                    }
-                }
-                else do {
-                    /* If the first byte(s) of the mask are all ones, it
-                        * means those bytes must match identically, so can use
-                        * ninstr() to find the next possible matchpoint */
-                    if (ST.Binfo.initial_exact > 0) {
+                    if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
+
+                        /* Here, the mask is all 1's for the entire length of
+                         * any possible match.  (That actually means that there
+                         * is only one possible match.)  Look for the next
+                         * occurrence */
                         locinput = ninstr(locinput, loceol,
-                                            (char *) ST.Binfo.matches,
-                                            (char *) ST.Binfo.matches
+                                        (char *) ST.Binfo.matches,
+                                        (char *) ST.Binfo.matches
                                                     + ST.Binfo.initial_exact);
+                        if (locinput == NULL) {
+                            sayNO;
+                        }
                     }
-                    else { /* Otherwise find the next byte that matches,
-                                masked */
-                        locinput = (char *) find_next_masked(
-                                            (U8 *) locinput, (U8 *) loceol,
-                                            ST.Binfo.first_byte_anded,
-                                            ST.Binfo.first_byte_mask);
-                        /* Advance to the end of a multi-byte character */
-                        if (utf8_target) {
-                            while (   locinput < loceol
-                                && UTF8_IS_CONTINUATION(*locinput))
-                            {
-                                locinput++;
+                    else do {
+                        /* If the first byte(s) of the mask are all ones, it
+                         * means those bytes must match identically, so can use
+                         * ninstr() to find the next possible matchpoint */
+                        if (ST.Binfo.initial_exact > 0) {
+                            locinput = ninstr(locinput, loceol,
+                                              (char *) ST.Binfo.matches,
+                                              (char *) ST.Binfo.matches
+                                                     + ST.Binfo.initial_exact);
+                        }
+                        else { /* Otherwise find the next byte that matches,
+                                  masked */
+                            locinput = (char *) find_next_masked(
+                                                (U8 *) locinput, (U8 *) loceol,
+                                                ST.Binfo.first_byte_anded,
+                                                ST.Binfo.first_byte_mask);
+                            /* Advance to the end of a multi-byte character */
+                            if (utf8_target) {
+                                while (   locinput < loceol
+                                    && UTF8_IS_CONTINUATION(*locinput))
+                                {
+                                    locinput++;
+                                }
                             }
                         }
-                    }
-                    if (   locinput == NULL
-                        || locinput + ST.Binfo.min_length > loceol)
-                    {
-                        sayNO;
-                    }
+                        if (   locinput == NULL
+                            || locinput + ST.Binfo.min_length > loceol)
+                        {
+                            sayNO;
+                        }
 
-                    /* Here, we have found a possible match point; if can't
-                        * rule it out, quit the loop so can check fully */
-                    if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
-                        break;
-                    }
+                        /* Here, we have found a possible match point; if can't
+                         * rule it out, quit the loop so can check fully */
+                        if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
+                            break;
+                        }
 
-                    locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
+                        locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
 
-                } while (locinput <= ST.maxpos);
+                    } while (locinput <= ST.maxpos);
                 }
 
                if (locinput > ST.maxpos)
@@ -9279,7 +9296,7 @@ NULL
 
                 n = (utf8_target)
                     ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
-                    : locinput - ST.oldloc;
+                    : (STRLEN) (locinput - ST.oldloc);
 
 
                 /* Here is at the beginning of a character that meets the mask
@@ -10014,7 +10031,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     if (definitive_len == 1) {
                         const char * orig_scan = scan;
 
-                        this_eol = MIN(this_eol, scan + max - hardcount);
+                        if (this_eol - (scan - hardcount) > max) {
+                            this_eol = scan - hardcount + max;
+                        }
 
                         /* Use different routines depending on whether it's an
                          * exact match or matches with a mask */