This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Correct typo reported by Elizabeth Mattijsen
[perl5.git] / regexec.c
index 45084be..16a2309 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -328,7 +328,34 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
     );                                                          \
     regcpblow(cp)
 
+/* set the start and end positions of capture ix */
+#define CLOSE_CAPTURE(ix, s, e)                                            \
+    rex->offs[ix].start = s;                                               \
+    rex->offs[ix].end = e;                                                 \
+    if (ix > rex->lastparen)                                               \
+        rex->lastparen = ix;                                               \
+    rex->lastcloseparen = ix;                                              \
+    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
+        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
+        depth,                                                             \
+        PTR2UV(rex),                                                       \
+        PTR2UV(rex->offs),                                                 \
+        (UV)ix,                                                            \
+        (IV)rex->offs[ix].start,                                           \
+        (IV)rex->offs[ix].end,                                             \
+        (UV)rex->lastparen                                                 \
+    ))
+
 #define UNWIND_PAREN(lp, lcp)               \
+    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_  \
+        "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
+        depth,                              \
+        PTR2UV(rex),                        \
+        PTR2UV(rex->offs),                  \
+        (UV)(lp),                           \
+        (UV)(rex->lastparen),               \
+        (UV)(lcp)                           \
+    ));                                     \
     for (n = rex->lastparen; n > lp; n--)   \
         rex->offs[n].end = -1;              \
     rex->lastparen = n;                     \
@@ -4699,6 +4726,24 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb
                 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
             }
 
+        case GCB_Maybe_Emoji_NonBreak:
+
+            {
+
+            /* Do not break within emoji modifier sequences or emoji zwj sequences.
+              GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
+              */
+                U8 * temp_pos = (U8 *) curpos;
+                GCB_enum prev;
+
+                do {
+                    prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
+                }
+                while (prev == GCB_Extend);
+
+                return prev != GCB_XPG_XX;
+            }
+
         default:
             break;
     }
@@ -7566,26 +7611,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             script_run_begin = (U8 *) locinput;
             break;
 
-/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE                                                      \
-    rex->offs[n].start = rex->offs[n].start_tmp;                           \
-    rex->offs[n].end = locinput - reginfo->strbeg;                         \
-    DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_                            \
-        "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
-        depth,                                                             \
-        PTR2UV(rex),                                                       \
-        PTR2UV(rex->offs),                                                 \
-        (UV)n,                                                             \
-        (IV)rex->offs[n].start,                                            \
-        (IV)rex->offs[n].end                                               \
-    ))
 
        case CLOSE:  /*  )  */
            n = ARG(scan);  /* which paren pair */
-           CLOSE_CAPTURE;
-           if (n > rex->lastparen)
-               rex->lastparen = n;
-           rex->lastcloseparen = n;
+           CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+                             locinput - reginfo->strbeg);
             if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
                goto fake_end;
 
@@ -7613,10 +7643,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     if ( OP(cursor)==CLOSE ){
                         n = ARG(cursor);
                         if ( n <= lastopen ) {
-                           CLOSE_CAPTURE;
-                            if (n > rex->lastparen)
-                                rex->lastparen = n;
-                            rex->lastcloseparen = n;
+                           CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+                                             locinput - reginfo->strbeg);
                             if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
                                 break;
                         }
@@ -8242,14 +8270,11 @@ NULL
 
            if (ST.me->flags) {
                /* emulate CLOSE: mark current A as captured */
-               I32 paren = ST.me->flags;
+               U32 paren = (U32)ST.me->flags;
                if (ST.count) {
-                   rex->offs[paren].start
-                       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
-                   rex->offs[paren].end = locinput - reginfo->strbeg;
-                   if ((U32)paren > rex->lastparen)
-                       rex->lastparen = paren;
-                   rex->lastcloseparen = paren;
+                    CLOSE_CAPTURE(paren,
+                       HOPc(locinput, -ST.alen) - reginfo->strbeg,
+                       locinput - reginfo->strbeg);
                }
                else
                    rex->offs[paren].end = -1;
@@ -8288,11 +8313,8 @@ NULL
 #define CURLY_SETPAREN(paren, success) \
     if (paren) { \
        if (success) { \
-           rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
-           rex->offs[paren].end = locinput - reginfo->strbeg; \
-           if (paren > rex->lastparen) \
-               rex->lastparen = paren; \
-           rex->lastcloseparen = paren; \
+            CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
+                                locinput - reginfo->strbeg); \
        } \
        else { \
            rex->offs[paren].end = -1; \
@@ -8323,12 +8345,18 @@ NULL
                maxopenparen = ST.paren;
            ST.min = ARG1(scan);  /* min to match */
            ST.max = ARG2(scan);  /* max to match */
+            scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
+            /* handle the single-char capture called as a GOSUB etc */
             if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
             {
-               ST.min=1;
-               ST.max=1;
+                char *li = locinput;
+                if (!regrepeat(rex, &li, scan, reginfo, 1))
+                   sayNO;
+                SET_locinput(li);
+                goto fake_end;
            }
-            scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
            goto repeat;
 
        case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
@@ -8444,24 +8472,41 @@ NULL
            }
            NOT_REACHED; /* NOTREACHED */
 
-       case CURLY_B_min_known_fail:
-           /* failed to find B in a non-greedy match where c1,c2 valid */
+       case CURLY_B_min_fail:
+           /* failed to find B in a non-greedy match.
+             * Handles both cases where c1,c2 valid or not */
 
            REGCP_UNWIND(ST.cp);
             if (ST.paren) {
                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
             }
-           /* Couldn't or didn't -- move forward. */
-           ST.oldloc = locinput;
-           if (utf8_target)
-               locinput += UTF8SKIP(locinput);
-           else
-               locinput++;
-           ST.count++;
-         curly_try_B_min_known:
-            /* find the next place where 'B' could work, then call B */
-           {
+
+            if (ST.c1 == CHRTEST_VOID) {
+                /* failed -- move forward one */
+                char *li = locinput;
+                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+                    sayNO;
+                }
+                locinput = li;
+                ST.count++;
+               if (!(   ST.count <= ST.max
+                        /* count overflow ? */
+                     || (ST.max == REG_INFTY && ST.count > 0))
+                )
+                    sayNO;
+            }
+            else {
                int n;
+                /* Couldn't or didn't -- move forward. */
+                ST.oldloc = locinput;
+                if (utf8_target)
+                    locinput += UTF8SKIP(locinput);
+                else
+                    locinput++;
+                ST.count++;
+
+              curly_try_B_min_known:
+                /* find the next place where 'B' could work, then call B */
                if (utf8_target) {
                    n = (ST.oldloc == locinput) ? 0 : 1;
                    if (ST.c1 == ST.c2) {
@@ -8540,47 +8585,16 @@ NULL
                        sayNO;
                     assert(n == REG_INFTY || locinput == li);
                }
-               CURLY_SETPAREN(ST.paren, ST.count);
-                if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                   goto fake_end;
-               PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
            }
-           NOT_REACHED; /* NOTREACHED */
 
-       case CURLY_B_min_fail:
-           /* failed to find B in a non-greedy match where c1,c2 invalid */
-
-           REGCP_UNWIND(ST.cp);
-            if (ST.paren) {
-                UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
-            }
-           /* failed -- move forward one */
-            {
-                char *li = locinput;
-                if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
-                    sayNO;
-                }
-                locinput = li;
-            }
-            {
-               ST.count++;
-               if (ST.count <= ST.max || (ST.max == REG_INFTY &&
-                       ST.count > 0)) /* count overflow ? */
-               {
-                 curly_try_B_min:
-                   CURLY_SETPAREN(ST.paren, ST.count);
-                    if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                        goto fake_end;
-                   PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
-               }
-           }
-            sayNO;
+          curly_try_B_min:
+            CURLY_SETPAREN(ST.paren, ST.count);
+            PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
            NOT_REACHED; /* NOTREACHED */
 
+
           curly_try_B_max:
            /* a successful greedy match: now try to match B */
-            if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
-                goto fake_end;
            {
                bool could_match = locinput < reginfo->strend;
 
@@ -10291,6 +10305,8 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
 
     /* What code point is the digit '0' of the script run? */
     UV zero_of_run = 0;
+#define SEEN_A_DIGIT (zero_of_run != 0)
+
     SCX_enum script_of_run  = SCX_INVALID;   /* Illegal value */
     SCX_enum script_of_char = SCX_INVALID;
 
@@ -10342,7 +10358,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
                 retval = FALSE;
                 break;
             }
-            if (zero_of_run > 0) {
+            if (SEEN_A_DIGIT) {
                 if (zero_of_run != '0') {
                     retval = FALSE;
                     break;
@@ -10368,7 +10384,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
         /* If is within the range [+0 .. +9] of the script's zero, it also is a
          * digit in that script.  We can skip the rest of this code for this
          * character. */
-        if (UNLIKELY(   zero_of_run > 0
+        if (UNLIKELY(   SEEN_A_DIGIT
                      && cp >= zero_of_run
                      && cp - zero_of_run <= 9))
         {
@@ -10433,7 +10449,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
 
             /* But Common contains several sets of digits.  Only the '0' set
              * can be part of another script. */
-            if (zero_of_run > 0 && zero_of_run != '0') {
+            if (SEEN_A_DIGIT && zero_of_run != '0') {
                 retval = FALSE;
                 break;
             }
@@ -10447,7 +10463,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
          * one to use, and the character is in that sequence.  Now that we know
          * the script, we can use script_zeros[] to directly find which
          * sequence the script uses, except in a few cases it returns 0 */
-        if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+        if (UNLIKELY(zero_of_run == 0 && script_of_char >= 0)) {
             zero_of_run = script_zeros[script_of_char];
         }
 
@@ -10606,23 +10622,39 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
   scripts_match:
 
         /* Here, the script of the character is compatible with that of the
-         * run.  Either they match exactly, or one or both can be any of
-         * several scripts, and the intersection is not empty.  If the
-         * character is not a decimal digit, we are done with it.  Otherwise,
-         * it could still fail if it is from a different set of 10 than seen
-         * already (or we may not have seen any, and we need to set the
-         * sequence).  If we have determined a single script and that script
-         * only has one set of digits (almost all scripts are like that), then
-         * this isn't a problem, as any digit must come from the same sequence.
-         * The only scripts that have multiple sequences have been constructed
-         * to be 0 in 'script_zeros[]'.
+         * run.  That means that in most cases, it continues the script run.
+         * Either it and the run match exactly, or one or both can be in any of
+         * several scripts, and the intersection is not empty.  But if the
+         * character is a decimal digit, we need further handling.  If we
+         * haven't seen a digit before, it would establish what set of 10 all
+         * must come from; and if we have established a set, we need to check
+         * that this is in it.
          *
-         * Here we check if it is a digit. */
+         * But there are cases we can rule out without having to look up if
+         * this is a digit:
+         *   a.  All instances of [0-9] have been dealt with earlier.
+         *   b.  The next digit encoded by Unicode is 1600 code points further
+         *       on, so if the code point in this loop iteration is less than
+         *       that, it isn't a digit.
+         *   c.  Most scripts that have digits have a single set of 10.  If
+         *       we've encountered a digit in such a script, 'zero_of_run' is
+         *       set to the code point (call it z) whose numeric value is 0.
+         *       If the code point in this loop iteration is in the range
+         *       z..z+9, it is in the script's set of 10, and we've actually
+         *       handled it earlier in this function and won't reach this
+         *       point.  But, code points in that script that aren't in that
+         *       range can't be digits, so we don't have to look any such up.
+         *       We can tell if this script is such a one by looking at
+         *       'script_zeros[]' for it.  It is non-zero iff it has a single
+         *       set of digits.  This rule doesn't apply if we haven't narrowed
+         *       down the possible scripts to a single one yet.  Nor if the
+         *       zero of the run is '0', as that also hasn't narrowed things
+         *       down completely */
         if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
-            && (   (          zero_of_run == 0
-                    || (  (   script_of_char >= 0
-                           && script_zeros[script_of_char] == 0)
-                        ||    intersection))))
+            && (   intersection
+                || script_of_char < 0   /* Also implies an intersection */
+                || zero_of_run == '0'
+                || script_zeros[script_of_char] == 0))
         {
             SSize_t range_zero_index;
             range_zero_index = _invlist_search(decimals_invlist, cp);
@@ -10630,7 +10662,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
                 && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
             {
                 UV range_zero = decimals_array[range_zero_index];
-                if (zero_of_run) {
+                if (SEEN_A_DIGIT) {
                     if (zero_of_run != range_zero) {
                         retval = FALSE;
                         break;