+
+regmatch() - main matching routine
+
+This is basically one big switch statement in a loop. We execute an op,
+set 'next' to point to the next op, and continue. If we come to a point which
+we may need to backtrack to on failure such as (A|B|C), we push a
+backtrack state onto the backtrack stack. On failure, we pop the top
+state, and re-enter the loop at the state indicated. If there are no more
+states to pop, we return failure.
+
+Sometimes we also need to backtrack on success; for example /A+/, where
+after successfully matching one A, we need to go back and try to
+match another one; similarly for lookahead assertions: if the assertion
+completes successfully, we backtrack to the state just before the assertion
+and then carry on. In these cases, the pushed state is marked as
+'backtrack on success too'. This marking is in fact done by a chain of
+pointers, each pointing to the previous 'yes' state. On success, we pop to
+the nearest yes state, discarding any intermediate failure-only states.
+Sometimes a yes state is pushed just to force some cleanup code to be
+called at the end of a successful match or submatch; e.g. (??{$re}) uses
+it to free the inner regex.
+
+Note that failure backtracking rewinds the cursor position, while
+success backtracking leaves it alone.
+
+A pattern is complete when the END op is executed, while a subpattern
+such as (?=foo) is complete when the SUCCESS op is executed. Both of these
+ops trigger the "pop to last yes state if any, otherwise return true"
+behaviour.
+
+A common convention in this function is to use A and B to refer to the two
+subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
+the subpattern to be matched possibly multiple times, while B is the entire
+rest of the pattern. Variable and state names reflect this convention.
+
+The states in the main switch are the union of ops and failure/success of
+substates associated with that op. For example, IFMATCH is the op
+that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
+'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
+successfully matched A and IFMATCH_A_fail is a state saying that we have
+just failed to match A. Resume states always come in pairs. The backtrack
+state we push is marked as 'IFMATCH_A', but when that is popped, we resume
+at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
+on success or failure.
+
+The struct that holds a backtracking state is actually a big union, with
+one variant for each major type of op. The variable st points to the
+top-most backtrack struct. To make the code clearer, within each
+block of code we #define ST to alias the relevant union.
+
+Here's a concrete example of a (vastly oversimplified) IFMATCH
+implementation:
+
+ switch (state) {
+ ....
+
+#define ST st->u.ifmatch
+
+ case IFMATCH: // we are executing the IFMATCH op, (?=A)B
+ ST.foo = ...; // some state we wish to save
+ ...
+ // push a yes backtrack state with a resume value of
+ // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
+ // first node of A:
+ PUSH_YES_STATE_GOTO(IFMATCH_A, A);
+ // NOTREACHED
+
+ case IFMATCH_A: // we have successfully executed A; now continue with B
+ next = B;
+ bar = ST.foo; // do something with the preserved value
+ break;
+
+ case IFMATCH_A_fail: // A failed, so the assertion failed
+ ...; // do some housekeeping, then ...
+ sayNO; // propagate the failure
+
+#undef ST
+
+ ...
+ }
+
+For any old-timers reading this who are familiar with the old recursive
+approach, the code above is equivalent to:
+
+ case IFMATCH: // we are executing the IFMATCH op, (?=A)B
+ {
+ int foo = ...
+ ...
+ if (regmatch(A)) {
+ next = B;
+ bar = foo;
+ break;
+ }
+ ...; // do some housekeeping, then ...
+ sayNO; // propagate the failure
+ }
+
+The topmost backtrack state, pointed to by st, is usually free. If you
+want to claim it, populate any ST.foo fields in it with values you wish to
+save, then do one of
+
+ PUSH_STATE_GOTO(resume_state, node);
+ PUSH_YES_STATE_GOTO(resume_state, node);
+
+which sets that backtrack state's resume value to 'resume_state', pushes a
+new free entry to the top of the backtrack stack, then goes to 'node'.
+On backtracking, the free slot is popped, and the saved state becomes the
+new free state. An ST.foo field in this new top state can be temporarily
+accessed to retrieve values, but once the main loop is re-entered, it
+becomes available for reuse.
+
+Note that the depth of the backtrack stack constantly increases during the
+left-to-right execution of the pattern, rather than going up and down with
+the pattern nesting. For example the stack is at its maximum at Z at the
+end of the pattern, rather than at X in the following:
+
+ /(((X)+)+)+....(Y)+....Z/
+
+The only exceptions to this are lookahead/behind assertions and the cut,
+(?>A), which pop all the backtrack states associated with A before
+continuing.
+
+Backtrack state structs are allocated in slabs of about 4K in size.
+PL_regmatch_state and st always point to the currently active state,
+and PL_regmatch_slab points to the slab currently containing
+PL_regmatch_state. The first time regmatch() is called, the first slab is
+allocated, and is never freed until interpreter destruction. When the slab
+is full, a new one is allocated and chained to the end. At exit from
+regmatch(), slabs allocated since entry are freed.
+
+*/
+
+/* DEBUG_STATE_pp(pp) - trace one backtrack-state event.
+ *
+ * 'pp' must be a string literal (or expand to one): it is placed between
+ * two adjacent string literals so that it becomes part of the printf
+ * format. Inside DEBUG_STATE_r() the macro first invokes
+ * DUMP_EXEC_POS(locinput, scan, do_utf8), then prints an empty string
+ * padded to a field width of depth*2, the 'pp' text, and
+ * reg_name[st->resume_state], followed by a newline.
+ *
+ * locinput, scan, do_utf8, depth and st are not parameters; they are
+ * resolved in whatever scope the macro is expanded in.
+ *
+ * NOTE(review): the expansion already ends in ';', so writing
+ * "DEBUG_STATE_pp(x);" produces an extra empty statement - take care when
+ * using it as the body of an unbraced if/else.
+ */
+#define DEBUG_STATE_pp(pp) \
+ DEBUG_STATE_r({ \
+ DUMP_EXEC_POS(locinput, scan, do_utf8); \
+ PerlIO_printf(Perl_debug_log, \
+ " %*s"pp" %s\n", \
+ depth*2, "", \
+ reg_name[st->resume_state] ); \
+ });
+
+/* REG_NODE_NUM(x) - for a non-null pointer x, the pointer difference
+ * x - prog converted to int; for a null x, -1.
+ * 'prog' is not a parameter: it is whatever that name refers to at the
+ * point of expansion. Note that x is evaluated twice when it is non-null.
+ */
+#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
+
+#ifdef DEBUGGING
+
+/*
+ * S_debug_start_match() - announce the start of a match attempt on the
+ * debug log (this code sits inside #ifdef DEBUGGING).
+ *
+ *   prog     compiled regexp; its precomp/prelen text is displayed and its
+ *            ROPT_UTF8 flag in reganch marks the pattern as UTF-8
+ *   do_utf8  true when the subject string is UTF-8
+ *   start    first byte of the subject text to display
+ *   end      end of the subject text; the displayed length is end - start
+ *   blurb    leading text of the message
+ *
+ * Prints "<blurb> REx <pattern> against <string>" and, when the pattern
+ * and/or the string is UTF-8, a second line saying which. The value 60 is
+ * handed to RE_PV_QUOTED_DECL for both texts (presumably a display-length
+ * cap - the macro is defined elsewhere).
+ */
+STATIC void
+S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
+    const char *start, const char *end, const char *blurb)
+{
+    const bool utf8_pat = (prog->reganch & ROPT_UTF8) ? 1 : 0;
+
+    /* reginitcolors() is only called while PL_colorset is still false. */
+    if (!PL_colorset) {
+        reginitcolors();
+    }
+
+    {
+        /* Each text gets its own debug pad (0 and 1). */
+        RE_PV_QUOTED_DECL(quoted_pat, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
+            prog->precomp, prog->prelen, 60);
+
+        RE_PV_QUOTED_DECL(quoted_str, do_utf8, PERL_DEBUG_PAD_ZERO(1),
+            start, end - start, 60);
+
+        PerlIO_printf(Perl_debug_log,
+            "%s%s REx%s %s against %s\n",
+            PL_colors[4], blurb, PL_colors[5], quoted_pat, quoted_str);
+
+        if (utf8_pat || do_utf8) {
+            /* Build "pattern", "string" or "pattern and string". */
+            const char *pat_word = utf8_pat ? "pattern" : "";
+            const char *joiner   = (utf8_pat && do_utf8) ? " and " : "";
+            const char *str_word = do_utf8 ? "string" : "";
+
+            PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
+                pat_word, joiner, str_word);
+        }
+    }
+}
+/* S_dump_exec_pos() - debug helper that prints the current offset into the
+ * string plus a short window of text on either side of locinput.
+ *
+ * Parameters (described by how they are used below):
+ *   locinput          current position; text before it is the "prefix",
+ *                     text from it onwards is the "tail"
+ *   scan              node being executed; only OP(scan) is read, to clear
+ *                     is_uni when the op is CANY
+ *   loc_regeol        limit of the tail (loc_regeol - locinput bytes)
+ *   loc_bostr         origin of the printed offset (locinput - loc_bostr)
+ *                     and initial cap on the prefix length
+ *   loc_reg_starttry  splits the prefix into two differently coloured parts
+ *   do_utf8           when true, window edges are moved off UTF-8
+ *                     continuation bytes
+ *
+ * Output goes to Perl_debug_log with no trailing newline: the offset in a
+ * 4-wide field, '<', the two prefix parts, a literal "> <" separator when
+ * no colours are configured, the tail, '>', padding, and finally '|'.
+ * Nothing is returned.
+ */
+STATIC void
+S_dump_exec_pos(pTHX_ const char *locinput,
+ const regnode *scan,
+ const char *loc_regeol,
+ const char *loc_bostr,
+ const char *loc_reg_starttry,
+ const bool do_utf8)
+{
+ const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; /* true if colour string 0, 2 or 4 is non-empty */
+ const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
+ int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); /* tail budget: min(taill, bytes left before loc_regeol) */
+ /* The part of the string before starttry has one color
+ (pref0_len chars), between starttry and current
+ position another one (pref_len - pref0_len chars),
+ after the current position the third one.
+ We assume that pref0_len <= pref_len, otherwise we
+ decrease pref0_len. */
+ int pref_len = (locinput - loc_bostr) > (5 + taill) - l
+ ? (5 + taill) - l : locinput - loc_bostr; /* prefix length: what the tail leaves of the (5 + taill) budget, capped at the distance back to loc_bostr */
+ int pref0_len;
+ /* Move the start of the prefix back while it sits on a UTF-8
+ continuation byte. NOTE(review): this loop has no check against
+ loc_bostr, so it relies on the bytes before locinput being
+ well-formed UTF-8 - confirm. */
+ while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
+ pref_len++;
+ pref0_len = pref_len - (locinput - loc_reg_starttry); /* may come out negative or larger than pref_len; clamped below */
+ if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) /* prefix came out short: give the unused room to the tail */
+ l = ( loc_regeol - locinput > (5 + taill) - pref_len
+ ? (5 + taill) - pref_len : loc_regeol - locinput);
+ while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) /* pull the tail end back off a continuation byte */
+ l--; /* NOTE(review): when l == loc_regeol - locinput the test above reads
+ the byte at loc_regeol itself, and l has no lower bound here - confirm
+ both are safe for the callers' buffers. */
+ if (pref0_len < 0)
+ pref0_len = 0;
+ if (pref0_len > pref_len)
+ pref0_len = pref_len;
+ {
+ const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; /* is_uni is off for CANY nodes even when do_utf8 is set */
+ /* RE_PV_COLOR_DECL is defined elsewhere; judging by the uses in the
+ printf below, its first two arguments name the string and length
+ variables it declares - confirm. s0/len0: prefix part before starttry. */
+ RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
+ (locinput - pref_len),pref0_len, 60, 4, 5);
+ /* s1/len1: prefix part from starttry up to locinput. */
+ RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
+ (locinput - pref_len + pref0_len),
+ pref_len - pref0_len, 60, 2, 3);
+ /* s2/len2: the tail. The whole remainder (loc_regeol - locinput) is
+ passed with 10 as the next argument; l is not used here, it only
+ influenced the prefix sizing above. */
+ RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
+ locinput, loc_regeol - locinput, 10, 0, 1);
+
+ const STRLEN tlen=len0+len1+len2; /* combined length of the three pieces */
+ PerlIO_printf(Perl_debug_log,
+ "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
+ (IV)(locinput - loc_bostr),
+ len0, s0,
+ len1, s1,
+ (docolor ? "" : "> <"),
+ len2, s2,
+ (int)(tlen > 19 ? 0 : 19 - tlen), /* pad so that pieces plus padding fill at least 19 columns */
+ "");
+ }
+}
+
+#endif
+
+/* reg_check_named_buff_matched()
+ * Checks to see if a named buffer has matched. The data array of
+ * buffer numbers corresponding to the buffer is expected to reside
+ * in the regexp->data->data array in the slot stored in the ARG() of
+ * node involved. Note that this routine doesn't actually care about the
+ * name, that information is not preserved from compilation to execution.
+ * Returns the index of the leftmost defined buffer with the given name
+ * or 0 if none of the buffers matched.