Implement variable length lookbehind in regex patterns

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index f79e157..275945c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -74,10 +74,6 @@
  #define PERL_IN_REGCOMP_C
  #include "perl.h"
  
-#ifndef PERL_IN_XSUB_RE
-#  include "INTERN.h"
-#endif
-
  #define REG_COMP_C
  #ifdef PERL_IN_XSUB_RE
  #  include "re_comp.h"
@@ -167,6 +163,7 @@ struct RExC_state_t {
      I32                seen_zerolen;
      regnode_offset *open_parens;       /* offsets to open parens */
      regnode_offset *close_parens;      /* offsets to close parens */
+    I32      parens_buf_size;           /* #slots malloced open/close_parens */
      regnode     *end_op;                /* END node in program */
      I32                utf8;           /* whether the pattern is utf8 or not */
      I32                orig_utf8;      /* whether the pattern was originally in utf8 */
@@ -197,6 +194,7 @@ struct RExC_state_t {
      scan_frame *frame_last;
      U32         frame_count;
      AV         *warn_text;
+    HV         *unlexed_names;
  #ifdef ADD_TO_REGEXEC
      char       *starttry;              /* -Dr: where regtry was called. */
  #define RExC_starttry  (pRExC_state->starttry)
@@ -256,6 +254,7 @@ struct RExC_state_t {
  #define RExC_maxlen        (pRExC_state->maxlen)
  #define RExC_npar      (pRExC_state->npar)
  #define RExC_total_parens      (pRExC_state->total_par)
+#define RExC_parens_buf_size   (pRExC_state->parens_buf_size)
  #define RExC_nestroot   (pRExC_state->nestroot)
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_utf8      (pRExC_state->utf8)
@@ -284,6 +283,7 @@ struct RExC_state_t {
  #define RExC_warn_text (pRExC_state->warn_text)
  #define RExC_in_script_run      (pRExC_state->in_script_run)
  #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
+#define RExC_unlexed_names (pRExC_state->unlexed_names)
  
  /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
   * a flag to disable back-off on the fixed/floating substrings - if it's
@@ -355,7 +355,7 @@ struct RExC_state_t {
              if (DEPENDS_SEMANTICS) {                                        \
                  set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
                  RExC_uni_semantics = 1;                                     \
-                if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) {     \
+                if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
                      /* No need to restart the parse if we haven't seen      \
                       * anything that differs between /u and /d, and no need \
                       * to restart immediately if we're going to reparse     \
@@ -366,11 +366,10 @@ struct RExC_state_t {
              }                                                               \
      } STMT_END
  
-#define BRANCH_MAX_OFFSET   U16_MAX
  #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
      STMT_START {                                                            \
                  RExC_use_BRANCHJ = 1;                                       \
-                if (LIKELY(RExC_total_parens >= 0)) {                       \
+                if (LIKELY(! IN_PARENS_PASS)) {                             \
                      /* No need to restart the parse immediately if we're    \
                       * going to reparse anyway to count parens */           \
                      *flagp |= RESTART_PARSE;                                \
@@ -378,10 +377,19 @@ struct RExC_state_t {
                  }                                                           \
      } STMT_END
  
+/* Until we have completed the parse, we leave RExC_total_parens at 0 or
+ * less.  After that, it must always be positive, because the whole re is
+ * considered to be surrounded by virtual parens.  Setting it to negative
+ * indicates there is some construct that needs to know the actual number of
+ * parens to be properly handled.  And that means an extra pass will be
+ * required after we've counted them all */
+#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
  #define REQUIRE_PARENS_PASS                                                 \
-    STMT_START {                                                            \
-                    if (RExC_total_parens == 0) RExC_total_parens = -1;     \
+    STMT_START {  /* No-op if have completed a pass */                      \
+                    if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
      } STMT_END
+#define IN_PARENS_PASS (RExC_total_parens < 0)
+
  
  /* This is used to return failure (zero) early from the calling function if
   * various flags in 'flags' are set.  Two flags always cause a return:
@@ -2694,7 +2702,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
          trie_words = newAV();
      });
  
-    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
      assert(re_trie_maxbuff);
      if (!SvIOK(re_trie_maxbuff)) {
          sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
@@ -5978,14 +5986,27 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                        last, &data_fake, stopparen,
                                        recursed_depth, NULL, f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (minnext > (I32)U8_MAX) {
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || minnext > (I32)U8_MAX
+                        || minnext + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)minnext;
+
+                    /* The 'next_off' field has been repurposed to count the
+                     * additional starting positions to try beyond the initial
+                     * one.  (This leaves it at 0 for non-variable length
+                     * matches to avoid breakage for those not using this
+                     * extension) */
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                        ckWARNexperimental(RExC_parse,
+                            WARN_EXPERIMENTAL__VLB,
+                            "Variable length lookbehind is experimental");
+                    }
+                    scan->flags = (U8)minnext + deltanext;
                  }
                  if (data) {
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -6070,14 +6091,21 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                          stopparen, recursed_depth, NULL,
                                          f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (*minnextp > (I32)U8_MAX) {
+                    assert(0);  /* This code has never been tested since this
+                                   is normally not compiled */
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || *minnextp > (I32)U8_MAX
+                        || *minnextp + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)*minnextp;
+
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                    }
+                    scan->flags = (U8)*minnextp + deltanext;
                  }
  
                  *minnextp += min;
@@ -7361,6 +7389,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      }
  
      pRExC_state->warn_text = NULL;
+    pRExC_state->unlexed_names = NULL;
      pRExC_state->code_blocks = NULL;
  
      if (is_bare_re)
@@ -7659,6 +7688,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      RExC_naughty = 0;
      RExC_npar = 1;
+    RExC_parens_buf_size = 0;
      RExC_emit_start = RExC_rxi->program;
      pRExC_state->code_index = 0;
  
@@ -7668,9 +7698,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      /* Do the parse */
      if (reg(pRExC_state, 0, &flags, 1)) {
  
-        /* Success!, But if RExC_total_parens < 0, we need to redo the parse
-         * knowing how many parens there actually are */
-        if (RExC_total_parens < 0) {
+        /* Success!, But we may need to redo the parse knowing how many parens
+         * there actually are */
+        if (IN_PARENS_PASS) {
              flags |= RESTART_PARSE;
          }
  
@@ -7712,7 +7742,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
              DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
          }
  
-        if (RExC_total_parens > 0) {
+        if (ALL_PARENS_COUNTED) {
              /* Make enough room for all the known parens, and zero it */
              Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
              Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
@@ -8810,7 +8840,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
              /* It might be a forward reference; we can't fail until we
                  * know, by completing the parse to get all the groups, and
                  * then reparsing */
-            if (RExC_total_parens > 0)  {
+            if (ALL_PARENS_COUNTED)  {
                  vFAIL("Reference to nonexistent named group");
              }
              else {
@@ -11002,6 +11032,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      I32 freeze_paren = 0;
      I32 after_freeze = 0;
      I32 num; /* numeric backreferences */
+    SV * max_open;  /* Max number of unclosed parens */
  
      char * parse_start = RExC_parse; /* MJD */
      char * const oregcomp_parse = RExC_parse;
@@ -11011,6 +11042,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      PERL_ARGS_ASSERT_REG;
      DEBUG_PARSE("reg ");
  
+
+    max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
+    assert(max_open);
+    if (!SvIOK(max_open)) {
+        sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
+    }
+    if (depth > 4 * SvIV(max_open)) { /* We increase depth by 4 for each open
+                                         paren */
+        vFAIL("Too many nested open parens");
+    }
+
      *flagp = 0;                                /* Tentatively. */
  
      /* Having this true makes it feasible to have a lot fewer tests for the
@@ -11596,7 +11638,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          /* It might be a forward reference; we can't fail until
                           * we know, by completing the parse to get all the
                           * groups, and then reparsing */
-                        if (RExC_total_parens > 0)  {
+                        if (ALL_PARENS_COUNTED)  {
                              RExC_parse++;
                              vFAIL("Reference to nonexistent group");
                          }
@@ -11622,7 +11664,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                      /* It might be a forward reference; we can't fail until we
                       * know, by completing the parse to get all the groups, and
                       * then reparsing */
-                    if (RExC_total_parens > 0)  {
+                    if (ALL_PARENS_COUNTED)  {
                          if (num >= RExC_total_parens) {
                              RExC_parse++;
                              vFAIL("Reference to nonexistent group");
@@ -11953,34 +11995,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
           capturing_parens:
             parno = RExC_npar;
             RExC_npar++;
-            if (RExC_total_parens <= 0) {
+            if (! ALL_PARENS_COUNTED) {
                  /* If we are in our first pass through (and maybe only pass),
                   * we  need to allocate memory for the capturing parentheses
-                 * data structures.  Since we start at npar=1, when it reaches
-                 * 2, for the first time it has something to put in it.  Above
-                 * 2 means we extend what we already have */
-                if (RExC_npar == 2) {
+                 * data structures.
+                 */
+
+                if (!RExC_parens_buf_size) {
+                    /* first guess at number of parens we might encounter */
+                    RExC_parens_buf_size = 10;
+
                      /* setup RExC_open_parens, which holds the address of each
                       * OPEN tag, and to make things simpler for the 0 index the
                       * start of the program - this is used later for offsets */
-                    Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
  
                      /* setup RExC_close_parens, which holds the address of each
                       * CLOSE tag, and to make things simpler for the 0 index
                       * the end of the program - this is used later for offsets
                       * */
-                    Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      /* we dont know where end op starts yet, so we dont need to
                       * set RExC_close_parens[0] like we do RExC_open_parens[0]
                       * above */
                  }
-                else {
-                    Renew(RExC_open_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+                else if (RExC_npar > RExC_parens_buf_size) {
+                    I32 old_size = RExC_parens_buf_size;
+
+                    RExC_parens_buf_size *= 2;
  
-                    Renew(RExC_close_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+                    Renew(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_open_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
+
+                    Renew(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_close_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
                  }
              }
  
@@ -12062,7 +12117,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              RETURN_FAIL_ON_RESTART(flags, flagp);
              FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
          }
-        REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
+        if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
         lastbr = br;
         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
      }
@@ -12133,7 +12190,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                            (IV)(ender - lastbr)
              );
          );
-        REGTAIL(pRExC_state, lastbr, ender);
+        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
  
         if (have_branch) {
              char is_nothing= 1;
@@ -12144,9 +12203,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             for (br = REGNODE_p(ret); br; br = regnext(br)) {
                 const U8 op = PL_regkind[OP(br)];
                 if (op == BRANCH) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(br)),
-                                  ender);
+                    if (! REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(br)),
+                                        ender))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      if ( OP(NEXTOPER(br)) != NOTHING
                           || regnext(NEXTOPER(br)) != REGNODE_p(ender))
                          is_nothing= 0;
@@ -12213,7 +12275,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
             FLAGS(REGNODE_p(ret)) = flag;
-            REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+            if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
+            {
+                REQUIRE_BRANCHJ(flagp, 0);
+            }
         }
      }
  
@@ -12307,14 +12372,12 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             /* FIXME adding one for every branch after the first is probably
              * excessive now we have TRIE support. (hv) */
             MARK_NAUGHTY(1);
-            if (     chain > (SSize_t) BRANCH_MAX_OFFSET
-                && ! RExC_use_BRANCHJ)
-            {
+            if (! REGTAIL(pRExC_state, chain, latest)) {
                  /* XXX We could just redo this branch, but figuring out what
-                 * bookkeeping needs to be reset is a pain */
+                 * bookkeeping needs to be reset is a pain, and it's likely
+                 * that other branches that goto END will also be too large */
                  REQUIRE_BRANCHJ(flagp, 0);
              }
-            REGTAIL(pRExC_state, chain, latest);
         }
         chain = latest;
         c++;
@@ -12614,20 +12677,23 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    * points) that this \N sequence matches.  This is set, and the input is
    * parsed for errors, even if the function returns FALSE, as detailed below.
    *
-  * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+  * There are 6 possibilities here, as detailed in the next 6 paragraphs.
    *
    * Probably the most common case is for the \N to specify a single code point.
    * *cp_count will be set to 1, and *code_point_p will be set to that code
    * point.
    *
-  * Another possibility is for the input to be an empty \N{}, which for
-  * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
-  * will be set to a generated NOTHING node.
+  * Another possibility is for the input to be an empty \N{}.  This is no
+  * longer accepted, and will generate a fatal error.
+  *
+  * Another possibility is for a custom charnames handler to be in effect which
+  * translates the input name to an empty string.  *cp_count will be set to 0.
+  * *node_p will be set to a generated NOTHING node.
    *
    * Still another possibility is for the \N to mean [^\n]. *cp_count will be
    * set to 0. *node_p will be set to a generated REG_ANY node.
    *
-  * The fourth possibility is that \N resolves to a sequence of more than one
+  * The fifth possibility is that \N resolves to a sequence of more than one
    * code points.  *cp_count will be set to the number of code points in the
    * sequence. *node_p will be set to a generated node returned by this
    * function calling S_reg().
@@ -12635,7 +12701,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    * The final possibility is that it is premature to be calling this function;
    * the parse needs to be restarted.  This can happen when this changes from
    * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
-  * latter occurs only when the fourth possibility would otherwise be in
+  * latter occurs only when the fifth possibility would otherwise be in
    * effect, and is because one of those code points requires the pattern to be
    * recompiled as UTF-8.  The function returns FALSE, and sets the
    * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
@@ -12652,12 +12718,11 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    * so we need a way to take a snapshot of what they resolve to at the time of
    * the original parse. [perl #56444].
    *
-  * That parsing is skipped for single-quoted regexes, so we may here get
-  * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
-  * parser.  But if the single-quoted regex is something like '\N{U+41}', that
-  * is legal and handled here.  The code point is Unicode, and has to be
-  * translated into the native character set for non-ASCII platforms.
-  */
+  * That parsing is skipped for single-quoted regexes, so here we may get
+  * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
+  * like '\N{U+41}', that code point is Unicode, and has to be translated into
+  * the native character set for non-ASCII platforms.  The other possibilities
+  * are already native, so no translation is done. */
  
      char * endbrace;    /* points to '}' following the name */
      char* p = RExC_parse; /* Temporary */
@@ -12666,7 +12731,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
      char *orig_end;
      char *save_start;
      I32 flags;
-    Size_t count = 0;   /* code point count kept internally by this function */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -12689,7 +12753,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
  
      /* Disambiguate between \N meaning a named character versus \N meaning
       * [^\n].  The latter is assumed when the {...} following the \N is a legal
-     * quantifier, or there is no '{' at all */
+     * quantifier, or if there is no '{' at all */
      if (*p != '{' || regcurly(p)) {
          RExC_parse = p;
          if (cp_count) {
@@ -12722,15 +12786,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          vFAIL2("Missing right brace on \\%c{}", 'N');
      }
  
-    /* Here, we have decided it should be a named character or sequence */
-    REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
-                                        semantics */
+    /* Here, we have decided it should be a named character or sequence.  These
+     * imply Unicode semantics */
+    REQUIRE_UNI_RULES(flagp, FALSE);
  
-    if (endbrace == RExC_parse) {   /* empty: \N{} */
+    /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
+     * nothing at all (not allowed under strict) */
+    if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
+        RExC_parse = endbrace;
          if (strict) {
              RExC_parse++;   /* Position after the "}" */
              vFAIL("Zero length \\N{}");
          }
+
          if (cp_count) {
              *cp_count = 0;
          }
@@ -12743,15 +12811,122 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          return TRUE;
      }
  
-    /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
-    if (   endbrace - RExC_parse < 2
-        || strnNE(RExC_parse, "U+", 2))
-    {
-        RExC_parse = endbrace;  /* position msg's '<--HERE' */
-        vFAIL("\\N{NAME} must be resolved by the lexer");
-    }
+    if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
+
+        /* Here, the name isn't of the form  U+....  This can happen if the
+         * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
+         * is the time to find out what the name means */
+
+        const STRLEN name_len = endbrace - RExC_parse;
+        SV *  value_sv;     /* What does this name evaluate to */
+        SV ** value_svp;
+        const U8 * value;   /* string of name's value */
+        STRLEN value_len;   /* and its length */
+
+        /*  RExC_unlexed_names is a hash of names that weren't evaluated by
+         *  toke.c, and their values. Make sure is initialized */
+        if (! RExC_unlexed_names) {
+            RExC_unlexed_names = newHV();
+        }
+
+        /* If we have already seen this name in this pattern, use that.  This
+         * allows us to only call the charnames handler once per name per
+         * pattern.  A broken or malicious handler could return something
+         * different each time, which could cause the results to vary depending
+         * on if something gets added or subtracted from the pattern that
+         * causes the number of passes to change, for example */
+        if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
+                                                      name_len, 0)))
+        {
+            value_sv = *value_svp;
+        }
+        else { /* Otherwise we have to go out and get the name */
+            const char * error_msg = NULL;
+            value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
+                                                      UTF,
+                                                      &error_msg);
+            if (error_msg) {
+                RExC_parse = endbrace;
+                vFAIL(error_msg);
+            }
+
+            /* If no error message, should have gotten a valid return */
+            assert (value_sv);
+
+            /* Save the name's meaning for later use */
+            if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
+                           value_sv, 0))
+            {
+                Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+            }
+        }
  
-        /* This code purposely indented below because of future changes coming */
+        /* Here, we have the value the name evaluates to in 'value_sv' */
+        value = (U8 *) SvPV(value_sv, value_len);
+
+        /* See if the result is one code point vs 0 or multiple */
+        if (value_len > 0 && value_len <= ((SvUTF8(value_sv))
+                                           ? UTF8SKIP(value)
+                                           : 1))
+        {
+            /* Here, exactly one code point.  If that isn't what is wanted,
+             * fail */
+            if (! code_point_p) {
+                RExC_parse = p;
+                return FALSE;
+            }
+
+            /* Convert from string to numeric code point */
+            *code_point_p = (SvUTF8(value_sv))
+                            ? valid_utf8_to_uvchr(value, NULL)
+                            : *value;
+
+            /* Have parsed this entire single code point \N{...}.  *cp_count
+             * has already been set to 1, so don't do it again. */
+            RExC_parse = endbrace;
+            nextchar(pRExC_state);
+            return TRUE;
+        } /* End of is a single code point */
+
+        /* Count the code points, if caller desires.  The API says to do this
+         * even if we will later return FALSE */
+        if (cp_count) {
+            *cp_count = 0;
+
+            *cp_count = (SvUTF8(value_sv))
+                        ? utf8_length(value, value + value_len)
+                        : value_len;
+        }
+
+        /* Fail if caller doesn't want to handle a multi-code-point sequence.
+         * But don't back the pointer up if the caller wants to know how many
+         * code points there are (they need to handle it themselves in this
+         * case).  */
+        if (! node_p) {
+            if (! cp_count) {
+                RExC_parse = p;
+            }
+            return FALSE;
+        }
+
+        /* Convert this to a sub-pattern of the form "(?: ... )", and then call
+         * reg recursively to parse it.  That way, it retains its atomicness,
+         * while not having to worry about any special handling that some code
+         * points may have. */
+
+        substitute_parse = newSVpvs("?:");
+        sv_catsv(substitute_parse, value_sv);
+        sv_catpv(substitute_parse, ")");
+
+#ifdef EBCDIC
+        /* The value should already be native, so no need to convert on EBCDIC
+         * platforms.*/
+        assert(! RExC_recode_x_to_native);
+#endif
+
+    }
+    else {   /* \N{U+...} */
+        Size_t count = 0;   /* code point count kept internally */
  
          /* We can get to here when the input is \N{U+...} or when toke.c has
           * converted a name to the \N{U+...} form.  This include changing a
@@ -12886,6 +13061,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          RExC_recode_x_to_native = 1;
  #endif
  
+    }
+
      /* Here, we have the string the name evaluates to, ready to be parsed,
       * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
       * constructs.  This can be called from within a substitute parse already.
@@ -13273,9 +13450,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  char name = *RExC_parse;
                  char * endbrace = NULL;
                  RExC_parse += 2;
-                if (RExC_parse < RExC_end) {
-                    endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
-                }
+                endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
  
                  if (! endbrace) {
                      vFAIL2("Missing right brace on \\%c{}", name);
@@ -13620,7 +13795,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      /* It might be a forward reference; we can't fail until we
                       * know, by completing the parse to get all the groups, and
                       * then reparsing */
-                    if (RExC_total_parens > 0)  {
+                    if (ALL_PARENS_COUNTED)  {
                          if (num >= RExC_total_parens)  {
                              vFAIL("Reference to nonexistent group");
                          }
@@ -13772,7 +13947,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                     || UTF8_IS_START(UCHARAT(RExC_parse)));
  
-
              /* Here, we have a literal character.  Find the maximal string of
               * them in the input that we can fit into a single EXACTish node.
               * We quit at the first non-literal or when the node gets full, or
@@ -16482,7 +16656,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
      IV range = 0;
      UV value = OOB_UNICODE, save_value = OOB_UNICODE;
-    regnode_offset ret;
+    regnode_offset ret = -1;    /* Initialized to an illegal value */
      STRLEN numlen;
      int namedclass = OOB_NAMEDCLASS;
      char *rangebegin = NULL;
@@ -16800,6 +16974,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                "Ignoring zero length \\N{} in character class");
                          }
                          else { /* cp_count > 1 */
+                            assert(cp_count > 1);
                              if (! RExC_in_multi_char_class) {
                                  if (invert || range || *RExC_parse == '-') {
                                      if (strict) {
@@ -19430,7 +19605,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
      src = REGNODE_p(RExC_emit);
      RExC_emit += size;
      dst = REGNODE_p(RExC_emit);
-    if (RExC_open_parens) {
+
+    /* If we are in a "count the parentheses" pass, the numbers are unreliable,
+     * and [perl #133871] shows this can lead to problems, so skip this
+     * realignment of parens until a later pass when they are reliable */
+    if (! IN_PARENS_PASS && RExC_open_parens) {
          int paren;
          /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
          /* remember that RExC_npar is rex->nparens + 1,
@@ -19503,10 +19682,13 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
  }
  
  /*
-- regtail - set the next-pointer at the end of a node chain of p to val.
+- regtail - set the next-pointer at the end of a node chain of p to val.  If
+            that value won't fit in the space available, instead returns FALSE.
+            (Except asserts if we can't fit in the largest space the regex
+            engine is designed for.)
  - SEE ALSO: regtail_study
  */
-STATIC void
+STATIC bool
  S_regtail(pTHX_ RExC_state_t * pRExC_state,
                  const regnode_offset p,
                  const regnode_offset val,
@@ -19539,11 +19721,21 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      }
  
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert(val - scan <= U32_MAX);
          ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            /* Since not all callers check the return value, populate this with
+             * something that won't loop and will likely lead to a crash if
+             * execution continues */
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
          NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
+
+    return TRUE;
  }
  
  #ifdef DEBUGGING
@@ -19560,10 +19752,14 @@ that it is purely analytical.
  Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
  to control which is which.
  
+This used to return a value that was ignored.  It was a problem that it is
+#ifdef'd to be another function that didn't return a value.  khw has changed it
+so both currently return a pass/fail return.
+
  */
  /* TODO: All four parms should be const */
  
-STATIC U8
+STATIC bool
  S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
                        const regnode_offset val, U32 depth)
  {
@@ -19587,7 +19783,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
             bool unfolded_multi_char;   /* Unexamined in this routine */
              if (join_exact(pRExC_state, scan, &min,
                             &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
-                return EXACT;
+                return TRUE; /* Was return EXACT */
         }
  #endif
          if ( exact ) {
@@ -19637,13 +19833,18 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
          );
      });
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert(val - scan <= U32_MAX);
         ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
         NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
  
-    return exact;
+    return TRUE; /* Was 'return exact' */
  }
  #endif
  
@@ -20251,8 +20452,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
          sv_catpv(sv, bounds[FLAGS(o)]);
      }
-    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
-       Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
+       Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+        if (o->next_off) {
+            Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+        }
+       Perl_sv_catpvf(aTHX_ sv, "]");
+    }
      else if (OP(o) == SBOL)
          Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
  
@@ -21779,6 +21985,7 @@ Perl_init_uniprops(pTHX)
      PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
      PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
      PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
+    PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
  
  #ifdef UNI_XIDC
      /* The below are used only by deprecated functions.  They could be removed */
@@ -22331,6 +22538,167 @@ Perl_parse_uniprop_string(pTHX_
              }
          }
  
+        /* Most punctuation after the equals indicates a subpattern, like
+         * \p{foo=/bar/} */
+        if (   isPUNCT_A(name[i])
+            && name[i] != '-'
+            && name[i] != '+'
+            && name[i] != '_'
+            && name[i] != '{')
+        {
+            /* Find the property.  The table includes the equals sign, so we
+             * use 'j' as-is */
+            table_index = match_uniprop((U8 *) lookup_name, j);
+            if (table_index) {
+                const char * const * prop_values
+                                            = UNI_prop_value_ptrs[table_index];
+                SV * subpattern;
+                Size_t subpattern_len;
+                REGEXP * subpattern_re;
+                char open = name[i++];
+                char close;
+                const char * pos_in_brackets;
+                bool escaped = 0;
+
+                /* A backslash means the real delimitter is the next character.
+                 * */
+                if (open == '\\') {
+                    open = name[i++];
+                    escaped = 1;
+                }
+
+                /* This data structure is constructed so that the matching
+                 * closing bracket is 3 past its matching opening.  The second
+                 * set of closing is so that if the opening is something like
+                 * ']', the closing will be that as well.  Something similar is
+                 * done in toke.c */
+                pos_in_brackets = strchr("([<)]>)]>", open);
+                close = (pos_in_brackets) ? pos_in_brackets[3] : open;
+
+                if (   name[name_len-1] != close
+                    || (escaped && name[name_len-2] != '\\'))
+                {
+                    sv_catpvs(msg, "Unicode property wildcard not terminated");
+                    goto append_name_to_msg;
+                }
+
+                Perl_ck_warner_d(aTHX_
+                    packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
+                    "The Unicode property wildcards feature is experimental");
+
+                /* Now create and compile the wildcard subpattern.  Use /iaa
+                 * because nothing outside of ASCII will match, and it the
+                 * property values should all match /i.  Note that when the
+                 * pattern fails to compile, our added text to the user's
+                 * pattern will be displayed to the user, which is not so
+                 * desirable. */
+                subpattern_len = name_len - i - 1 - escaped;
+                subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
+                                              (unsigned) subpattern_len,
+                                              name + i);
+                subpattern = sv_2mortal(subpattern);
+                subpattern_re = re_compile(subpattern, 0);
+                assert(subpattern_re);  /* Should have died if didn't compile
+                                         successfully */
+
+                /* For each legal property value, see if the supplied pattern
+                 * matches it. */
+                while (*prop_values) {
+                    const char * const entry = *prop_values;
+                    const Size_t len = strlen(entry);
+                    SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
+
+                    if (pregexec(subpattern_re,
+                                 (char *) entry,
+                                 (char *) entry + len,
+                                 (char *) entry, 0,
+                                 entry_sv,
+                                 0))
+                    { /* Here, matched.  Add to the returned list */
+                        Size_t total_len = j + len;
+                        SV * sub_invlist = NULL;
+                        char * this_string;
+
+                        /* We know this is a legal \p{property=value}.  Call
+                         * the function to return the list of code points that
+                         * match it */
+                        Newxz(this_string, total_len + 1, char);
+                        Copy(lookup_name, this_string, j, char);
+                        my_strlcat(this_string, entry, total_len + 1);
+                        SAVEFREEPV(this_string);
+                        sub_invlist = parse_uniprop_string(this_string,
+                                                           total_len,
+                                                           is_utf8,
+                                                           to_fold,
+                                                           runtime,
+                                                           user_defined_ptr,
+                                                           msg,
+                                                           level + 1);
+                        _invlist_union(prop_definition, sub_invlist,
+                                       &prop_definition);
+                    }
+
+                    prop_values++;  /* Next iteration, look at next propvalue */
+                } /* End of looking through property values; (the data
+                     structure is terminated by a NULL ptr) */
+
+                SvREFCNT_dec_NN(subpattern_re);
+
+                if (prop_definition) {
+                    return prop_definition;
+                }
+
+                sv_catpvs(msg, "No Unicode property value wildcard matches:");
+                goto append_name_to_msg;
+            }
+
+            /* Here's how khw thinks we should proceed to handle the properties
+             * not yet done:    Bidi Mirroring Glyph
+                                Bidi Paired Bracket
+                                Case Folding  (both full and simple)
+                                Decomposition Mapping
+                                Equivalent Unified Ideograph
+                                Name
+                                Name Alias
+                                Lowercase Mapping  (both full and simple)
+                                NFKC Case Fold
+                                Titlecase Mapping  (both full and simple)
+                                Uppercase Mapping  (both full and simple)
+             * Move the part that looks at the property values into a perl
+             * script, like utf8_heavy.pl is done.  This makes things somewhat
+             * easier, but most importantly, it avoids always adding all these
+             * strings to the memory usage when the feature is little-used.
+             *
+             * The property values would all be concatenated into a single
+             * string per property with each value on a separate line, and the
+             * code point it's for on alternating lines.  Then we match the
+             * user's input pattern m//mg, without having to worry about their
+             * uses of '^' and '$'.  Only the values that aren't the default
+             * would be in the strings.  Code points would be in UTF-8.  The
+             * search pattern that we would construct would look like
+             * (?: \n (code-point_re) \n (?aam: user-re ) \n )
+             * And so $1 would contain the code point that matched the user-re.
+             * For properties where the default is the code point itself, such
+             * as any of the case changing mappings, the string would otherwise
+             * consist of all Unicode code points in UTF-8 strung together.
+             * This would be impractical.  So instead, examine their compiled
+             * pattern, looking at the ssc.  If none, reject the pattern as an
+             * error.  Otherwise run the pattern against every code point in
+             * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
+             * And it might be good to create an API to return the ssc.
+             *
+             * For the name properties, a new function could be created in
+             * charnames which essentially does the same thing as above,
+             * sharing Name.pl with the other charname functions.  Don't know
+             * about loose name matching, or algorithmically determined names.
+             * Decomposition.pl similarly.
+             *
+             * It might be that a new pattern modifier would have to be
+             * created, like /t for resTricTed, which changed the behavior of
+             * some constructs in their subpattern, like \A. */
+        } /* End of is a wildcard subppattern */
+
+
          /* Certain properties whose values are numeric need special handling.
           * They may optionally be prefixed by 'is'.  Ignore that prefix for the
           * purposes of checking if this is one of those properties */
@@ -23040,10 +23408,65 @@ Perl_parse_uniprop_string(pTHX_
  
      /* Create and return the inversion list */
      prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
+    sv_2mortal(prop_definition);
+
+
+    /* See if there is a private use override to add to this definition */
+    {
+        COPHH * hinthash = (IN_PERL_COMPILETIME)
+                           ? CopHINTHASH_get(&PL_compiling)
+                           : CopHINTHASH_get(PL_curcop);
+       SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
+
+        if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
+
+            /* See if there is an element in the hints hash for this table */
+            SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
+            const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
+
+            if (pos) {
+                bool dummy;
+                SV * pu_definition;
+                SV * pu_invlist;
+                SV * expanded_prop_definition =
+                            sv_2mortal(invlist_clone(prop_definition, NULL));
+
+                /* If so, it's definition is the string from here to the next
+                 * \a character.  And its format is the same as a user-defined
+                 * property */
+                pos += SvCUR(pu_lookup);
+                pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
+                pu_invlist = handle_user_defined_property(lookup_name,
+                                                          lookup_len,
+                                                          0, /* Not UTF-8 */
+                                                          0, /* Not folded */
+                                                          runtime,
+                                                          pu_definition,
+                                                          &dummy,
+                                                          msg,
+                                                          level);
+                if (TAINT_get) {
+                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+                    sv_catpvs(msg, "Insecure private-use override");
+                    goto append_name_to_msg;
+                }
+
+                /* For now, as a safety measure, make sure that it doesn't
+                 * override non-private use code points */
+                _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
+
+                /* Add it to the list to be returned */
+                _invlist_union(prop_definition, pu_invlist,
+                               &expanded_prop_definition);
+                prop_definition = expanded_prop_definition;
+                Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
+            }
+        }
+    }
+
      if (invert_return) {
          _invlist_invert(prop_definition);
      }
-    sv_2mortal(prop_definition);
      return prop_definition;