win32/GNUmakefile collapse shell echos into one liners

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index f67e3e8..639e4a3 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -131,6 +131,7 @@ struct RExC_state_t {
      U32                flags;                  /* RXf_* are we folding, multilining? */
      U32                pm_flags;               /* PMf_* stuff from the calling PMOP */
      char       *precomp;               /* uncompiled string. */
+    char       *precomp_end;           /* pointer to end of uncompiled string. */
      REGEXP     *rx_sv;                 /* The SV that is the regexp. */
      regexp     *rx;                    /* perl core regexp structure */
      regexp_internal    *rxi;           /* internal data for regexp object
@@ -138,6 +139,8 @@ struct RExC_state_t {
      char       *start;                 /* Start of input for compile */
      char       *end;                   /* End of input for compile */
      char       *parse;                 /* Input-scan pointer. */
+    char        *adjusted_start;        /* 'start', adjusted.  See code use */
+    STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
      SSize_t    whilem_seen;            /* number of WHILEM in this expr */
      regnode    *emit_start;            /* Start of emitted-code area */
      regnode    *emit_bound;            /* First regnode outside of the
@@ -220,6 +223,9 @@ struct RExC_state_t {
  #define RExC_flags     (pRExC_state->flags)
  #define RExC_pm_flags  (pRExC_state->pm_flags)
  #define RExC_precomp   (pRExC_state->precomp)
+#define RExC_precomp_adj (pRExC_state->precomp_adj)
+#define RExC_adjusted_start  (pRExC_state->adjusted_start)
+#define RExC_precomp_end (pRExC_state->precomp_end)
  #define RExC_rx_sv     (pRExC_state->rx_sv)
  #define RExC_rx                (pRExC_state->rx)
  #define RExC_rxi       (pRExC_state->rxi)
@@ -554,19 +560,67 @@ static const scan_data_t zero_scan_data =
  #define REPORT_LOCATION " in regex; marked by " MARKER1    \
                          " in m/%"UTF8f MARKER2 "%"UTF8f"/"
  
-#define REPORT_LOCATION_ARGS(loc)                                           \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_end - RExC_precomp                         \
-                          : (loc) - RExC_precomp,                           \
-                         RExC_precomp),                                     \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? 0                                               \
-                          : RExC_end - (loc),                               \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_end                                        \
-                          : (loc))
+/* The code in this file in places uses one level of recursion with parsing
+ * rebased to an alternate string constructed by us in memory.  This can take
+ * the form of something that is completely different from the input, or
+ * something that uses the input as part of the alternate.  In the first case,
+ * there should be no possibility of an error, as we are in complete control of
+ * the alternate string.  But in the second case we don't control the input
+ * portion, so there may be errors in that.  Here's an example:
+ *      /[abc\x{DF}def]/ui
+ * is handled specially because \x{df} folds to a sequence of more than one
+ * character, 'ss'.  What is done is to create and parse an alternate string,
+ * which looks like this:
+ *      /(?:\x{DF}|[abc\x{DF}def])/ui
+ * where it uses the input unchanged in the middle of something it constructs,
+ * which is a branch for the DF outside the character class, and clustering
+ * parens around the whole thing. (It knows enough to skip the DF inside the
+ * class while in this substitute parse.) 'abc' and 'def' may have errors that
+ * need to be reported.  The general situation looks like this:
+ *
+ *              sI                       tI               xI       eI
+ * Input:       ----------------------------------------------------
+ * Constructed:         ---------------------------------------------------
+ *                      sC               tC               xC       eC     EC
+ *
+ * The input string sI..eI is the input pattern.  The string sC..EC is the
+ * constructed substitute parse string.  The portions sC..tC and eC..EC are
+ * constructed by us.  The portion tC..eC is an exact duplicate of the input
+ * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
+ * while parsing, we find an error at xC.  We want to display a message showing
+ * the real input string.  Thus we need to find the point xI in it which
+ * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
+ * been constructed by us, and so shouldn't have errors.  We get:
+ *
+ *      xI = sI + (tI - sI) + (xC - tC)
+ *
+ * and, the offset into sI is:
+ *
+ *      (xI - sI) = (tI - sI) + (xC - tC)
+ *
+ * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
+ * and we save tC as RExC_adjusted_start.
+ *
+ * During normal processing of the input pattern, everything points to that,
+ * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
+ */
+
+#define tI_sI           RExC_precomp_adj
+#define tC              RExC_adjusted_start
+#define sC              RExC_precomp
+#define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
+#define xI(xC)          (sC + xI_offset(xC))
+#define eC              RExC_precomp_end
+
+#define REPORT_LOCATION_ARGS(xC)                                            \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) /* Don't run off end */                          \
+              ? eC - sC   /* Length before the <--HERE */                   \
+              : xI_offset(xC),                                              \
+             sC),         /* The input pattern printed up to the <--HERE */ \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
+             (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
  
  /* Used to point after bad bytes for an error message, but avoid skipping
   * past a nul byte. */
@@ -579,7 +633,7 @@ static const scan_data_t zero_scan_data =
   */
  #define _FAIL(code) STMT_START {                                       \
      const char *ellipses = "";                                         \
-    IV len = RExC_end - RExC_precomp;                                  \
+    IV len = RExC_precomp_end - RExC_precomp;                                  \
                                                                         \
      if (!SIZE_ONLY)                                                    \
         SAVEFREESV(RExC_rx_sv);                                         \
@@ -1189,7 +1243,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
          }
  
          /* Get the code points valid only under UTF-8 locales */
-        if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+        if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
              && ary[2] && ary[2] != &PL_sv_undef)
          {
              only_utf8_locale_invlist = ary[2];
@@ -1236,7 +1290,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
          _invlist_invert(invlist);
      }
-    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
+    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
  
          /* Under /li, any 0-255 could fold to any other 0-255, depending on the
           * locale.  We can skip this if there are no 0-255 at all. */
@@ -1315,6 +1369,10 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
              &( ANYOF_COMMON_FLAGS
                |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+                anded_flags &=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
          }
      }
  
@@ -1471,6 +1529,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
              |= ANYOF_FLAGS(or_with)
               & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                  |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+                ored_flags |=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
          }
      }
  
@@ -4818,7 +4880,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
                         "Quantifier unexpected on zero-length expression "
                         "in regex m/%"UTF8f"/",
-                        UTF8fARG(UTF, RExC_end - RExC_precomp,
+                        UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
                                   RExC_precomp));
                     (void)ReREFCNT_inc(RExC_rx_sv);
                 }
@@ -6691,6 +6753,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      }
  
      RExC_precomp = exp;
+    RExC_precomp_adj = 0;
      RExC_flags = rx_flags;
      RExC_pm_flags = pm_flags;
  
@@ -6724,8 +6787,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      /* First pass: determine size, legality. */
      RExC_parse = exp;
-    RExC_start = exp;
+    RExC_start = RExC_adjusted_start = exp;
      RExC_end = exp + plen;
+    RExC_precomp_end = RExC_end;
      RExC_naughty = 0;
      RExC_npar = 1;
      RExC_nestroot = 0;
@@ -11355,6 +11419,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         SV * substitute_parse;
         STRLEN len;
         char *orig_end = RExC_end;
+       char *save_start = RExC_start;
          I32 flags;
  
          /* Count the code points, if desired, in the sequence */
@@ -11400,7 +11465,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         }
          sv_catpv(substitute_parse, ")");
  
-       RExC_parse = SvPV(substitute_parse, len);
+        RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
+                                                             len);
  
         /* Don't allow empty number */
         if (len < (STRLEN) 8) {
@@ -11430,6 +11496,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          }
  
          /* Restore the saved values */
+       RExC_start = RExC_adjusted_start = save_start;
         RExC_parse = endbrace;
         RExC_end = orig_end;
         RExC_override_recoding = 0;
@@ -11984,6 +12051,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          }
                          FLAGS(ret) = GCB_BOUND;
                          break;
+                    case 'l':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = LB_BOUND;
+                        break;
                      case 's':
                          if (length != 2 || *(RExC_parse + 1) != 'b') {
                              goto bad_bound_type;
@@ -14145,7 +14218,8 @@ redo_curchar:
          assert(OP(node) == ANYOF);
  
          OP(node) = ANYOFL;
-        ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+        ANYOF_FLAGS(node)
+                |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
      }
  
      if (save_fold) {
@@ -15467,11 +15541,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         STRLEN len;
         char *save_end = RExC_end;
         char *save_parse = RExC_parse;
+       char *save_start = RExC_start;
+        STRLEN prefix_end = 0;      /* We copy the character class after a
+                                       prefix supplied here.  This is the size
+                                       + 1 of that prefix */
          bool first_time = TRUE;     /* First multi-char occurrence doesn't get
                                         a "|" */
          I32 reg_flags;
  
          assert(! invert);
+        assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
+
  #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
             because too confusing */
          if (invert) {
@@ -15505,6 +15585,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
           * multi-character folds, have to include it in recursive parsing */
          if (element_count) {
              sv_catpv(substitute_parse, "|[");
+            prefix_end = SvCUR(substitute_parse);
              sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
              sv_catpv(substitute_parse, "]");
          }
@@ -15519,7 +15600,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          }
  #endif
  
-       RExC_parse = SvPV(substitute_parse, len);
+        /* Set up the data structure so that any errors will be properly
+         * reported.  See the comments at the definition of
+         * REPORT_LOCATION_ARGS for details */
+        RExC_precomp_adj = orig_parse - RExC_precomp;
+       RExC_start =  RExC_parse = SvPV(substitute_parse, len);
+        RExC_adjusted_start = RExC_start + prefix_end;
         RExC_end = RExC_parse + len;
          RExC_in_multi_char_class = 1;
         RExC_override_recoding = 1;
@@ -15529,7 +15615,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
  
-       RExC_parse = save_parse;
+        /* And restore so can parse the rest of the pattern */
+        RExC_parse = save_parse;
+       RExC_start = RExC_adjusted_start = save_start;
+        RExC_precomp_adj = 0;
         RExC_end = save_end;
         RExC_in_multi_char_class = 0;
         RExC_override_recoding = 0;
@@ -16027,14 +16116,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * locales, or the class matches at least one 0-255 range code point */
      if (LOC && FOLD) {
          if (only_utf8_locale_list) {
-            ANYOF_FLAGS(ret) |=  ANYOF_LOC_FOLD
-                                |ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES;
+            ANYOF_FLAGS(ret)
+                 |=  ANYOFL_FOLD
+                    |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
          }
          else if (cp_list) { /* Look to see if a 0-255 code point is in list */
              UV start, end;
              invlist_iterinit(cp_list);
              if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+                ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
              }
              invlist_iterfinish(cp_list);
          }
@@ -17402,14 +17492,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
  
  
         if (OP(o) == ANYOFL) {
-            if (flags & ANYOF_LOC_REQ_UTF8) {
+            if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
                  sv_catpvs(sv, "{utf8-loc}");
              }
              else {
                  sv_catpvs(sv, "{loc}");
              }
          }
-       if (flags & ANYOF_LOC_FOLD)
+       if (flags & ANYOFL_FOLD)
             sv_catpvs(sv, "{i}");
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
         if (flags & ANYOF_INVERT)
@@ -17436,7 +17526,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             || (flags
                  & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
                     |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
-                   |ANYOF_LOC_FOLD)))
+                   |ANYOFL_FOLD)))
          {
              if (do_sep) {
                  Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
@@ -17518,7 +17608,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      SvREFCNT_dec_NN(lv);
                  }
  
-                if ((flags & ANYOF_LOC_FOLD)
+                if ((flags & ANYOFL_FOLD)
                       && only_utf8_locale
                       && only_utf8_locale != &PL_sv_undef)
                  {
@@ -17565,6 +17655,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          const char * const bounds[] = {
              "",      /* Traditional */
              "{gcb}",
+            "{lb}",
              "{sb}",
              "{wb}"
          };