This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
win32/GNUmakefile collapse shell echos into one liners
[perl5.git] / regcomp.c
index f67e3e8..639e4a3 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -131,6 +131,7 @@ struct RExC_state_t {
     U32                flags;                  /* RXf_* are we folding, multilining? */
     U32                pm_flags;               /* PMf_* stuff from the calling PMOP */
     char       *precomp;               /* uncompiled string. */
+    char       *precomp_end;           /* pointer to end of uncompiled string. */
     REGEXP     *rx_sv;                 /* The SV that is the regexp. */
     regexp     *rx;                    /* perl core regexp structure */
     regexp_internal    *rxi;           /* internal data for regexp object
@@ -138,6 +139,8 @@ struct RExC_state_t {
     char       *start;                 /* Start of input for compile */
     char       *end;                   /* End of input for compile */
     char       *parse;                 /* Input-scan pointer. */
+    char        *adjusted_start;        /* 'start', adjusted.  See code use */
+    STRLEN      precomp_adj;            /* an offset beyond precomp.  See code use */
     SSize_t    whilem_seen;            /* number of WHILEM in this expr */
     regnode    *emit_start;            /* Start of emitted-code area */
     regnode    *emit_bound;            /* First regnode outside of the
@@ -220,6 +223,9 @@ struct RExC_state_t {
 #define RExC_flags     (pRExC_state->flags)
 #define RExC_pm_flags  (pRExC_state->pm_flags)
 #define RExC_precomp   (pRExC_state->precomp)
+#define RExC_precomp_adj (pRExC_state->precomp_adj)
+#define RExC_adjusted_start  (pRExC_state->adjusted_start)
+#define RExC_precomp_end (pRExC_state->precomp_end)
 #define RExC_rx_sv     (pRExC_state->rx_sv)
 #define RExC_rx                (pRExC_state->rx)
 #define RExC_rxi       (pRExC_state->rxi)
@@ -554,19 +560,67 @@ static const scan_data_t zero_scan_data =
 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 
-#define REPORT_LOCATION_ARGS(loc)                                           \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_end - RExC_precomp                         \
-                          : (loc) - RExC_precomp,                           \
-                         RExC_precomp),                                     \
-                UTF8fARG(UTF,                                               \
-                         ((loc) > RExC_end)                                 \
-                          ? 0                                               \
-                          : RExC_end - (loc),                               \
-                         ((loc) > RExC_end)                                 \
-                          ? RExC_end                                        \
-                          : (loc))
+/* The code in this file in places uses one level of recursion with parsing
+ * rebased to an alternate string constructed by us in memory.  This can take
+ * the form of something that is completely different from the input, or
+ * something that uses the input as part of the alternate.  In the first case,
+ * there should be no possibility of an error, as we are in complete control of
+ * the alternate string.  But in the second case we don't control the input
+ * portion, so there may be errors in that.  Here's an example:
+ *      /[abc\x{DF}def]/ui
+ * is handled specially because \x{df} folds to a sequence of more than one
+ * character, 'ss'.  What is done is to create and parse an alternate string,
+ * which looks like this:
+ *      /(?:\x{DF}|[abc\x{DF}def])/ui
+ * where it uses the input unchanged in the middle of something it constructs,
+ * which is a branch for the DF outside the character class, and clustering
+ * parens around the whole thing. (It knows enough to skip the DF inside the
+ * class while in this substitute parse.) 'abc' and 'def' may have errors that
+ * need to be reported.  The general situation looks like this:
+ *
+ *              sI                       tI               xI       eI
+ * Input:       ----------------------------------------------------
+ * Constructed:         ---------------------------------------------------
+ *                      sC               tC               xC       eC     EC
+ *
+ * The input string sI..eI is the input pattern.  The string sC..EC is the
+ * constructed substitute parse string.  The portions sC..tC and eC..EC are
+ * constructed by us.  The portion tC..eC is an exact duplicate of the input
+ * pattern tI..eI.  In the diagram, these are vertically aligned.  Suppose that
+ * while parsing, we find an error at xC.  We want to display a message showing
+ * the real input string.  Thus we need to find the point xI in it which
+ * corresponds to xC.  xC >= tC, since the portion of the string sC..tC has
+ * been constructed by us, and so shouldn't have errors.  We get:
+ *
+ *      xI = sI + (tI - sI) + (xC - tC)
+ *
+ * and, the offset into sI is:
+ *
+ *      (xI - sI) = (tI - sI) + (xC - tC)
+ *
+ * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
+ * and we save tC as RExC_adjusted_start.
+ *
+ * During normal processing of the input pattern, everything points to that,
+ * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
+ */
+
+#define tI_sI           RExC_precomp_adj
+#define tC              RExC_adjusted_start
+#define sC              RExC_precomp
+#define xI_offset(xC)   ((IV) (tI_sI + (xC - tC)))
+#define xI(xC)          (sC + xI_offset(xC))
+#define eC              RExC_precomp_end
+
+#define REPORT_LOCATION_ARGS(xC)                                            \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) /* Don't run off end */                          \
+              ? eC - sC   /* Length before the <--HERE */                   \
+              : xI_offset(xC),                                              \
+             sC),         /* The input pattern printed up to the <--HERE */ \
+    UTF8fARG(UTF,                                                           \
+             (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */    \
+             (xI(xC) > eC) ? eC : xI(xC))     /* pattern after <--HERE */
 
 /* Used to point after bad bytes for an error message, but avoid skipping
  * past a nul byte. */
@@ -579,7 +633,7 @@ static const scan_data_t zero_scan_data =
  */
 #define _FAIL(code) STMT_START {                                       \
     const char *ellipses = "";                                         \
-    IV len = RExC_end - RExC_precomp;                                  \
+    IV len = RExC_precomp_end - RExC_precomp;                                  \
                                                                        \
     if (!SIZE_ONLY)                                                    \
        SAVEFREESV(RExC_rx_sv);                                         \
@@ -1189,7 +1243,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
         }
 
         /* Get the code points valid only under UTF-8 locales */
-        if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+        if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
             && ary[2] && ary[2] != &PL_sv_undef)
         {
             only_utf8_locale_invlist = ary[2];
@@ -1236,7 +1290,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
         _invlist_invert(invlist);
     }
-    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
+    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
 
         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
          * locale.  We can skip this if there are no 0-255 at all. */
@@ -1315,6 +1369,10 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
             &( ANYOF_COMMON_FLAGS
               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+                anded_flags &=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
         }
     }
 
@@ -1471,6 +1529,10 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
             |= ANYOF_FLAGS(or_with)
              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+                ored_flags |=
+                    ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+            }
         }
     }
 
@@ -4818,7 +4880,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
                        "Quantifier unexpected on zero-length expression "
                        "in regex m/%"UTF8f"/",
-                        UTF8fARG(UTF, RExC_end - RExC_precomp,
+                        UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
                                  RExC_precomp));
                    (void)ReREFCNT_inc(RExC_rx_sv);
                }
@@ -6691,6 +6753,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     }
 
     RExC_precomp = exp;
+    RExC_precomp_adj = 0;
     RExC_flags = rx_flags;
     RExC_pm_flags = pm_flags;
 
@@ -6724,8 +6787,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     /* First pass: determine size, legality. */
     RExC_parse = exp;
-    RExC_start = exp;
+    RExC_start = RExC_adjusted_start = exp;
     RExC_end = exp + plen;
+    RExC_precomp_end = RExC_end;
     RExC_naughty = 0;
     RExC_npar = 1;
     RExC_nestroot = 0;
@@ -11355,6 +11419,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
        SV * substitute_parse;
        STRLEN len;
        char *orig_end = RExC_end;
+       char *save_start = RExC_start;
         I32 flags;
 
         /* Count the code points, if desired, in the sequence */
@@ -11400,7 +11465,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
        }
         sv_catpv(substitute_parse, ")");
 
-       RExC_parse = SvPV(substitute_parse, len);
+        RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
+                                                             len);
 
        /* Don't allow empty number */
        if (len < (STRLEN) 8) {
@@ -11430,6 +11496,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         }
 
         /* Restore the saved values */
+       RExC_start = RExC_adjusted_start = save_start;
        RExC_parse = endbrace;
        RExC_end = orig_end;
        RExC_override_recoding = 0;
@@ -11984,6 +12051,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         }
                         FLAGS(ret) = GCB_BOUND;
                         break;
+                    case 'l':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = LB_BOUND;
+                        break;
                     case 's':
                         if (length != 2 || *(RExC_parse + 1) != 'b') {
                             goto bad_bound_type;
@@ -14145,7 +14218,8 @@ redo_curchar:
         assert(OP(node) == ANYOF);
 
         OP(node) = ANYOFL;
-        ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+        ANYOF_FLAGS(node)
+                |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
     }
 
     if (save_fold) {
@@ -15467,11 +15541,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
        STRLEN len;
        char *save_end = RExC_end;
        char *save_parse = RExC_parse;
+       char *save_start = RExC_start;
+        STRLEN prefix_end = 0;      /* We copy the character class after a
+                                       prefix supplied here.  This is the size
+                                       + 1 of that prefix */
         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
                                        a "|" */
         I32 reg_flags;
 
         assert(! invert);
+        assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
+
 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
            because too confusing */
         if (invert) {
@@ -15505,6 +15585,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          * multi-character folds, have to include it in recursive parsing */
         if (element_count) {
             sv_catpv(substitute_parse, "|[");
+            prefix_end = SvCUR(substitute_parse);
             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
             sv_catpv(substitute_parse, "]");
         }
@@ -15519,7 +15600,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
 #endif
 
-       RExC_parse = SvPV(substitute_parse, len);
+        /* Set up the data structure so that any errors will be properly
+         * reported.  See the comments at the definition of
+         * REPORT_LOCATION_ARGS for details */
+        RExC_precomp_adj = orig_parse - RExC_precomp;
+       RExC_start =  RExC_parse = SvPV(substitute_parse, len);
+        RExC_adjusted_start = RExC_start + prefix_end;
        RExC_end = RExC_parse + len;
         RExC_in_multi_char_class = 1;
        RExC_override_recoding = 1;
@@ -15529,7 +15615,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
        *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
 
-       RExC_parse = save_parse;
+        /* And restore so can parse the rest of the pattern */
+        RExC_parse = save_parse;
+       RExC_start = RExC_adjusted_start = save_start;
+        RExC_precomp_adj = 0;
        RExC_end = save_end;
        RExC_in_multi_char_class = 0;
        RExC_override_recoding = 0;
@@ -16027,14 +16116,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * locales, or the class matches at least one 0-255 range code point */
     if (LOC && FOLD) {
         if (only_utf8_locale_list) {
-            ANYOF_FLAGS(ret) |=  ANYOF_LOC_FOLD
-                                |ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES;
+            ANYOF_FLAGS(ret)
+                 |=  ANYOFL_FOLD
+                    |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
         }
         else if (cp_list) { /* Look to see if a 0-255 code point is in list */
             UV start, end;
             invlist_iterinit(cp_list);
             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+                ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
             }
             invlist_iterfinish(cp_list);
         }
@@ -17402,14 +17492,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 
 
        if (OP(o) == ANYOFL) {
-            if (flags & ANYOF_LOC_REQ_UTF8) {
+            if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
                 sv_catpvs(sv, "{utf8-loc}");
             }
             else {
                 sv_catpvs(sv, "{loc}");
             }
         }
-       if (flags & ANYOF_LOC_FOLD)
+       if (flags & ANYOFL_FOLD)
            sv_catpvs(sv, "{i}");
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
        if (flags & ANYOF_INVERT)
@@ -17436,7 +17526,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
            || (flags
                 & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
                    |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
-                   |ANYOF_LOC_FOLD)))
+                   |ANYOFL_FOLD)))
         {
             if (do_sep) {
                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
@@ -17518,7 +17608,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                     SvREFCNT_dec_NN(lv);
                 }
 
-                if ((flags & ANYOF_LOC_FOLD)
+                if ((flags & ANYOFL_FOLD)
                      && only_utf8_locale
                      && only_utf8_locale != &PL_sv_undef)
                 {
@@ -17565,6 +17655,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         const char * const bounds[] = {
             "",      /* Traditional */
             "{gcb}",
+            "{lb}",
             "{sb}",
             "{wb}"
         };