This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Remove never used struct element
[perl5.git] / regcomp.c
index fa23d38..50f823f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -197,10 +197,6 @@ struct RExC_state_t {
     U32         frame_count;
     AV         *warn_text;
     HV         *unlexed_names;
-#ifdef ADD_TO_REGEXEC
-    char       *starttry;              /* -Dr: where regtry was called. */
-#define RExC_starttry  (pRExC_state->starttry)
-#endif
     SV         *runtime_code_qr;       /* qr with the runtime code blocks */
 #ifdef DEBUGGING
     const char  *lastparse;
@@ -7399,7 +7395,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
  *
  * pm_flags contains the PMf_* flags, typically based on those from the
  * pm_flags field of the related PMOP. Currently we're only interested in
- * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
+ * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
  *
  * For many years this code had an initial sizing pass that calculated
  * (sometimes incorrectly, leading to security holes) the size needed for the
@@ -10765,6 +10761,24 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
         /* && memCHRs("iogcmsx", *RExC_parse) */
         /* (?g), (?gc) and (?o) are useless here
            and must be globally applied -- japhy */
+        if ((RExC_pm_flags & PMf_WILDCARD)) {
+            if (flagsp == & negflags) {
+                if (*RExC_parse == 'm') {
+                    RExC_parse++;
+                    /* diag_listed_as: Use of %s is not allowed in Unicode
+                       property wildcard subpatterns in regex; marked by <--
+                       HERE in m/%s/ */
+                    vFAIL("Use of modifier '-m' is not allowed in Unicode"
+                          " property wildcard subpatterns");
+                }
+            }
+            else {
+                if (*RExC_parse == 's') {
+                    goto modifier_illegal_in_wildcard;
+                }
+            }
+        }
+
         switch (*RExC_parse) {
 
             /* Code for the imsxn flags */
@@ -10844,8 +10858,12 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
                                     *(RExC_parse - 1));
                 NOT_REACHED; /*NOTREACHED*/
-            case ONCE_PAT_MOD: /* 'o' */
             case GLOBAL_PAT_MOD: /* 'g' */
+                if (RExC_pm_flags & PMf_WILDCARD) {
+                    goto modifier_illegal_in_wildcard;
+                }
+                /*FALLTHROUGH*/
+            case ONCE_PAT_MOD: /* 'o' */
                 if (ckWARN(WARN_REGEXP)) {
                     const I32 wflagbit = *RExC_parse == 'o'
                                          ? WASTED_O
@@ -10866,6 +10884,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 break;
 
             case CONTINUE_PAT_MOD: /* 'c' */
+                if (RExC_pm_flags & PMf_WILDCARD) {
+                    goto modifier_illegal_in_wildcard;
+                }
                 if (ckWARN(WARN_REGEXP)) {
                     if (! (wastedflags & WASTED_C) ) {
                         wastedflags |= WASTED_GC;
@@ -10880,6 +10901,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                 }
                 break;
             case KEEPCOPY_PAT_MOD: /* 'p' */
+                if (RExC_pm_flags & PMf_WILDCARD) {
+                    goto modifier_illegal_in_wildcard;
+                }
                 if (flagsp == &negflags) {
                     ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
                 } else {
@@ -10900,6 +10924,18 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
             case ':':
             case ')':
 
+                if (  (RExC_pm_flags & PMf_WILDCARD)
+                    && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+                {
+                    RExC_parse++;
+                    /* diag_listed_as: Use of %s is not allowed in Unicode
+                       property wildcard subpatterns in regex; marked by <--
+                       HERE in m/%s/ */
+                    vFAIL2("Use of modifier '%c' is not allowed in Unicode"
+                           " property wildcard subpatterns",
+                           has_charset_modifier);
+                }
+
                 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
                     negflags |= RXf_PMf_EXTENDED_MORE;
                 }
@@ -10925,6 +10961,13 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
     }
 
     vFAIL("Sequence (?... not terminated");
+
+  modifier_illegal_in_wildcard:
+    RExC_parse++;
+    /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+       subpatterns in regex; marked by <-- HERE in m/%s/ */
+    vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
+           " subpatterns", *(RExC_parse - 1));
 }
 
 /*
@@ -12533,6 +12576,23 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
          do_curly:
            if ((flags&SIMPLE)) {
                 if (min == 0 && max == REG_INFTY) {
+
+                    /* Going from 0..inf is currently forbidden in wildcard
+                     * subpatterns.  The only reason is to make it harder to
+                     * write patterns that take a long long time to halt, and
+                     * because the use of this construct isn't necessary in
+                     * matching Unicode property values */
+                    if (RExC_pm_flags & PMf_WILDCARD) {
+                        RExC_parse++;
+                        /* diag_listed_as: Use of %s is not allowed in Unicode
+                           property wildcard subpatterns in regex; marked by
+                           <-- HERE in m/%s/ */
+                        vFAIL("Use of quantifier '*' is not allowed in"
+                              " Unicode property wildcard subpatterns");
+                        /* Note, don't need to worry about {0,}, as a '}' isn't
+                         * legal at all in wildcards, so wouldn't get this far
+                         * */
+                    }
                     reginsert(pRExC_state, STAR, ret, depth+1);
                     MARK_NAUGHTY(4);
                     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
@@ -13404,13 +13464,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        /* Special Escapes */
        case 'A':
            RExC_seen_zerolen++;
-           ret = reg_node(pRExC_state, SBOL);
-            /* SBOL is shared with /^/ so we set the flags so we can tell
-             * /\A/ from /^/ in split. */
-            FLAGS(REGNODE_p(ret)) = 1;
+            /* Under wildcards, this is changed to match \n; should be
+             * invisible to the user, as they have to compile under /m */
+            if (RExC_pm_flags & PMf_WILDCARD) {
+                ret = reg_node(pRExC_state, MBOL);
+            }
+            else {
+                ret = reg_node(pRExC_state, SBOL);
+                /* SBOL is shared with /^/ so we set the flags so we can tell
+                 * /\A/ from /^/ in split. */
+                FLAGS(REGNODE_p(ret)) = 1;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'G':
+            if (RExC_pm_flags & PMf_WILDCARD) {
+                RExC_parse++;
+                /* diag_listed_as: Use of %s is not allowed in Unicode property
+                   wildcard subpatterns in regex; marked by <-- HERE in m/%s/
+                 */
+                vFAIL("Use of '\\G' is not allowed in Unicode property"
+                      " wildcard subpatterns");
+            }
            ret = reg_node(pRExC_state, GPOS);
             RExC_seen |= REG_GPOS_SEEN;
            *flagp |= SIMPLE;
@@ -13432,12 +13507,24 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 vFAIL("\\K not permitted in lookahead/lookbehind");
             }
        case 'Z':
-           ret = reg_node(pRExC_state, SEOL);
+            if (RExC_pm_flags & PMf_WILDCARD) {
+                /* See comment under \A above */
+                ret = reg_node(pRExC_state, MEOL);
+            }
+            else {
+                ret = reg_node(pRExC_state, SEOL);
+            }
            *flagp |= SIMPLE;
            RExC_seen_zerolen++;                /* Do not optimize RE away */
            goto finish_meta_pat;
        case 'z':
-           ret = reg_node(pRExC_state, EOS);
+            if (RExC_pm_flags & PMf_WILDCARD) {
+                /* See comment under \A above */
+                ret = reg_node(pRExC_state, MEOL);
+            }
+            else {
+                ret = reg_node(pRExC_state, EOS);
+            }
            *flagp |= SIMPLE;
            RExC_seen_zerolen++;                /* Do not optimize RE away */
            goto finish_meta_pat;
@@ -16125,6 +16212,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+    PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
 
     DEBUG_PARSE("xcls");
 
@@ -17457,6 +17545,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                {
                char *e;
 
+                if (RExC_pm_flags & PMf_WILDCARD) {
+                    RExC_parse++;
+                    /* diag_listed_as: Use of %s is not allowed in Unicode
+                       property wildcard subpatterns in regex; marked by <--
+                       HERE in m/%s/ */
+                    vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
+                           " wildcard subpatterns", (char) value, *(RExC_parse - 1));
+                }
+
                /* \p means they want Unicode semantics */
                REQUIRE_UNI_RULES(flagp, 0);
 
@@ -21237,6 +21334,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
 SV *
 Perl_re_intuit_string(pTHX_ REGEXP * const r)
 {                              /* Assume that RE_INTUIT is set */
+    /* Returns an SV containing a string that must appear in the target for it
+     * to match */
+
     struct regexp *const prog = ReANY(r);
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -21245,11 +21345,12 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r)
 
     DEBUG_COMPILE_r(
        {
-           const char * const s = SvPV_nolen_const(RX_UTF8(r)
+            if (prog->maxlen > 0) {
+                const char * const s = SvPV_nolen_const(RX_UTF8(r)
                      ? prog->check_utf8 : prog->check_substr);
 
-           if (!PL_colorset) reginitcolors();
-            Perl_re_printf( aTHX_
+                if (!PL_colorset) reginitcolors();
+                Perl_re_printf( aTHX_
                      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
                      PL_colors[4],
                      RX_UTF8(r) ? "utf8 " : "",
@@ -21257,6 +21358,7 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r)
                      s,
                      PL_colors[1],
                      (strlen(s) > PL_dump_re_max_len ? "..." : ""));
+            }
        } );
 
     /* use UTF8 check substring if regexp pattern itself is in UTF8 */
@@ -22814,6 +22916,42 @@ S_get_extended_utf8_msg(pTHX_ const UV cp)
 
 #  endif
 
+STATIC REGEXP *
+S_compile_wildcard(pTHX_ const char * name, const STRLEN len,
+                         const bool ignore_case)
+{
+    U32 flags = PMf_MULTILINE|PMf_WILDCARD;
+    REGEXP * subpattern_re;
+
+    PERL_ARGS_ASSERT_COMPILE_WILDCARD;
+
+    if (ignore_case) {
+        flags |= PMf_FOLD;
+    }
+    set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
+
+    subpattern_re = re_op_compile_wrapper(sv_2mortal(newSVpvn(name, len)),
+                                        /* Like in op.c, we copy the compile
+                                         * time pm flags to the rx ones */
+                                        (flags & RXf_PMf_COMPILETIME), flags);
+
+    assert(subpattern_re);  /* Should have died if didn't compile successfully */
+    return subpattern_re;
+}
+
+STATIC I32
+S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
+        char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
+{
+    I32 result;
+
+    PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
+
+    result = pregexec(prog, stringarg, strend, strbeg, minend, screamer, nosave);
+
+    return result;
+}
+
 SV *
 Perl_handle_user_defined_property(pTHX_
 
@@ -23410,8 +23548,6 @@ Perl_parse_uniprop_string(pTHX_
             if (table_index) {
                 const char * const * prop_values
                                             = UNI_prop_value_ptrs[table_index];
-                SV * subpattern;
-                Size_t subpattern_len;
                 REGEXP * subpattern_re;
                 char open = name[i++];
                 char close;
@@ -23455,14 +23591,10 @@ Perl_parse_uniprop_string(pTHX_
                  * pattern fails to compile, our added text to the user's
                  * pattern will be displayed to the user, which is not so
                  * desirable. */
-                subpattern_len = name_len - i - 1 - escaped;
-                subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
-                                              (unsigned) subpattern_len,
-                                              name + i);
-                subpattern = sv_2mortal(subpattern);
-                subpattern_re = re_compile(subpattern, 0);
-                assert(subpattern_re);  /* Should have died if didn't compile
-                                         successfully */
+                subpattern_re = compile_wildcard(name + i,
+                                                 name_len - i - 1 - escaped,
+                                                 TRUE /* /i */
+                                                );
 
                 /* For each legal property value, see if the supplied pattern
                  * matches it. */
@@ -23471,7 +23603,7 @@ Perl_parse_uniprop_string(pTHX_
                     const Size_t len = strlen(entry);
                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
 
-                    if (pregexec(subpattern_re,
+                    if (execute_wildcard(subpattern_re,
                                  (char *) entry,
                                  (char *) entry + len,
                                  (char *) entry, 0,