This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: remove placeholders for module changes
[perl5.git] / regexec.c
index 4ca4821..bc38839 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -93,9 +93,6 @@ static const char* const non_utf8_target_but_utf8_required
 #include "inline_invlist.c"
 #include "unicode_constants.h"
 
-#define RF_tainted     1       /* tainted information used? e.g. locale */
-#define RF_warned      2               /* warned about big count? */
-
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 
 #ifndef STATIC
@@ -144,7 +141,6 @@ static const char* const non_utf8_target_but_utf8_required
 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
         if (!swash_ptr) {                                                     \
             U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
-            ENTER; save_re_context();                                         \
             swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
                                          1, 0, NULL, &flags);                 \
             assert(swash_ptr);                                                \
@@ -439,19 +435,21 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
     switch ((_char_class_number) classnum) {
         case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
+        case _CC_ENUM_ASCII:     return isASCII_LC(character);
+        case _CC_ENUM_BLANK:     return isBLANK_LC(character);
+        case _CC_ENUM_CASED:     return isLOWER_LC(character)
+                                        || isUPPER_LC(character);
+        case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
+        case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
+        case _CC_ENUM_SPACE:     return isSPACE_LC(character);
         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
-        case _CC_ENUM_SPACE:     return isSPACE_LC(character);
-        case _CC_ENUM_BLANK:     return isBLANK_LC(character);
         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
-        case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
-        case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
-        case _CC_ENUM_ASCII:     return isASCII_LC(character);
         default:    /* VERTSPACE should never occur in locales */
             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
     }
@@ -492,7 +490,9 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
                 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
         }
 
-        return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
+        return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
+                                 character,
+                                 TRUE /* is UTF */ ));
     }
 
     switch ((_char_class_number) classnum) {
@@ -1226,8 +1226,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
                               ? (utf8_target ? trie_utf8 : trie_plain) \
                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
 
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
-uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
+STMT_START {                               \
     STRLEN skiplen;                                                                 \
     switch (trie_type) {                                                            \
     case trie_utf8_fold:                                                            \
@@ -1460,6 +1460,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     /* We know what class it must start with. */
     switch (OP(c)) {
     case ANYOF:
+    case ANYOF_SYNTHETIC:
+    case ANYOF_WARN_SUPER:
         if (utf8_target) {
             REXEC_FBC_UTF8_CLASS_SCAN(
                       reginclass(prog, c, (U8*)s, utf8_target));
@@ -1617,20 +1619,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     }
     case BOUNDL:
-        PL_reg_flags |= RF_tainted;
-        FBC_BOUND(isALNUM_LC,
-                  isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                  isALNUM_LC_utf8((U8*)s));
+        RXp_MATCH_TAINTED_on(prog);
+        FBC_BOUND(isWORDCHAR_LC,
+                  isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                  isWORDCHAR_LC_utf8((U8*)s));
         break;
     case NBOUNDL:
-        PL_reg_flags |= RF_tainted;
-        FBC_NBOUND(isALNUM_LC,
-                   isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                   isALNUM_LC_utf8((U8*)s));
+        RXp_MATCH_TAINTED_on(prog);
+        FBC_NBOUND(isWORDCHAR_LC,
+                   isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                   isWORDCHAR_LC_utf8((U8*)s));
         break;
     case BOUND:
         FBC_BOUND(isWORDCHAR,
-                  isALNUM_uni(tmp),
+                  isWORDCHAR_uni(tmp),
                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case BOUNDA:
@@ -1640,7 +1642,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     case NBOUND:
         FBC_NBOUND(isWORDCHAR,
-                   isALNUM_uni(tmp),
+                   isWORDCHAR_uni(tmp),
                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case NBOUNDA:
@@ -1650,12 +1652,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         break;
     case BOUNDU:
         FBC_BOUND(isWORDCHAR_L1,
-                  isALNUM_uni(tmp),
+                  isWORDCHAR_uni(tmp),
                   cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case NBOUNDU:
         FBC_NBOUND(isWORDCHAR_L1,
-                   isALNUM_uni(tmp),
+                   isWORDCHAR_uni(tmp),
                    cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
         break;
     case LNBREAK:
@@ -1672,7 +1674,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         /* FALLTHROUGH */
 
     case POSIXL:
-        PL_reg_flags |= RF_tainted;
+        RXp_MATCH_TAINTED_on(prog);
         REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
         break;
@@ -2105,11 +2107,12 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
        Perl_croak(aTHX_ "corrupted regexp program");
     }
 
-    PL_reg_flags = 0;
+    RX_MATCH_TAINTED_off(rx);
     PL_reg_state.re_state_eval_setup_done = FALSE;
     PL_reg_maxiter = 0;
 
     reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
+    reginfo.warned = FALSE;
     /* Mark beginning of line for ^ and lookbehind. */
     reginfo.bol = startpos; /* XXX not used ??? */
     PL_bostr  = strbeg;
@@ -2590,7 +2593,6 @@ got_it:
            );
     );
     Safefree(swap);
-    RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
 
     if (PL_reg_state.re_state_eval_setup_done)
        restore_pos(aTHX_ prog);
@@ -4125,7 +4127,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            const char * s;
            U32 fold_utf8_flags;
 
-           PL_reg_flags |= RF_tainted;
+            RX_MATCH_TAINTED_on(reginfo->prog);
             folder = foldEQ_locale;
             fold_array = PL_fold_locale;
            fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
@@ -4189,7 +4191,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
         * have to set the FLAGS fields of these */
        case BOUNDL:  /*  /\b/l  */
        case NBOUNDL: /*  /\B/l  */
-           PL_reg_flags |= RF_tainted;
+            RX_MATCH_TAINTED_on(reginfo->prog);
            /* FALL THROUGH */
        case BOUND:   /*  /\b/   */
        case BOUNDU:  /*  /\b/u  */
@@ -4210,7 +4212,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                    ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
                }
                if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
-                   ln = isALNUM_uni(ln);
+                   ln = isWORDCHAR_uni(ln);
                     if (NEXTCHR_IS_EOS)
                         n = 0;
                     else {
@@ -4220,8 +4222,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                     }
                }
                else {
-                   ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
-                   n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
+                   ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+                   n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
                }
            }
            else {
@@ -4245,12 +4247,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                        n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
                        break;
                    case REGEX_LOCALE_CHARSET:
-                       ln = isALNUM_LC(ln);
-                       n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
+                       ln = isWORDCHAR_LC(ln);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
                        break;
                    case REGEX_DEPENDS_CHARSET:
-                       ln = isALNUM(ln);
-                       n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
+                       ln = isWORDCHAR(ln);
+                       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
                        break;
                    case REGEX_ASCII_RESTRICTED_CHARSET:
                    case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
@@ -4269,6 +4271,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            break;
 
        case ANYOF:  /*  /[abc]/       */
+       case ANYOF_WARN_SUPER:
             if (NEXTCHR_IS_EOS)
                 sayNO;
            if (utf8_target) {
@@ -4296,19 +4299,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
             /* The locale hasn't influenced the outcome before this, so defer
              * tainting until now */
-            PL_reg_flags |= RF_tainted;
+            RX_MATCH_TAINTED_on(reginfo->prog);
 
             /* Use isFOO_lc() for characters within Latin1.  (Note that
              * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
              * wouldn't be invariant) */
             if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
-                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), nextchr)))) {
+                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
                     sayNO;
                 }
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
-                                        TWO_BYTE_UTF8_TO_UNI(nextchr,
+                                        (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
                                                             *(locinput + 1))))))
                 {
                     sayNO;
@@ -4670,7 +4673,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            const U8 *fold_array;
            UV utf8_fold_flags;
 
-           PL_reg_flags |= RF_tainted;
+            RX_MATCH_TAINTED_on(reginfo->prog);
            folder = foldEQ_locale;
            fold_array = PL_fold_locale;
            type = REFFL;
@@ -4715,7 +4718,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            goto do_nref_ref_common;
 
        case REFFL:  /*  /\1/il  */
-           PL_reg_flags |= RF_tainted;
+            RX_MATCH_TAINTED_on(reginfo->prog);
            folder = foldEQ_locale;
            fold_array = PL_fold_locale;
            utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
@@ -4875,8 +4878,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 */
                Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
 
-               PL_reg_state.re_reparsing = FALSE;
-
                if (!caller_cv)
                    caller_cv = find_runcv(NULL);
 
@@ -4910,12 +4911,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 * points to newcv's pad. */
                if (newcv != last_pushed_cv || PL_comppad != last_pad)
                {
-                   I32 depth = (newcv == caller_cv) ? 0 : 1;
+                    U8 flags = (CXp_SUB_RE |
+                                ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
                    if (last_pushed_cv) {
-                       CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
+                       CHANGE_MULTICALL_FLAGS(newcv, flags);
                    }
                    else {
-                       PUSH_MULTICALL_WITHDEPTH(newcv, depth);
+                       PUSH_MULTICALL_FLAGS(newcv, flags);
                    }
                    last_pushed_cv = newcv;
                }
@@ -5552,9 +5554,9 @@ NULL
          do_whilem_B_max:
            if (cur_curlyx->u.curlyx.count >= REG_INFTY
                && ckWARN(WARN_REGEXP)
-               && !(PL_reg_flags & RF_warned))
+               && !reginfo->warned)
            {
-               PL_reg_flags |= RF_warned;
+                reginfo->warned        = TRUE;
                Perl_warner(aTHX_ packWARN(WARN_REGEXP),
                     "Complex regular subexpression recursion limit (%d) "
                     "exceeded",
@@ -5577,9 +5579,9 @@ NULL
                /* Maximum greed exceeded */
                if (cur_curlyx->u.curlyx.count >= REG_INFTY
                    && ckWARN(WARN_REGEXP)
-                   && !(PL_reg_flags & RF_warned))
+                    && !reginfo->warned)
                {
-                   PL_reg_flags |= RF_warned;
+                    reginfo->warned    = TRUE;
                    Perl_warner(aTHX_ packWARN(WARN_REGEXP),
                        "Complex regular subexpression recursion "
                        "limit (%d) exceeded",
@@ -6639,7 +6641,7 @@ no_silent:
  * depth     - (for debugging) backtracking depth.
  */
 STATIC I32
-S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p,
+S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                 I32 max, int depth, bool is_utf8_pat)
 {
     dVAR;
@@ -6788,7 +6790,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p,
        goto do_exactf;
 
     case EXACTFL:
-       PL_reg_flags |= RF_tainted;
+        RXp_MATCH_TAINTED_on(prog);
        utf8_flags = FOLDEQ_UTF8_LOCALE;
        goto do_exactf;
 
@@ -6861,6 +6863,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p,
        break;
     }
     case ANYOF:
+    case ANYOF_WARN_SUPER:
        if (utf8_target) {
            while (hardcount < max
                    && scan < loceol
@@ -6882,7 +6885,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p,
         /* FALLTHROUGH */
 
     case POSIXL:
-       PL_reg_flags |= RF_tainted;
+        RXp_MATCH_TAINTED_on(prog);
        if (! utf8_target) {
            while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
                                                                    *scan)))
@@ -7269,7 +7272,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit
  */
 
 STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
+S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
@@ -7302,7 +7305,7 @@ S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8*
            match = TRUE;
        }
        else if (flags & ANYOF_LOCALE) {
-           PL_reg_flags |= RF_tainted;
+           RXp_MATCH_TAINTED_on(prog);
 
            if ((flags & ANYOF_LOC_FOLD)
                 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
@@ -7329,7 +7332,17 @@ S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8*
                  * will be 1, so the exclusive or will reverse things, so we
                  * are testing for \W.  On the third iteration, 'to_complement'
                  * will be 0, and we would be testing for \s; the fourth
-                 * iteration would test for \S, etc. */
+                 * iteration would test for \S, etc.
+                 *
+                 * Note that this code assumes that all the classes are closed
+                 * under folding.  For example, if a character matches \w, then
+                 * its fold does too; and vice versa.  This should be true for
+                 * any well-behaved locale for all the currently defined Posix
+                 * classes, except for :lower: and :upper:, which are handled
+                 * by the pseudo-class :cased: which matches if either of the
+                 * other two does.  To get rid of this assumption, an outer
+                 * loop could be used below to iterate over both the source
+                 * character, and its fold (if different) */
 
                 int count = 0;
                 int to_complement = 0;
@@ -7365,7 +7378,7 @@ S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8*
                     || (utf8_target
                         && (c >=256
                             || (! (flags & ANYOF_LOCALE))
-                            || (flags & ANYOF_IS_SYNTHETIC)))))
+                            || OP(n) == ANYOF_SYNTHETIC))))
        {
            SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
            if (sw) {
@@ -7387,7 +7400,7 @@ S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8*
        }
 
         if (UNICODE_IS_SUPER(c)
-            && (flags & ANYOF_WARN_SUPER)
+            && OP(n) == ANYOF_WARN_SUPER
             && ckWARN_d(WARN_NON_UNICODE))
         {
             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),