This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Remove duplicate statement
[perl5.git] / regcomp.c
index 5ec58cc..624a1b6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1475,8 +1475,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                     TRIE_STORE_REVCHAR;
                 }
                 if ( set_bit ) {
-                    /* store the codepoint in the bitmap, and if its ascii
-                       also store its folded equivelent. */
+                   /* store the codepoint in the bitmap, and its folded
+                    * equivalent. */
                     TRIE_BITMAP_SET(trie,uvc);
 
                    /* store the folded codepoint */
@@ -2451,8 +2451,11 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
        }
 #endif
     }
-    
-    if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
+
+    if (UTF
+       && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+       && ( STR_LEN(scan) >= 6 ) )
+    {
     /*
     Two problematic code points in Unicode casefolding of EXACT nodes:
     
@@ -3074,7 +3077,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
                    && (!(data->start_class->flags & ANYOF_FOLD)
-                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+                       || !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
                     )
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
@@ -3120,7 +3123,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (uc >= 0x100 ||
                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                    && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+                    && !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
                ANYOF_BITMAP_ZERO(data->start_class);
@@ -3128,16 +3131,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    ANYOF_BITMAP_SET(data->start_class, uc);
                    data->start_class->flags &= ~ANYOF_EOS;
                    data->start_class->flags |= ANYOF_FOLD;
-                   if (OP(scan) == EXACTFL)
+                   if (OP(scan) == EXACTFL) {
                        data->start_class->flags |= ANYOF_LOCALE;
+                   }
+                   else {
+
+                       /* Also set the other member of the fold pair.  Can't
+                        * do this for locale, because not known until runtime
+                        */
+                       ANYOF_BITMAP_SET(data->start_class,
+                                        (OP(scan) == EXACTFU)
+                                                   ? PL_fold_latin1[uc]
+                                                   : PL_fold[uc]);
+                   }
                }
            }
            else if (flags & SCF_DO_STCLASS_OR) {
                if (data->start_class->flags & ANYOF_FOLD) {
                    /* false positive possible if the class is case-folded.
                       Assume that the locale settings are the same... */
-                   if (uc < 0x100)
+                   if (uc < 0x100) {
                        ANYOF_BITMAP_SET(data->start_class, uc);
+                        if (OP(scan) != EXACTFL) {
+
+                            /* And set the other member of the fold pair, but
+                             * can't do that in locale because not known until
+                             * run-time */
+                            ANYOF_BITMAP_SET(data->start_class,
+                                            (OP(scan) == EXACTFU)
+                                                        ? PL_fold_latin1[uc]
+                                                        : PL_fold[uc]);
+                        }
+                   }
                    data->start_class->flags &= ~ANYOF_EOS;
                }
                cl_and(data->start_class, and_withp);
@@ -3221,13 +3246,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    f |= SCF_DO_STCLASS_AND;
                    f &= ~SCF_DO_STCLASS_OR;
                }
-               /* These are the cases when once a subexpression
-                  fails at a particular position, it cannot succeed
-                  even after backtracking at the enclosing scope.
-
-                  XXXX what if minimal match and we are at the
-                       initial run of {n,m}? */
-               if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
+               /* Exclude from super-linear cache processing any {n,m}
+                  regops for which the combination of input pos and regex
+                  pos is not enough information to determine if a match
+                  will be possible.
+
+                  For example, in the regex /foo(bar\s*){4,8}baz/ with the
+                  regex pos at the \s*, the prospects for a match depend not
+                  only on the input position but also on how many (bar\s*)
+                  repeats into the {4,8} we are. */
+               if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
                    f &= ~SCF_WHILEM_VISITED_POS;
 
                /* This will finish on WHILEM, setting scan, or on NULL: */
@@ -4724,7 +4752,7 @@ reStudy:
        if (PL_regkind[OP(first)] == EXACT) {
            if (OP(first) == EXACT)
                NOOP;   /* Empty, get anchored substr later. */
-           else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
+           else
                ri->regstclass = first;
        }
 #ifdef TRIE_STCLASS    
@@ -6074,7 +6102,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
                    ENTER;
                    Perl_save_re_context(aTHX);
-                   rop = sv_compile_2op(sv, &sop, "re", &pad);
+                   rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
                    sop->op_private |= OPpREFCOUNTED;
                    /* re_dup will OpREFCNT_inc */
                    OpREFCNT_set(sop, 1);
@@ -6240,7 +6268,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                       that follow */
                 has_use_defaults = TRUE;
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
                 goto parse_flags;
            default:
                --RExC_parse;
@@ -6978,8 +7005,12 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
 
-       ret = reg_node(pRExC_state,
-                       (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+       ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
+                                                  : (LOC)
+                                                     ? EXACTFL
+                                                     : UNI_SEMANTICS
+                                                       ? EXACTFU
+                                                       : EXACTF));
        s= STRING(ret);
 
        /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
@@ -7585,7 +7616,13 @@ tryagain:
        defchar:
            ender = 0;
            ret = reg_node(pRExC_state,
-                          (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+                          (U8) ((! FOLD) ? EXACT
+                                         : (LOC)
+                                            ? EXACTFL
+                                            : (UNI_SEMANTICS)
+                                              ? EXACTFU
+                                              : EXACTF)
+                   );
            s = STRING(ret);
            for (len = 0, p = RExC_parse - 1;
              len < 127 && p < RExC_end;
@@ -8083,14 +8120,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
-               ANYOF_BITMAP_SET(ret, value);           \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
-               ANYOF_BITMAP_SET(ret, value);           \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -8104,12 +8141,14 @@ ANYOF_##NAME:                                           \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
-            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+            if (TEST_8) stored +=                       \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 256; value++) {         \
-            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+            if (TEST_7) stored +=                       \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     yesno = '+';                                        \
@@ -8119,12 +8158,14 @@ case ANYOF_N##NAME:                                     \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
-            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+            if (! TEST_8) stored +=                     \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 256; value++) {         \
-            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+            if (! TEST_7) stored +=                     \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     yesno = '!';                                        \
@@ -8147,6 +8188,78 @@ case ANYOF_N##NAME:                                     \
 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
 #endif
 
+STATIC U8
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+{
+
+    /* Set the bit for the case fold of 'value' in the bitmap of a non-locale
+     * ANYOF node.  Locale folds are not known until run-time, so this
+     * function should not be called for nodes that are for locales.
+     *
+     * The fold used is PL_fold_latin1[value] under unicode semantics, and
+     * PL_fold[value] otherwise; e.g. the fold of 'f' is 'F', and the fold of
+     * 'F' is 'f'.  The caller must already have set the bit for 'value'.
+     *
+     * It also sets any necessary flags in 'node', and returns the number of
+     * bits (0 or 1) that actually changed from 0 to 1 */
+
+    U8 stored = 0;
+    U8 fold;
+
+    fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+                           : PL_fold[value];
+
+    /* The bit for 'value' is assumed set, so only a distinct fold is new */
+    if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
+        ANYOF_BITMAP_SET(node, fold);
+        stored++;
+    }
+
+    /* The fold of the German sharp s is two ASCII characters, so isn't in the
+     * bitmap and doesn't have to be in utf8, but we only process it if unicode
+     * semantics are called for */
+    if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
+       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+    }
+    else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
+            || (! UNI_SEMANTICS
+                 && ! isASCII(value)
+                 && PL_fold_latin1[value] != value))
+    {   /* A character that has a fold outside of Latin1 matches outside the
+           bitmap, but only when the target string is utf8.  Similarly, when
+           we don't have unicode semantics, the above-ASCII Latin-1 characters
+           that have a fold should match if the target is utf8, and
+           not otherwise */
+       ANYOF_FLAGS(node) |= ANYOF_UTF8;
+    }
+
+    return stored;
+}
+
+
+PERL_STATIC_INLINE U8
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+{
+    /* This inline function sets the bit for 'value' in the bitmap of 'node'
+     * if not already set, and, when folding outside of locale, the bit for
+     * its fold.  It returns the number of bits that changed from 0 to 1 */
+
+    U8 stored;
+
+    if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
+       return 0;
+    }
+
+    ANYOF_BITMAP_SET(node, value);
+    stored = 1;
+
+    if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
+       stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+    }
+
+    return stored;
+}
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -8321,6 +8434,9 @@ parseit:
                        (value=='p' ? '+' : '!'), (int)n, RExC_parse);
                }
                RExC_parse = e + 1;
+
+               /* The \p could match something in the Latin1 range, hence
+                * something that isn't utf8 */
                ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
                namedclass = ANYOF_MAX;  /* no official name, but it's named */
                }
@@ -8441,8 +8557,10 @@ parseit:
                               w, w, rangebegin);
 
                    if (prevvalue < 256) {
-                       ANYOF_BITMAP_SET(ret, prevvalue);
-                       ANYOF_BITMAP_SET(ret, '-');
+                       stored +=
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+                       stored +=
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -8496,11 +8614,12 @@ parseit:
                    else {
 #ifndef EBCDIC
                        for (value = 0; value < 128; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
 #else  /* EBCDIC */
                        for (value = 0; value < 256; value++) {
                            if (isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        }
 #endif /* EBCDIC */
                    }
@@ -8513,11 +8632,12 @@ parseit:
                    else {
 #ifndef EBCDIC
                        for (value = 128; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
 #else  /* EBCDIC */
                        for (value = 0; value < 256; value++) {
                            if (!isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        }
 #endif /* EBCDIC */
                    }
@@ -8530,7 +8650,8 @@ parseit:
                    else {
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8541,9 +8662,11 @@ parseit:
                    else {
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        for (value = '9' + 1; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8594,7 +8717,8 @@ parseit:
                               w, w, rangebegin);
                    }
                    if (!SIZE_ONLY)
-                       ANYOF_BITMAP_SET(ret, '-');
+                       stored +=
+                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                } else
                    range = 1;  /* yeah, it's a range! */
                continue;       /* but do it the next time */
@@ -8617,14 +8741,14 @@ parseit:
                    if (isLOWER(prevvalue)) {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
-                               stored++;
-                               ANYOF_BITMAP_SET(ret, i);
+                               stored +=
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
-                               stored++;
-                               ANYOF_BITMAP_SET(ret, i);
+                               stored +=
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                            }
                    }
                }
@@ -8632,8 +8756,8 @@ parseit:
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
                        if (!ANYOF_BITMAP_TEST(ret,i)) {
-                           stored++;  
-                           ANYOF_BITMAP_SET(ret, i);
+                           stored +=
+                                S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                        }
                      }
          }
@@ -8741,8 +8865,14 @@ parseit:
     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
     ) {
-        /* optimize single char class to an EXACT node
-           but *only* when its not a UTF/high char  */
+       /* optimize single char class to an EXACT node but *only* when it's not
+        * a UTF/high char.  Note that the information needed to decide to do
+        * this optimization is not currently available until the 2nd pass, and
+        * that the actually used EXACT node takes less space than the
+        * calculated ANYOF node, and hence the amount of space calculated in
+         * the first pass is larger than actually used.  Currently we don't
+         * keep track of enough information to do this for nodes which contain
+         * matches outside the bitmap */
         const char * cur_parse= RExC_parse;
         RExC_emit = (regnode *)orig_emit;
         RExC_parse = (char *)orig_parse;
@@ -9171,6 +9301,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
             switch (OP(scan)) {
                 case EXACT:
                 case EXACTF:
+                case EXACTFU:
                 case EXACTFL:
                         if( exact == PSEUDO )
                             exact= OP(scan);
@@ -9552,7 +9683,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
            sv_catpvs(sv, "{unicode_all}");
        else if (flags & ANYOF_UTF8)
            sv_catpvs(sv, "{unicode}");
-       else if (flags & ANYOF_NONBITMAP)
+       if (flags & ANYOF_NONBITMAP_NON_UTF8)
            sv_catpvs(sv, "{outside bitmap}");
 
        {
@@ -10258,8 +10389,14 @@ S_put_byte(pTHX_ SV *sv, int c)
        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
        identical, to the ASCII delete (DEL) or rubout control character.
        ) So the old condition can be simplified to !isPRINT(c)  */
-    if (!isPRINT(c))
-       Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+    if (!isPRINT(c)) {
+       if (c < 256) {
+           Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
+       }
+       else {
+           Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+       }
+    }
     else {
        const char string = c;
        if (c == '-' || c == ']' || c == '\\' || c == '^')