This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re.t: Avoid encoding issues by using hex chars
[perl5.git] / regcomp.c
index 50634d2..fb9c606 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -763,8 +763,9 @@ S_cl_and(struct regnode_charclass_class *cl,
     PERL_ARGS_ASSERT_CL_AND;
 
     assert(and_with->type == ANYOF);
-    if (!(and_with->flags & ANYOF_CLASS)
-       && !(cl->flags & ANYOF_CLASS)
+
+    if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+       && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
        && !(and_with->flags & ANYOF_FOLD)
        && !(cl->flags & ANYOF_FOLD)) {
@@ -837,7 +838,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
            /* OR char bitmap and class bitmap separately */
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] |= or_with->bitmap[i];
-           if (or_with->flags & ANYOF_CLASS) {
+           if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
                for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
                    cl->classflags[i] |= or_with->classflags[i];
                cl->flags |= ANYOF_CLASS;
@@ -3609,8 +3610,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        goto do_default;
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ((data->start_class->flags & ANYOF_CLASS)
-                                     && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+                                || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                        cl_anything(pRExC_state, data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
@@ -8179,14 +8179,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -8201,13 +8201,13 @@ ANYOF_##NAME:                                           \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (TEST_8) stored +=                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 128; value++) {         \
             if (TEST_7) stored +=                       \
-                       S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
         }                                               \
     }                                                   \
     yesno = '+';                                        \
@@ -8218,16 +8218,16 @@ case ANYOF_N##NAME:                                     \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (! TEST_8) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 128; value++) {         \
             if (! TEST_7) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
         for (value = 128; value < 256; value++) {         \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     yesno = '!';                                        \
@@ -8377,12 +8377,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
     if (SIZE_ONLY) {
        RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+       if (LOC) {
+           RExC_size += ANYOF_ADD_LOC_SKIP;
+       }
+#endif
        listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
     }
     else {
        RExC_emit += ANYOF_SKIP;
-       if (LOC)
+       if (LOC) {
            ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+           RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+       }
        ANYOF_BITMAP_ZERO(ret);
        listsv = newSVpvs("# comment\n");
     }
@@ -8596,10 +8605,14 @@ parseit:
            if (LOC && namedclass < ANYOF_MAX && ! need_class) {
                need_class = 1;
                if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
                }
                else {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
                    ANYOF_CLASS_ZERO(ret);
                }
                ANYOF_FLAGS(ret) |= ANYOF_CLASS;
@@ -8617,7 +8630,7 @@ parseit:
 
                    if (prevvalue < 256) {
                        stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
                        stored +=
                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                    }
@@ -8671,7 +8684,7 @@ parseit:
                    else {
                        for (value = 0; value < 128; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '+';
                    what = NULL;        /* Doesn't match outside ascii, so
@@ -8683,7 +8696,7 @@ parseit:
                    else {
                        for (value = 128; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '!';
                    what = "ASCII";
@@ -8695,7 +8708,7 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8707,10 +8720,10 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                        for (value = '9' + 1; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8780,20 +8793,20 @@ parseit:
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    }
                }
                else
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
-                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                      }
          }
          if (value > 255 || UTF) {
@@ -8916,26 +8929,83 @@ parseit:
        ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
     }
 
-    if( stored == 1 && (value < 128 || (value < 256 && !UTF))
-        && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
-    ) {
-       /* optimize single char class to an EXACT node but *only* when its not
-        * a UTF/high char.  Note that the information needed to decide to do
-        * this optimization is not currently available until the 2nd pass, and
-        * that the actually used EXACT node takes less space than the
-        * calculated ANYOF node, and hence the amount of space calculated in
-         * the first pass is larger than actually used.  Currently we don't
-         * keep track of enough information to do this for nodes which contain
-         * matches outside the bitmap */
+    /* A single character class can be "optimized" into an EXACTish node.
+     * Note that since we don't currently count how many characters there are
+     * outside the bitmap, we are XXX missing optimization possibilities for
+     * them.  This optimization can't happen unless this is a truly single
+     * character class, which means that it can't be an inversion into a
+     * many-character class, and there must be no possibility of there being
+     * things outside the bitmap.  'stored' (only) for locales doesn't include
+     * \w, etc, so have to make a special test that they aren't present
+     *
+     * Similarly A 2-character class of the very special form like [bB] can be
+     * optimized into an EXACTFish node, but only for non-locales, and for
+     * characters which only have the two folds; so things like 'fF' and 'Ii'
+     * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+     * FI'. */
+    if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+        && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                              || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+           || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                                && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+                                /* If the latest code point has a fold whose
+                                 * bit is set, it must be the only other one */
+                               && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+                                && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+    {
+        /* Note that the information needed to decide to do this optimization
+         * is not currently available until the 2nd pass, and that the actually
+        * used EXACTish node takes less space than the calculated ANYOF node,
+        * and hence the amount of space calculated in the first pass is larger
+         * than actually used, so this optimization doesn't gain us any space.
+        * But an EXACT node is faster than an ANYOF node, and can be combined
+        * with any adjacent EXACT nodes later by the optimizer for further
+        * gains.  The speed of executing an EXACTF is similar to an ANYOF
+        * node, so the optimization advantage comes from the ability to join
+        * it to adjacent EXACT nodes */
+
         const char * cur_parse= RExC_parse;
+       U8 op;
         RExC_emit = (regnode *)orig_emit;
         RExC_parse = (char *)orig_parse;
-        ret = reg_node(pRExC_state,
-                       (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
+
+       if (stored == 1) {
+
+           /* A locale node with one point can be folded; all the other cases
+            * with folding will have two points, since we calculate them above
+            */
+           if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
+                op = EXACTFL;
+           }
+           else {
+               op = EXACT;
+           }
+       }   /* else 2 chars in the bit map: the folds of each other */
+       else if (UNI_SEMANTICS || !isASCII(value)) {
+
+           /* To join adjacent nodes, they must be the exact EXACTish type.
+            * Try to use the most likely type, by using EXACTFU if the regex
+            * calls for them, or is required because the character is
+            * non-ASCII */
+           op = EXACTFU;
+       }
+       else {    /* Otherwise, more likely to be EXACTF type */
+           op = EXACTF;
+       }
+
+       ret = reg_node(pRExC_state, op);
         RExC_parse = (char *)cur_parse;
-        *STRING(ret)= (char)value;
-        STR_LEN(ret)= 1;
-        RExC_emit += STR_SZ(1);
+       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+           STR_LEN(ret)= 2;
+           RExC_emit += STR_SZ(2);
+       }
+       else {
+           *STRING(ret)= (char)value;
+           STR_LEN(ret)= 1;
+           RExC_emit += STR_SZ(1);
+       }
        SvREFCNT_dec(listsv);
         return ret;
     }
@@ -9700,8 +9770,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
        }
         
         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
-        /* output any special charclass tests (used mostly under use locale) */
-       if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+        /* output any special charclass tests (used entirely under use locale) */
+       if (ANYOF_CLASS_TEST_ANY_SET(o))
            for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
                if (ANYOF_CLASS_TEST(o,i)) {
                    sv_catpv(sv, anyofs[i]);