This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Set flags when optimizing a [char class]
authorKarl Williamson <public@khwilliamson.com>
Thu, 9 Aug 2012 20:38:03 +0000 (14:38 -0600)
committerKarl Williamson <public@khwilliamson.com>
Sun, 12 Aug 2012 01:32:31 +0000 (19:32 -0600)
A bracketed character class containing a single Latin1-range character
has long been optimized into an EXACT node.  Also, flags are set to
include SIMPLE.  However, EXACT nodes containing code points that are
different when encoded under UTF-8 versus not UTF-8 should not be marked
simple.

To fix this, the address of the flags parameter is now passed to
regclass(), the function that parses bracketed character classes, which
now sets it appropriately.  The unconditional setting of SIMPLE that was
always done in the code after calling regclass() has been removed.

In addition, the setting of the flags for EXACT nodes has been pushed
into the common function that populates them.

regclass() will also now increment the naughtiness count if optimized to
a node that normally does that.  I do not understand this heuristic
behavior very well, and could not come up with a test case for it;
experimentation revealed that there are no test cases in our test suite
for which naughtiness makes any difference at all.

embed.fnc
embed.h
proto.h
regcomp.c
t/re/pat.t

index 2b25b3c..64bc381 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1930,14 +1930,15 @@ Es      |regnode*|regbranch     |NN struct RExC_state_t *pRExC_state \
                                |NN I32 *flagp|I32 first|U32 depth
 Es     |STRLEN |reguni         |NN const struct RExC_state_t *pRExC_state \
                                |UV uv|NN char *s
-Es     |regnode*|regclass      |NN struct RExC_state_t *pRExC_state|U32 depth
+Es     |regnode*|regclass      |NN struct RExC_state_t *pRExC_state \
+                               |NN I32 *flagp|U32 depth
 Es     |regnode*|reg_node      |NN struct RExC_state_t *pRExC_state|U8 op
 Es     |UV     |reg_recode     |const char value|NN SV **encp
 Es     |regnode*|regpiece      |NN struct RExC_state_t *pRExC_state \
                                |NN I32 *flagp|U32 depth
 Es     |bool   |grok_bslash_N  |NN struct RExC_state_t *pRExC_state \
                                |NULLOK regnode** nodep|NULLOK UV *valuep \
-                               |NULLOK I32 *flagp|U32 depth|bool in_char_class
+                               |NN I32 *flagp|U32 depth|bool in_char_class
 Es     |void   |reginsert      |NN struct RExC_state_t *pRExC_state \
                                |U8 op|NN regnode *opnd|U32 depth
 Es     |void   |regtail        |NN struct RExC_state_t *pRExC_state \
@@ -1951,7 +1952,8 @@ Es        |U32    |join_exact     |NN struct RExC_state_t *pRExC_state \
 EsRn   |char * |regwhite       |NN struct RExC_state_t *pRExC_state \
                                |NN char *p
 Ei     |void   |alloc_maybe_populate_EXACT|NN struct RExC_state_t *pRExC_state \
-                               |NN regnode *node|STRLEN len|UV code_point
+                               |NN regnode *node|NN I32 *flagp|STRLEN len \
+                               |UV code_point
 Ei     |U8   |compute_EXACTish|NN struct RExC_state_t *pRExC_state
 Es     |char * |nextchar       |NN struct RExC_state_t *pRExC_state
 Es     |bool   |reg_skipcomment|NN struct RExC_state_t *pRExC_state
diff --git a/embed.h b/embed.h
index fd03e8a..8c81ee9 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define add_alternate(a,b,c)   S_add_alternate(aTHX_ a,b,c)
 #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
 #define add_data               S_add_data
-#define alloc_maybe_populate_EXACT(a,b,c,d)    S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d)
+#define alloc_maybe_populate_EXACT(a,b,c,d,e)  S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e)
 #define checkposixcc(a)                S_checkposixcc(aTHX_ a)
 #define cl_and                 S_cl_and
 #define cl_anything            S_cl_anything
 #define reganode(a,b,c)                S_reganode(aTHX_ a,b,c)
 #define regatom(a,b,c)         S_regatom(aTHX_ a,b,c)
 #define regbranch(a,b,c,d)     S_regbranch(aTHX_ a,b,c,d)
-#define regclass(a,b)          S_regclass(aTHX_ a,b)
+#define regclass(a,b,c)                S_regclass(aTHX_ a,b,c)
 #define reginsert(a,b,c,d)     S_reginsert(aTHX_ a,b,c,d)
 #define regpiece(a,b,c)                S_regpiece(aTHX_ a,b,c)
 #define regpposixcc(a,b)       S_regpposixcc(aTHX_ a,b)
diff --git a/proto.h b/proto.h
index 0a3e547..78fd691 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6394,11 +6394,12 @@ STATIC U32      S_add_data(struct RExC_state_t *pRExC_state, U32 n, const char *s)
 #define PERL_ARGS_ASSERT_ADD_DATA      \
        assert(pRExC_state); assert(s)
 
-PERL_STATIC_INLINE void        S_alloc_maybe_populate_EXACT(pTHX_ struct RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point)
+PERL_STATIC_INLINE void        S_alloc_maybe_populate_EXACT(pTHX_ struct RExC_state_t *pRExC_state, regnode *node, I32 *flagp, STRLEN len, UV code_point)
                        __attribute__nonnull__(pTHX_1)
-                       __attribute__nonnull__(pTHX_2);
+                       __attribute__nonnull__(pTHX_2)
+                       __attribute__nonnull__(pTHX_3);
 #define PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT    \
-       assert(pRExC_state); assert(node)
+       assert(pRExC_state); assert(node); assert(flagp)
 
 STATIC void    S_checkposixcc(pTHX_ struct RExC_state_t *pRExC_state)
                        __attribute__nonnull__(pTHX_1);
@@ -6466,9 +6467,10 @@ PERL_STATIC_INLINE UV*   S_get_invlist_zero_addr(pTHX_ SV* invlist)
        assert(invlist)
 
 STATIC bool    S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
-                       __attribute__nonnull__(pTHX_1);
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_4);
 #define PERL_ARGS_ASSERT_GROK_BSLASH_N \
-       assert(pRExC_state)
+       assert(pRExC_state); assert(flagp)
 
 PERL_STATIC_INLINE UV* S_invlist_array(pTHX_ SV* const invlist)
                        __attribute__warn_unused_result__
@@ -6607,10 +6609,11 @@ STATIC regnode* S_regbranch(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp,
 #define PERL_ARGS_ASSERT_REGBRANCH     \
        assert(pRExC_state); assert(flagp)
 
-STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, U32 depth)
-                       __attribute__nonnull__(pTHX_1);
+STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_REGCLASS      \
-       assert(pRExC_state)
+       assert(pRExC_state); assert(flagp)
 
 STATIC void    S_reginsert(pTHX_ struct RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
                        __attribute__nonnull__(pTHX_1)
index 11f7f1d..1987df3 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -9866,15 +9866,23 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
 }
 
 PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point)
+S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
 {
-    /* This knows the details about sizing an EXACTish node, and potentially
-     * populating it with a single character.  If <len> is non-zero, it assumes
-     * that the node has already been populated, and just does the sizing,
-     * ignoring <code_point>.  Otherwise it looks at <code_point> and
-     * calculates what <len> should be.  In pass 1, it sizes the node
-     * appropriately.  In pass 2, it additionally will populate the node's
-     * STRING with <code_point>, if <len> is 0.
+    /* This knows the details about sizing an EXACTish node, setting flags for
+     * it (by setting <*flagp>, and potentially populating it with a single
+     * character.
+     *
+     * If <len> is non-zero, this function assumes that the node has already
+     * been populated, and just does the sizing.  In this case <code_point>
+     * should be the final code point that has already been placed into the
+     * node.  This value will be ignored except that under some circumstances
+     * <*flagp> is set based on it.
+     *
+     * If <len is zero, the function assumes that the node is to contain only
+     * the single character given by <code_point> and calculates what <len>
+     * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
+     * additionally will populate the node's STRING with <code_point>, if <len>
+     * is 0.  In both cases <*flagp> is appropriately set
      *
      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
      * folded (the latter only when the rules indicate it can match 'ss') */
@@ -9919,6 +9927,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STR
             Copy((char *) character, STRING(node), len, char);
         }
     }
+
+    *flagp |= HASWIDTH;
+    if (len == 1 && UNI_IS_INVARIANT(code_point))
+        *flagp |= SIMPLE;
 }
 
 /*
@@ -10033,13 +10045,12 @@ tryagain:
     case '[':
     {
        char * const oregcomp_parse = ++RExC_parse;
-        ret = regclass(pRExC_state,depth+1);
+        ret = regclass(pRExC_state, flagp,depth+1);
        if (*RExC_parse != ']') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched [");
        }
        nextchar(pRExC_state);
-       *flagp |= HASWIDTH|SIMPLE;
         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
        break;
     }
@@ -10250,7 +10261,7 @@ tryagain:
                }
                RExC_parse--;
 
-                ret = regclass(pRExC_state,depth+1);
+                ret = regclass(pRExC_state, flagp,depth+1);
 
                RExC_end = oldregxend;
                RExC_parse--;
@@ -10258,7 +10269,6 @@ tryagain:
                Set_Node_Offset(ret, parse_start + 2);
                Set_Node_Cur_Length(ret);
                nextchar(pRExC_state);
-               *flagp |= HASWIDTH|SIMPLE;
            }
            break;
         case 'N': 
@@ -10935,6 +10945,9 @@ tryagain:
 
        loopdone:   /* Jumped to when encounters something that shouldn't be in
                       the node */
+
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+
            RExC_parse = p - 1;
             Set_Node_Cur_Length(ret); /* MJD */
            nextchar(pRExC_state);
@@ -10944,12 +10957,7 @@ tryagain:
                if (iv < 0)
                    vFAIL("Internal disaster");
            }
-           if (len > 0)
-               *flagp |= HASWIDTH;
-           if (len == 1 && UNI_IS_INVARIANT(ender))
-               *flagp |= SIMPLE;
 
-            alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
        } /* End of label 'defchar:' */
        break;
     } /* End of giant switch on input character */
@@ -11316,7 +11324,7 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
    above 255, a range list is used */
 
 STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 {
     dVAR;
     register UV nextvalue;
@@ -11487,7 +11495,7 @@ parseit:
                     if this makes sense as it does change the behaviour
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
-                    if (! grok_bslash_N(pRExC_state, NULL, &value, NULL, depth,
+                    if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
                                       TRUE /* => charclass */))
                     {
                         goto parseit;
@@ -12106,6 +12114,7 @@ parseit:
                     if (invert) {
                         op += NALNUM - ALNUM;
                     }
+                    *flagp |= HASWIDTH|SIMPLE;
                     break;
 
                 /* The second group doesn't depend of the charset modifiers.
@@ -12116,6 +12125,7 @@ parseit:
                 case ANYOF_HORIZWS:
                   is_horizws:
                     op = (invert) ? NHORIZWS : HORIZWS;
+                    *flagp |= HASWIDTH|SIMPLE;
                     break;
 
                 case ANYOF_NVERTWS:
@@ -12123,6 +12133,7 @@ parseit:
                     /* FALLTHROUGH */
                 case ANYOF_VERTWS:
                     op = (invert) ? NVERTWS : VERTWS;
+                    *flagp |= HASWIDTH|SIMPLE;
                     break;
 
                 case ANYOF_MAX:
@@ -12162,6 +12173,8 @@ parseit:
             if (invert) {
                 if (! LOC && value == '\n') {
                     op = REG_ANY; /* Optimize [^\n] */
+                    *flagp |= HASWIDTH|SIMPLE;
+                    RExC_naughty++;
                 }
             }
             else if (value < 256 || UTF) {
@@ -12175,6 +12188,7 @@ parseit:
             if (prevvalue == '0') {
                 if (value == '9') {
                     op = (invert) ? NDIGITA : DIGITA;
+                    *flagp |= HASWIDTH|SIMPLE;
                 }
             }
         }
@@ -12208,9 +12222,10 @@ parseit:
                 if (! SIZE_ONLY) {
                     FLAGS(ret) = arg;
                 }
+                *flagp |= HASWIDTH|SIMPLE;
             }
             else if (PL_regkind[op] == EXACT) {
-                alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
             }
 
             RExC_parse = (char *) cur_parse;
@@ -12678,6 +12693,7 @@ parseit:
              * it doesn't match anything.  (perluniprops.pod notes such
              * properties) */
             op = OPFAIL;
+            *flagp |= HASWIDTH|SIMPLE;
         }
         else if (start == end) {    /* The range is a single code point */
             if (! invlist_iternext(cp_list, &start, &end)
@@ -12743,12 +12759,16 @@ parseit:
         else if (start == 0) {
             if (end == UV_MAX) {
                 op = SANY;
+                *flagp |= HASWIDTH|SIMPLE;
+                RExC_naughty++;
             }
             else if (end == '\n' - 1
                     && invlist_iternext(cp_list, &start, &end)
                     && start == '\n' + 1 && end == UV_MAX)
             {
                 op = REG_ANY;
+                *flagp |= HASWIDTH|SIMPLE;
+                RExC_naughty++;
             }
         }
 
@@ -12761,7 +12781,7 @@ parseit:
             RExC_parse = (char *)cur_parse;
 
             if (PL_regkind[op] == EXACT) {
-                alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+                alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
             }
 
             SvREFCNT_dec(listsv);
@@ -12902,6 +12922,8 @@ parseit:
        RExC_rxi->data->data[n] = (void*)rv;
        ARG_SET(ret, n);
     }
+
+    *flagp |= HASWIDTH|SIMPLE;
     return ret;
 }
 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
index 3cfb363..08f784d 100644 (file)
@@ -19,7 +19,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 452;  # Update this when adding/deleting tests.
+plan tests => 453;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1277,6 +1277,13 @@ EOP
        ok("51" =~ /$o/, "Qr_indirect bare");
     }
 
+    {   # Various flags weren't being set when a [] is optimized into an
+        # EXACTish node
+        ;
+        ;
+        ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization");
+    }
+
 } # End of sub run_tests
 
 1;