Add interpolations to regex sets
authorKarl Williamson <public@khwilliamson.com>
Sun, 3 Feb 2013 16:29:32 +0000 (09:29 -0700)
committerKarl Williamson <public@khwilliamson.com>
Mon, 4 Feb 2013 04:41:28 +0000 (21:41 -0700)
This commit adds the capability for '(?[ ])' to contain interpolated
variables from other '(?[ ])' constructs.  A set operation can thus be
built up from the composition of other ones, without having to worry
about precedence, etc.

Thanks to Aaron Crane for suggesting this.

embed.fnc
embed.h
pod/perldelta.pod
pod/perlre.pod
proto.h
regcomp.c
t/porting/diag.t
t/re/reg_mesg.t
t/re/regex_sets.t

index cb8e5e8..31ce911 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1979,6 +1979,7 @@ Es        |regnode*|regclass      |NN struct RExC_state_t *pRExC_state \
                                |NULLOK SV** ret_invlist
 Es     |bool|could_it_be_a_POSIX_class|NN struct RExC_state_t *pRExC_state
 Es     |regnode*|handle_regex_sets|NN struct RExC_state_t *pRExC_state \
+                               |NULLOK SV ** return_invlist            \
                                |NN I32 *flagp|U32 depth                \
                                |NN char * const oregcomp_parse
 Es     |void|parse_lparen_question_flags|NN struct RExC_state_t *pRExC_state
diff --git a/embed.h b/embed.h
index e9ff0e8..0460505 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a)
 #define get_invlist_zero_addr(a)       S_get_invlist_zero_addr(aTHX_ a)
 #define grok_bslash_N(a,b,c,d,e,f,g)   S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
-#define handle_regex_sets(a,b,c,d)     S_handle_regex_sets(aTHX_ a,b,c,d)
+#define handle_regex_sets(a,b,c,d,e)   S_handle_regex_sets(aTHX_ a,b,c,d,e)
 #define invlist_array(a)       S_invlist_array(aTHX_ a)
 #define invlist_clone(a)       S_invlist_clone(aTHX_ a)
 #define invlist_extend(a,b)    S_invlist_extend(aTHX_ a,b)
index 6474b03..746390f 100644 (file)
@@ -27,6 +27,16 @@ here, but most should go in the L</Performance Enhancements> section.
 
 [ List each enhancement as a =head2 entry ]
 
+=head2 Interpolations now Accepted in Regular Expression Set Operations
+
+Perl v5.17.8 introduced L<regular expression set operations|perlre/(?[ ])>.
+They have now been expanded to allow the interpolation of a
+previously-compiled set into a bigger set, like this:
+
+ my $thai_or_lao = '\p{Thai} + \p{Lao}';
+ ...
+ qr/(?[ \p{Digit} & $thai_or_lao ])/;
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
index 414ba38..ae20e4d 100644 (file)
@@ -1893,6 +1893,32 @@ construct.  There must not be any space between any of the characters
 that form the initial C<(?[>.  Nor may there be space between the
 closing C<])> characters.
 
+Just as in all regular expressions, the pattern can can be built up by
+including variables that are interpolated at regex compilation time.
+Care must be taken to ensure that you are getting what you expect.  For
+example:
+
+ my $thai_or_lao = '\p{Thai} + \p{Lao}';
+ ...
+ qr/(?[ \p{Digit} & $thai_or_lao ])/;
+
+compiles to
+
+ qr/(?[ \p{Digit} & \p{Thai} + \p{Lao} ])/;
+
+But this does not have the effect that someone reading the code would
+likely expect, as the intersection applies just to C<\p{Thai}>,
+excluding the Laotian.  Pitfalls like this can be avoided by
+parenthesizing the component pieces:
+
+ my $thai_or_lao = '( \p{Thai} + \p{Lao} )';
+
+Or you can make the components into instances of this construct by
+compiling them:
+
+ my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+
+which causes the parenthesization to be done automatically for you.
 
 Due to the way that Perl parses things, your parentheses and brackets
 may need to be balanced, even including comments.  If you run into any
diff --git a/proto.h b/proto.h
index 40f2953..a2cf682 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6516,10 +6516,10 @@ STATIC bool     S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** no
 #define PERL_ARGS_ASSERT_GROK_BSLASH_N \
        assert(pRExC_state); assert(flagp)
 
-STATIC regnode*        S_handle_regex_sets(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char * const oregcomp_parse)
+STATIC regnode*        S_handle_regex_sets(pTHX_ struct RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse)
                        __attribute__nonnull__(pTHX_1)
-                       __attribute__nonnull__(pTHX_2)
-                       __attribute__nonnull__(pTHX_4);
+                       __attribute__nonnull__(pTHX_3)
+                       __attribute__nonnull__(pTHX_5);
 #define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS     \
        assert(pRExC_state); assert(flagp); assert(oregcomp_parse)
 
index 53589ad..a8b27dc 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -9110,7 +9110,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                }
            }
            case '[':           /* (?[ ... ]) */
-                return handle_regex_sets(pRExC_state, flagp, depth,
+                return handle_regex_sets(pRExC_state, NULL, flagp, depth,
                                          oregcomp_parse);
             case 0:
                RExC_parse--; /* for vFAIL to print correctly */
@@ -11417,7 +11417,7 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
 }
 
 STATIC regnode *
-S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
+S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
                    char * const oregcomp_parse)
 {
     /* Handle the (?[...]) construct to do set operations */
@@ -11576,6 +11576,72 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
         switch (curchar) {
 
+            case '?':
+                if (av_top(stack) >= 0   /* This makes sure that we can
+                                               safely subtract 1 from
+                                               RExC_parse in the next clause.
+                                               If we have something on the
+                                               stack, we have parsed something
+                                             */
+                    && UCHARAT(RExC_parse - 1) == '('
+                    && RExC_parse < RExC_end)
+                {
+                    /* If is a '(?', could be an embedded '(?flags:(?[...])'.
+                     * This happens when we have some thing like
+                     *
+                     *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+                     *   ...
+                     *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
+                     *
+                     * Here we would be handling the interpolated
+                     * '$thai_or_lao'.  We handle this by a recursive call to
+                     * ourselves which returns the inversion list the
+                     * interpolated expression evaluates to.  We use the flags
+                     * from the interpolated pattern. */
+                    U32 save_flags = RExC_flags;
+                    const char * const save_parse = ++RExC_parse;
+
+                    parse_lparen_question_flags(pRExC_state);
+
+                    if (RExC_parse == save_parse  /* Makes sure there was at
+                                                     least one flag (or this
+                                                     embedding wasn't compiled)
+                                                   */
+                        || RExC_parse >= RExC_end - 4
+                        || UCHARAT(RExC_parse) != ':'
+                        || UCHARAT(++RExC_parse) != '('
+                        || UCHARAT(++RExC_parse) != '?'
+                        || UCHARAT(++RExC_parse) != '[')
+                    {
+
+                        /* In combination with the above, this moves the
+                         * pointer to the point just after the first erroneous
+                         * character (or if there are no flags, to where they
+                         * should have been) */
+                        if (RExC_parse >= RExC_end - 4) {
+                            RExC_parse = RExC_end;
+                        }
+                        else if (RExC_parse != save_parse) {
+                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                        }
+                        vFAIL("Expecting '(?flags:(?[...'");
+                    }
+                    RExC_parse++;
+                    (void) handle_regex_sets(pRExC_state, &current, flagp,
+                                                    depth+1, oregcomp_parse);
+
+                    /* Here, 'current' contains the embedded expression's
+                     * inversion list, and RExC_parse points to the trailing
+                     * ']'; the next character should be the ')' which will be
+                     * paired with the '(' that has been put on the stack, so
+                     * the whole embedded expression reduces to '(operand)' */
+                    RExC_parse++;
+
+                    RExC_flags = save_flags;
+                    goto handle_operand;
+                }
+                /* FALL THROUGH */
+
             default:
                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
                 vFAIL("Unexpected character");
@@ -11756,11 +11822,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         vFAIL("Incomplete expression within '(?[ ])'");
     }
 
-    invlist_iterinit(final);
+    /* Here, 'final' is the resultant inversion list from evaluating the
+     * expression.  Return it if so requested */
+    if (return_invlist) {
+        *return_invlist = final;
+        return END;
+    }
 
-    /* Here, 'final' is the resultant inversion list of evaluating the
-     * expression.  Feed it to regclass() to generate the real resultant node.
-     * regclass() is expecting a string of ranges and individual code points */
+    /* Otherwise generate a resultant node, based on 'final'.  regclass() is
+     * expecting a string of ranges and individual code points */
+    invlist_iterinit(final);
     result_string = newSVpvs("");
     while (invlist_iternext(final, &start, &end)) {
         if (start == end) {
index 7355151..2d98b57 100644 (file)
@@ -660,3 +660,4 @@ UTF-16 surrogate U+%X
 False [] range "%s" in regex; marked by <-- HERE in m/%s/
 \N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/
 Zero length \N{} in regex; marked by <-- HERE in m/%s/
+Expecting '(?flags:(?[...' in regex; marked by <-- HERE in m/%s/
index 9af88d7..ca3d327 100644 (file)
@@ -165,6 +165,8 @@ my @death =
  'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/',
  'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/',
  'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character {#} m/(?[[\N{U+100.300{#}}]])/',
+ 'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/',
+ 'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/',
  'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/',
  'm/\o/' => 'Missing braces on \o{} {#} m/\o{#}/',
  'm/\o{}/' => 'Number with no digits {#} m/\o{}{#}/',
index 8e889b7..eaa5c55 100644 (file)
@@ -51,6 +51,19 @@ like("c", qr/(?[ [ab] ^ [bc] ])/, 'basic symmetric difference works');
 like("2", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
 unlike("a", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
 
+unlike("\x{17f}", qr/(?[ [k] + \p{Blk=ASCII} ])/i, '/i doesn\'t affect \p{}');
+like("\N{KELVIN SIGN}", qr/(?[ [k] + \p{Blk=ASCII} ])/i, '/i does affect literals');
+
+my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+my $thai_or_lao_digit = qr/(?[ \p{Digit} & $thai_or_lao ])/;
+like("\N{THAI DIGIT ZERO}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{THAI DIGIT ZERO}") - 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{THAI DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{THAI DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{LAO DIGIT ZERO}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{LAO DIGIT ZERO}") - 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{LAO DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{LAO DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
 
 
 done_testing();