|NULLOK SV** ret_invlist
Es |bool|could_it_be_a_POSIX_class|NN struct RExC_state_t *pRExC_state
Es |regnode*|handle_regex_sets|NN struct RExC_state_t *pRExC_state \
+ |NULLOK SV ** return_invlist \
|NN I32 *flagp|U32 depth \
|NN char * const oregcomp_parse
Es |void|parse_lparen_question_flags|NN struct RExC_state_t *pRExC_state
#define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a)
#define get_invlist_zero_addr(a) S_get_invlist_zero_addr(aTHX_ a)
#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
-#define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d)
+#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
#define invlist_array(a) S_invlist_array(aTHX_ a)
#define invlist_clone(a) S_invlist_clone(aTHX_ a)
#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b)
[ List each enhancement as a =head2 entry ]
+=head2 Interpolations now Accepted in Regular Expression Set Operations
+
+Perl v5.17.8 introduced L<regular expression set operations|perlre/(?[ ])>.
+They have now been expanded to allow the interpolation of a
+previously-compiled set into a bigger set, like this:
+
+ my $thai_or_lao = '\p{Thai} + \p{Lao}';
+ ...
+ qr/(?[ \p{Digit} & $thai_or_lao ])/;
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
that form the initial C<(?[>. Nor may there be space between the
closing C<])> characters.
+Just as in all regular expressions, the pattern can can be built up by
+including variables that are interpolated at regex compilation time.
+Care must be taken to ensure that you are getting what you expect. For
+example:
+
+ my $thai_or_lao = '\p{Thai} + \p{Lao}';
+ ...
+ qr/(?[ \p{Digit} & $thai_or_lao ])/;
+
+compiles to
+
+ qr/(?[ \p{Digit} & \p{Thai} + \p{Lao} ])/;
+
+But this does not have the effect that someone reading the code would
+likely expect, as the intersection applies just to C<\p{Thai}>,
+excluding the Laotian. Pitfalls like this can be avoided by
+parenthesizing the component pieces:
+
+ my $thai_or_lao = '( \p{Thai} + \p{Lao} )';
+
+Or you can make the components into instances of this construct by
+compiling them:
+
+ my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+
+which causes the parenthesization to be done automatically for you.
Due to the way that Perl parses things, your parentheses and brackets
may need to be balanced, even including comments. If you run into any
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
assert(pRExC_state); assert(flagp)
-STATIC regnode* S_handle_regex_sets(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char * const oregcomp_parse)
+STATIC regnode* S_handle_regex_sets(pTHX_ struct RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse)
__attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2)
- __attribute__nonnull__(pTHX_4);
+ __attribute__nonnull__(pTHX_3)
+ __attribute__nonnull__(pTHX_5);
#define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \
assert(pRExC_state); assert(flagp); assert(oregcomp_parse)
}
}
case '[': /* (?[ ... ]) */
- return handle_regex_sets(pRExC_state, flagp, depth,
+ return handle_regex_sets(pRExC_state, NULL, flagp, depth,
oregcomp_parse);
case 0:
RExC_parse--; /* for vFAIL to print correctly */
}
STATIC regnode *
-S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
+S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
char * const oregcomp_parse)
{
/* Handle the (?[...]) construct to do set operations */
switch (curchar) {
+ case '?':
+ if (av_top(stack) >= 0 /* This makes sure that we can
+ safely subtract 1 from
+ RExC_parse in the next clause.
+ If we have something on the
+ stack, we have parsed something
+ */
+ && UCHARAT(RExC_parse - 1) == '('
+ && RExC_parse < RExC_end)
+ {
+ /* If is a '(?', could be an embedded '(?flags:(?[...])'.
+ * This happens when we have some thing like
+ *
+ * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+ * ...
+ * qr/(?[ \p{Digit} & $thai_or_lao ])/;
+ *
+ * Here we would be handling the interpolated
+ * '$thai_or_lao'. We handle this by a recursive call to
+ * ourselves which returns the inversion list the
+ * interpolated expression evaluates to. We use the flags
+ * from the interpolated pattern. */
+ U32 save_flags = RExC_flags;
+ const char * const save_parse = ++RExC_parse;
+
+ parse_lparen_question_flags(pRExC_state);
+
+ if (RExC_parse == save_parse /* Makes sure there was at
+ least one flag (or this
+ embedding wasn't compiled)
+ */
+ || RExC_parse >= RExC_end - 4
+ || UCHARAT(RExC_parse) != ':'
+ || UCHARAT(++RExC_parse) != '('
+ || UCHARAT(++RExC_parse) != '?'
+ || UCHARAT(++RExC_parse) != '[')
+ {
+
+ /* In combination with the above, this moves the
+ * pointer to the point just after the first erroneous
+ * character (or if there are no flags, to where they
+ * should have been) */
+ if (RExC_parse >= RExC_end - 4) {
+ RExC_parse = RExC_end;
+ }
+ else if (RExC_parse != save_parse) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ }
+ vFAIL("Expecting '(?flags:(?[...'");
+ }
+ RExC_parse++;
+ (void) handle_regex_sets(pRExC_state, ¤t, flagp,
+ depth+1, oregcomp_parse);
+
+ /* Here, 'current' contains the embedded expression's
+ * inversion list, and RExC_parse points to the trailing
+ * ']'; the next character should be the ')' which will be
+ * paired with the '(' that has been put on the stack, so
+ * the whole embedded expression reduces to '(operand)' */
+ RExC_parse++;
+
+ RExC_flags = save_flags;
+ goto handle_operand;
+ }
+ /* FALL THROUGH */
+
default:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
vFAIL("Unexpected character");
vFAIL("Incomplete expression within '(?[ ])'");
}
- invlist_iterinit(final);
+ /* Here, 'final' is the resultant inversion list from evaluating the
+ * expression. Return it if so requested */
+ if (return_invlist) {
+ *return_invlist = final;
+ return END;
+ }
- /* Here, 'final' is the resultant inversion list of evaluating the
- * expression. Feed it to regclass() to generate the real resultant node.
- * regclass() is expecting a string of ranges and individual code points */
+ /* Otherwise generate a resultant node, based on 'final'. regclass() is
+ * expecting a string of ranges and individual code points */
+ invlist_iterinit(final);
result_string = newSVpvs("");
while (invlist_iternext(final, &start, &end)) {
if (start == end) {
False [] range "%s" in regex; marked by <-- HERE in m/%s/
\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/
Zero length \N{} in regex; marked by <-- HERE in m/%s/
+Expecting '(?flags:(?[...' in regex; marked by <-- HERE in m/%s/
'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" {#} m/(?[[a-\pM{#}]])/',
'm/(?[[\pM-x]])/' => 'False [] range "\pM-" {#} m/(?[[\pM-{#}x]])/',
'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character {#} m/(?[[\N{U+100.300{#}}]])/',
+ 'm/(?[ \p{Digit} & (?(?[ \p{Thai} | \p{Lao} ]))])/' => 'Sequence (?(...) not recognized {#} m/(?[ \p{Digit} & (?({#}?[ \p{Thai} | \p{Lao} ]))])/',
+ 'm/(?[ \p{Digit} & (?:(?[ \p{Thai} | \p{Lao} ]))])/' => 'Expecting \'(?flags:(?[...\' {#} m/(?[ \p{Digit} & (?{#}:(?[ \p{Thai} | \p{Lao} ]))])/',
'm/\o{/' => 'Missing right brace on \o{ {#} m/\o{{#}/',
'm/\o/' => 'Missing braces on \o{} {#} m/\o{#}/',
'm/\o{}/' => 'Number with no digits {#} m/\o{}{#}/',
like("2", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
unlike("a", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
+unlike("\x{17f}", qr/(?[ [k] + \p{Blk=ASCII} ])/i, '/i doesn\'t affect \p{}');
+like("\N{KELVIN SIGN}", qr/(?[ [k] + \p{Blk=ASCII} ])/i, '/i does affect literals');
+
+my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+my $thai_or_lao_digit = qr/(?[ \p{Digit} & $thai_or_lao ])/;
+like("\N{THAI DIGIT ZERO}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{THAI DIGIT ZERO}") - 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{THAI DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{THAI DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{LAO DIGIT ZERO}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{LAO DIGIT ZERO}") - 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+like("\N{LAO DIGIT NINE}", $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
+unlike(chr(ord("\N{LAO DIGIT NINE}") + 1), $thai_or_lao_digit, 'embedded qr/(?[ ])/ works');
done_testing();