t/re/regexp_trielist.t See if regular expressions work with trie optimisation
t/re/regexp_unicode_prop.t See if unicode properties work in regular expressions as expected
t/re/regexp_unicode_prop_thr.t See if unicode properties work in regular expressions as expected under threads
+t/re/regex_sets.t Test (?[ ])
t/re/reg_fold.t See if case folding works properly
t/re/reg_mesg.t See if one can get regular expression errors
t/re/reg_namedcapture.t Make sure glob assignment doesn't break named capture
|bool allow_multi_fold \
|const bool silence_non_portable \
|NULLOK SV** ret_invlist
+Es |bool|could_it_be_POSIX |NN struct RExC_state_t *pRExC_state
+Es |regnode*|handle_sets |NN struct RExC_state_t *pRExC_state \
+ |NN I32 *flagp|U32 depth \
+ |NN char * const oregcomp_parse
Es |regnode*|reg_node |NN struct RExC_state_t *pRExC_state|U8 op
Es |UV |reg_recode |const char value|NN SV **encp
Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \
#define cl_is_anything S_cl_is_anything
#define cl_or S_cl_or
#define compute_EXACTish(a) S_compute_EXACTish(aTHX_ a)
+#define could_it_be_POSIX(a) S_could_it_be_POSIX(aTHX_ a)
#define get_invlist_iter_addr(a) S_get_invlist_iter_addr(aTHX_ a)
#define get_invlist_previous_index_addr(a) S_get_invlist_previous_index_addr(aTHX_ a)
#define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a)
#define get_invlist_zero_addr(a) S_get_invlist_zero_addr(aTHX_ a)
#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f)
+#define handle_sets(a,b,c,d) S_handle_sets(aTHX_ a,b,c,d)
#define invlist_array(a) S_invlist_array(aTHX_ a)
#define invlist_clone(a) S_invlist_clone(aTHX_ a)
#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b)
[ List each enhancement as a =head2 entry ]
+=head2 Regular Expression Set Operations
+
+This is an experimental feature to allow matching against the the union,
+intersection, etc., of sets of code points, similar to
+L<Unicode::Regex::Set>. It can also be used to extend C</x> processing
+to [bracketed] character classes, and as a replacement of user-defined
+properties, allowing more complex expressions than they do. See
+L<perlre/(?[ ])>.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
=item 5
-the pattern uses a Unicode property (C<\p{...}>)
+the pattern uses a Unicode property (C<\p{...}>); or
+
+=item 6
+
+the pattern uses L</C<(?[ ])>>
=back
PAT?+ (?>PAT?)
PAT{min,max}+ (?>PAT{min,max})
+=item C<(?[ ])>
+X<set operations>
+
+This is an experimental feature present starting in 5.18, but is subject
+to change as we gain field experience with it. Any attempt to use it
+will raise a warning, unless disabled via
+
+ no warnings "experimental::regex_sets";
+
+Comments on this feature are welcome; send email to
+C<perl5-porters@perl.org>.
+
+This is a fancy bracketed character class that can be used for more
+readable and less error-prone classes, and to perform set operations,
+such as intersection. An example is
+
+ /(?[ \p{Thai} & \p{Digit} ])/
+
+This will match all the digit characters that are in the Thai script.
+We can extend this by
+
+ /(?[ ( \p{Thai} + \p{Lao} ) & \p{Digit} ])/
+
+This matches digits that are in either the Thai or Laotion scripts.
+
+Notice the white space in these examples. This construct always has
+L</C<E<sol>x>> turned on.
+
+The available binary operators are:
+
+ & intersection
+ + union
+ | another name for '+', hence means union
+ - subtraction (the result matches the set consisting of those
+ code points matched by the first operand, excluding any that
+ are also matched by the second operand)
+ ^ symmetric difference (the union minus the intersection). This
+ is like an exclusive or, in that the result is the set of code
+ points that are matched by either, but not both, of the
+ operands.
+
+There is one unary operator:
+
+ ! complement
+
+All the binary operators left associate, and are of equal precedence.
+The unary operator right associates, and has higher precedence. Use
+parentheses to override the default associations.
+
+The main restriction is that everything is a metacharacter. Thus,
+you cannot refer to single characters by doing something like this:
+
+ /(?[ a + b ])/ # Syntax error!
+
+The easiest way to specify an individual typable character is to enclose
+it in brackets:
+
+ /(?[ [a] + [b] ])/
+
+(This is the same thing as C<[ab]>.) You could also have said the
+equivalent
+
+ /(?[[ a b ]])/
+
+(You can, of course, specify single characters by using, C<\x{ }>,
+C<\N{ }>, etc.)
+
+This last example shows the use of this construct to specify an ordinary
+bracketed character class without set operations. Note the white space
+within it. To specify a matchable white space character, you can escape
+it with a backslash, like:
+
+ /(?[ [ a e i o u \ ] ])/
+
+This matches the English vowels plus the SPACE character.
+All the other escapes accepted by normal bracketed character classes are
+accepted here as well; but unlike the normal ones, unrecognized escapes are
+fatal errors here.
+
+All warnings from these class elements are fatal, as well as some
+practices that don't currently warn. For example you cannot say
+
+ /(?[ [ \xF ] ])/ # Syntax error!
+
+You have to have two hex digits after a braceless C<\x> (use a leading
+zero to make two). These restrictions are to lower the incidence of
+typos causing the class to not match what you thought it would.
+
+The final difference between regular bracketed character classes and
+these, is that it is not possible to get the latter to match a
+multi-character fold. Thus,
+
+ /(?[ [\xDF] ])/iu
+
+does not match the string C<ss>.
+
+You don't have to enclose Posix class names inside double brackets. The
+following works
+
+ /(?[ [:word:] - [:lower:] ])/
+
+C<< (?[ ]) >> is a compile-time construct. Any attempt to use something
+which isn't knowable until run-time is a fatal error. Thus, this
+construct cannot be used within the scope of C<use locale> (or the
+L</C<E<sol>l>> regex modifier). Any L<user-defined
+property|perlunicode/"User-Defined Character Properties"> used must be
+already defined by the time the regular expression is compiled; but note
+that this construct can be used to avoid defining such properties.
+
+A regular expression using this construct that otherwise would compile
+using L</C<E<sol>d>> rules will instead use L</C<E<sol>u>>.
+
+The L</C<E<sol>x>> processing within this class is an extended form.
+Besides the characters that are considered white space in normal C</x>
+processing, there are 5 others, recommended by the Unicode standard:
+
+ U+0085 NEXT LINE
+ U+200E LEFT-TO-RIGHT MARK
+ U+200F RIGHT-TO-LEFT MARK
+ U+2028 LINE SEPARATOR
+ U+2029 PARAGRAPH SEPARATOR
+
+Note that skipping white space applies only to the interior of this
+construct. There must not be any space between any of the characters
+that form the initial C<(?[>. Nor may there be space between the
+closing C<])> characters.
+
+Due to the way that Perl parses things, your parentheses and brackets
+may need to be balanced, even including comments.
+
+Since this experimental, we may change this so that other legal uses of
+normal bracketed character classes might become illegal. One proposal,
+for example, is to forbid adjacent uses of the same character, as in
+C<[aa]>. This is likely a typo, as the second "a" adds nothing.
+
=back
=head2 Special Backtracking Control Verbs
=head2 User-Defined Character Properties
You can define your own binary character properties by defining subroutines
-whose names begin with "In" or "Is". The subroutines can be defined in any
+whose names begin with "In" or "Is". (The experimental feature
+L<perlre/(?[ ])> provides an alternative which allows more complex
+definitions.) The subroutines can be defined in any
package. The user-defined properties can be used in the regular expression
C<\p> and C<\P> constructs; if you are using a user-defined property from a
package other than the one you are in, you must specify its package in the
RL1.1 Hex Notation - done [1]
RL1.2 Properties - done [2][3]
RL1.2a Compatibility Properties - done [4]
- RL1.3 Subtraction and Intersection - MISSING [5]
+ RL1.3 Subtraction and Intersection - experimental [5]
RL1.4 Simple Word Boundaries - done [6]
RL1.5 Simple Loose Matches - done [7]
RL1.6 Line Boundaries - MISSING [8][9]
=item [5]
- Can use the following to emulate set operations:
+The experimental feature in v5.18 "(?[...])" accomplishes this. See
+L<perlre/(?[ ])>. If you don't want to use an experimental feature,
+you can use one of the following:
=over 4
#define PERL_ARGS_ASSERT_COMPUTE_EXACTISH \
assert(pRExC_state)
+STATIC bool S_could_it_be_POSIX(pTHX_ struct RExC_state_t *pRExC_state)
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_COULD_IT_BE_POSIX \
+ assert(pRExC_state)
+
PERL_STATIC_INLINE UV* S_get_invlist_iter_addr(pTHX_ SV* invlist)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
assert(pRExC_state); assert(flagp)
+STATIC regnode* S_handle_sets(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char * const oregcomp_parse)
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_4);
+#define PERL_ARGS_ASSERT_HANDLE_SETS \
+ assert(pRExC_state); assert(flagp); assert(oregcomp_parse)
+
PERL_STATIC_INLINE UV* S_invlist_array(pTHX_ SV* const invlist)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
}
}
+ case '[': /* (?[ ... ]) */
+ return handle_sets(pRExC_state, flagp, depth, oregcomp_parse);
case 0:
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
return namedclass;
}
+STATIC bool
+S_could_it_be_POSIX(pTHX_ RExC_state_t *pRExC_state)
+{
+ /* This applies some heuristics at the current parse position (which should
+ * be at a '[') to see if what follows might be intended to be a [:posix:]
+ * class. It returns true if it really is a posix class, of course, but it
+ * also can return true if it thinks that what was intended was a posix
+ * class that didn't quite make it.
+ *
+ * It will return true for
+ * [:alphanumerics:
+ * [:alphanumerics] (as long as the ] isn't followed immediately by a
+ * ')' indicating the end of the (?[
+ * [:any garbage including %^&$ punctuation:]
+ *
+ * This is designed to be called only from S_handle_sets; it could be
+ * easily adapted to be called from the spot at the beginning of regclass()
+ * that checks to see in a normal bracketed class if the surrounding []
+ * have been omitted ([:word:] instead of [[:word:]]). But doing so would
+ * change long-standing behavior, so I (khw) didn't do that */
+ char* p = RExC_parse + 1;
+ char first_char = *p;
+
+ PERL_ARGS_ASSERT_COULD_IT_BE_POSIX;
+
+ assert(*(p - 1) == '[');
+
+ if (! POSIXCC(first_char)) {
+ return FALSE;
+ }
+
+ p++;
+ while (p < RExC_end && isWORDCHAR(*p)) p++;
+
+ if (p >= RExC_end) {
+ return FALSE;
+ }
+
+ if (p - RExC_parse > 2 /* Got at least 1 word character */
+ && (*p == first_char
+ || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
+ {
+ return TRUE;
+ }
+
+ p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
+
+ return (p
+ && p - RExC_parse > 2 /* [:] evaluates to colon;
+ [::] is a bad posix class. */
+ && first_char == *(p - 1));
+}
+
+STATIC regnode *
+S_handle_sets(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
+ char * const oregcomp_parse)
+{
+ /* Handle the (?[...]) construct to do set operations */
+
+ U8 curchar;
+ UV start, end; /* End points of code point ranges */
+ SV* result_string;
+ char *save_end, *save_parse;
+ SV* final;
+ STRLEN len;
+ regnode* node;
+ AV* stack;
+ const bool save_fold = FOLD;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_HANDLE_SETS;
+
+ if (LOC) {
+ vFAIL("(?[...]) not valid in locale");
+ }
+ RExC_uni_semantics = 1;
+
+ /* This will return only an ANYOF regnode, or (unlikely) something smaller
+ * (such as EXACT). Thus we can skip most everything if just sizing. We
+ * call regclass to handle '[]' so as to not have to reinvent its parsing
+ * rules here (throwing away the size it computes each time). And, we exit
+ * upon an unescaped ']' that isn't one ending a regclass. To do both
+ * these things, we need to realize that something preceded by a backslash
+ * is escaped, so we have to keep track of backslashes */
+ if (SIZE_ONLY) {
+
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+ "The regex_sets feature is experimental" REPORT_LOCATION,
+ (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
+
+ while (RExC_parse < RExC_end) {
+ SV* current = NULL;
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ switch (*RExC_parse) {
+ default:
+ break;
+ case '\\':
+ /* Skip the next byte. This would have to change to skip
+ * the next character if we were to recognize and handle
+ * specific non-ASCIIs */
+ RExC_parse++;
+ break;
+ case '[':
+ {
+ /* If this looks like it is a [:posix:] class, leave the
+ * parse pointer at the '[' to fool regclass() into
+ * thinking it is part of a '[[:posix]]'. That function
+ * will use strict checking to force a syntax error if it
+ * doesn't work out to a legitimate class */
+ bool is_posix_class = could_it_be_POSIX(pRExC_state);
+ if (! is_posix_class) {
+ RExC_parse++;
+ }
+
+ (void) regclass(pRExC_state, flagp,depth+1,
+ is_posix_class, /* parse the whole char
+ class only if not a
+ posix class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. */
+ ¤t);
+ /* function call leaves parse pointing to the ']', except
+ * if we faked it */
+ if (is_posix_class) {
+ RExC_parse--;
+ }
+
+ SvREFCNT_dec(current); /* In case it returned something */
+ break;
+ }
+
+ case ']':
+ RExC_parse++;
+ if (RExC_parse < RExC_end
+ && *RExC_parse == ')')
+ {
+ node = reganode(pRExC_state, ANYOF, 0);
+ RExC_size += ANYOF_SKIP;
+ nextchar(pRExC_state);
+ Set_Node_Length(node,
+ RExC_parse - oregcomp_parse + 1); /* MJD */
+ return node;
+ }
+ goto no_close;
+ }
+ RExC_parse++;
+ }
+
+ no_close:
+ FAIL("Syntax error in (?[...])");
+ }
+#define av_top(a) av_len(a) /* XXX Temporary */
+
+ /* Pass 2 only after this. Everything in this construct is a
+ * metacharacter. Operands begin with either a '\' (for an escape
+ * sequence), or a '[' for a bracketed character class. Any other
+ * character should be an operator, or parenthesis for grouping. Both
+ * types of operands are handled by calling regclass() to parse them. It
+ * is called with a parameter to indicate to return the computed inversion
+ * list. The parsing here is implemented via a stack. Each entry on the
+ * stack is a single character representing one of the operators, or the
+ * '('; or else a pointer to an operand inversion list. */
+
+#define IS_OPERAND(a) (! SvIOK(a))
+
+ /* The stack starts empty. It is a syntax error if the first thing parsed
+ * is a binary operator; everything else is pushed on the stack. When an
+ * operand is parsed, the top of the stack is examined. If it is a binary
+ * operator, the item before it should be an operand, and both are replaced
+ * by the result of doing that operation on the new operand and the one on
+ * the stack. Thus a sequence of binary operands is reduced to a single
+ * one before the next one is parsed.
+ *
+ * A unary operator may immediately follow a binary in the input, for
+ * example
+ * [a] + ! [b]
+ * When an operand is parsed and the top of the stack is a unary operator,
+ * the operation is performed, and then the stack is rechecked to see if
+ * this new operand is part of a binary operation; if so, it is handled as
+ * above.
+ *
+ * A '(' is simply pushed on the stack; it is valid only if the stack is
+ * empty, or the top element of the stack is an operator (for which the
+ * parenthesized expression will become an operand). By the time the
+ * corresponding ')' is parsed everything in between should have been
+ * parsed and evaluated to a single operand (or else is a syntax error),
+ * and is handled as a regular operand */
+
+ stack = newAV();
+
+ while (RExC_parse < RExC_end) {
+ I32 top_index = av_top(stack);
+ SV** top_ptr;
+ SV* current = NULL;
+
+ /* Skip white space */
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (RExC_parse >= RExC_end
+ || (curchar = UCHARAT(RExC_parse)) == ']')
+ { /* Exit loop at the end */
+ break;
+ }
+
+ switch (curchar) {
+
+ default:
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unexpected character");
+
+ case '\\':
+ (void) regclass(pRExC_state, flagp,depth+1,
+ TRUE, /* means parse just the next thing */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings.
+ */
+ ¤t);
+ /* regclass() will return with parsing just the \ sequence,
+ * leaving the parse pointer at the next thing to parse */
+ RExC_parse--;
+ goto handle_operand;
+
+ case '[': /* Is a bracketed character class */
+ {
+ bool is_posix_class = could_it_be_POSIX(pRExC_state);
+
+ if (! is_posix_class) {
+ RExC_parse++;
+ }
+
+ (void) regclass(pRExC_state, flagp,depth+1,
+ is_posix_class, /* parse the whole char class
+ only if not a posix class */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings.
+ */
+ ¤t);
+ /* function call leaves parse pointing to the ']', except if we
+ * faked it */
+ if (is_posix_class) {
+ RExC_parse--;
+ }
+
+ goto handle_operand;
+ }
+
+ case '&':
+ case '|':
+ case '+':
+ case '-':
+ case '^':
+ if (top_index < 0
+ || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
+ || ! IS_OPERAND(*top_ptr))
+ {
+ RExC_parse++;
+ vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+ }
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case '!':
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case '(':
+ if (top_index >= 0) {
+ top_ptr = av_fetch(stack, top_index, FALSE);
+ assert(top_ptr);
+ if (IS_OPERAND(*top_ptr)) {
+ RExC_parse++;
+ vFAIL("Unexpected '(' with no preceding operator");
+ }
+ }
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case ')':
+ {
+ SV* lparen;
+ if (top_index < 1
+ || ! (current = av_pop(stack))
+ || ! IS_OPERAND(current)
+ || ! (lparen = av_pop(stack))
+ || IS_OPERAND(lparen)
+ || SvUV(lparen) != '(')
+ {
+ RExC_parse++;
+ vFAIL("Unexpected ')'");
+ }
+ top_index -= 2;
+ SvREFCNT_dec_NN(lparen);
+
+ /* FALL THROUGH */
+ }
+
+ handle_operand:
+
+ /* Here, we have an operand to process, in 'current' */
+
+ if (top_index < 0) { /* Just push if stack is empty */
+ av_push(stack, current);
+ }
+ else {
+ SV* top = av_pop(stack);
+ char current_operator;
+
+ if (IS_OPERAND(top)) {
+ vFAIL("Operand with no preceding operator");
+ }
+ current_operator = (char) SvUV(top);
+ switch (current_operator) {
+ case '(': /* Push the '(' back on followed by the new
+ operand */
+ av_push(stack, top);
+ av_push(stack, current);
+ SvREFCNT_inc(top); /* Counters the '_dec' done
+ just after the 'break', so
+ it doesn't get wrongly freed
+ */
+ break;
+
+ case '!':
+ _invlist_invert(current);
+
+ /* Unlike binary operators, the top of the stack,
+ * now that this unary one has been popped off, may
+ * legally be an operator, and we now have operand
+ * for it. */
+ top_index--;
+ SvREFCNT_dec_NN(top);
+ goto handle_operand;
+
+ case '&':
+ _invlist_intersection(av_pop(stack),
+ current,
+ ¤t);
+ av_push(stack, current);
+ break;
+
+ case '|':
+ case '+':
+ _invlist_union(av_pop(stack), current, ¤t);
+ av_push(stack, current);
+ break;
+
+ case '-':
+ _invlist_subtract(av_pop(stack), current, ¤t);
+ av_push(stack, current);
+ break;
+
+ case '^': /* The union minus the intersection */
+ {
+ SV* i = NULL;
+ SV* u = NULL;
+ SV* element;
+
+ element = av_pop(stack);
+ _invlist_union(element, current, &u);
+ _invlist_intersection(element, current, &i);
+ _invlist_subtract(u, i, ¤t);
+ av_push(stack, current);
+ SvREFCNT_dec_NN(i);
+ SvREFCNT_dec_NN(u);
+ SvREFCNT_dec_NN(element);
+ break;
+ }
+
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+ }
+ SvREFCNT_dec_NN(top);
+ }
+ }
+
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ }
+
+ if (av_top(stack) < 0 /* Was empty */
+ || ((final = av_pop(stack)) == NULL)
+ || ! IS_OPERAND(final)
+ || av_top(stack) >= 0) /* More left on stack */
+ {
+ vFAIL("Incomplete expression within '(?[ ])'");
+ }
+
+ invlist_iterinit(final);
+
+ /* Here, 'final' is the resultant inversion list of evaluating the
+ * expression. Feed it to regclass() to generate the real resultant node.
+ * regclass() is expecting a string of ranges and individual code points */
+ result_string = newSVpvs("");
+ while (invlist_iternext(final, &start, &end)) {
+ if (start == end) {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
+ start, end);
+ }
+ }
+
+ save_parse = RExC_parse;
+ RExC_parse = SvPV(result_string, len);
+ save_end = RExC_end;
+ RExC_end = RExC_parse + len;
+
+ /* We turn off folding around the call, as the class we have constructed
+ * already has all folding taken into consideration, and we don't want
+ * regclass() to add to that */
+ RExC_flags &= ~RXf_PMf_FOLD;
+ node = regclass(pRExC_state, flagp,depth+1,
+ FALSE, /* means parse the whole char class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. The above may very
+ well have generated non-portable code points, but
+ they're valid on this machine */
+ NULL);
+ if (save_fold) {
+ RExC_flags |= RXf_PMf_FOLD;
+ }
+ RExC_parse = save_parse + 1;
+ RExC_end = save_end;
+ SvREFCNT_dec_NN(final);
+ SvREFCNT_dec_NN(result_string);
+ SvREFCNT_dec_NN(stack);
+
+ nextchar(pRExC_state);
+ Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
+ return node;
+}
+#undef IS_OPERAND
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
character; used under /i */
UV n;
char * stop_ptr = RExC_end; /* where to stop parsing */
- const bool strict = FALSE;
- const bool skip_white = FALSE;
+ const bool skip_white = cBOOL(ret_invlist);
+ const bool strict = cBOOL(ret_invlist);
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
Useless use of (?-p) in regex; marked by <-- HERE in m/%s/
Unmatched '%c' in POSIX class in regex; marked by <-- HERE in m/%s/
Unmatched '[' in POSIX class in regex; marked by <-- HERE in m/%s/
+(?[...]) not valid in locale in regex; marked by <-- HERE in m/%s/
+The regex_sets feature is experimental
+Syntax error in (?[...]) in regex m/%s/
+Unexpected character in regex; marked by <-- HERE in m/%s/
+Unexpected binary operator '%c' with no preceding operand in regex; marked by <-- HERE in m/%s/
+Unexpected '(' with no preceding operator in regex; marked by <-- HERE in m/%s/
+Unexpected ')' in regex; marked by <-- HERE in m/%s/
+Operand with no preceding operator in regex; marked by <-- HERE in m/%s/
Property '%s' is unknown in regex; marked by <-- HERE in m/%s/
Need exactly 3 octal digits in regex; marked by <-- HERE in m/%s/
Unrecognized escape \%c in character class in regex; marked by <-- HERE in m/%s/
+Incomplete expression within '(?[ ])' in regex; marked by <-- HERE in m/%s/
+Non-octal character in regex; marked by <-- HERE in m/%s/
+Non-hex character in regex; marked by <-- HERE in m/%s/
+Use \\x{...} for more than two hex characters in regex; marked by <-- HERE in m/%s/
__CATEGORIES__
Code point 0x%X is not Unicode, all \p{} matches fail; all \P{} matches succeed
Operation "%s" returns its argument for UTF-16 surrogate U+%X
Unicode surrogate U+%X is illegal in UTF-8
UTF-16 surrogate U+%X
-Non-octal character in regex; marked by <-- HERE in m/%s/
-Non-hex character in regex; marked by <-- HERE in m/%s/
-Use \\x{...} for more than two hex characters in regex; marked by <-- HERE in m/%s/
False [] range "%s" in regex; marked by <-- HERE in m/%s/
--- /dev/null
+#!./perl
+
+# This tests (?[...]). XXX These are just basic tests, as full ones would be
+# best done with an infrastructure change to allow getting out the inversion
+# list of the constructed set and then comparing it character by character
+# with the expected result.
+
+use strict;
+use warnings;
+
+$| = 1;
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = ('../lib','.');
+ require './test.pl';
+}
+
+use utf8;
+no warnings 'experimental::regex_sets';
+
+like("a", qr/(?[ [a] # This is a comment
+ ])/, 'Can ignore a comment');
+like("a", qr/(?[ [a] # [[:notaclass:]]
+ ])/, 'A comment isn\'t parsed');
+unlike("\x85", qr/(?[ \t\85 ])/, 'NEL is white space');
+unlike("\x85", qr/(?[ [\t\85] ])/, '... including within nested []');
+like("\x85", qr/(?[ \t + \\85 ])/, 'can escape NEL to match');
+like("\x85", qr/(?[ [\\85] ])/, '... including within nested []');
+like("\t", qr/(?[ \t + \\85 ])/, 'can do basic union');
+like("\cK", qr/(?[ \s ])/, '\s matches \cK');
+unlike("\cK", qr/(?[ \s - \cK ])/, 'can do basic subtraction');
+like(" ", qr/(?[ \s - \cK ])/, 'can do basic subtraction');
+like(":", qr/(?[ [:] ])/, '[:] is not a posix class');
+unlike("\t", qr/(?[ ! \t ])/, 'can do basic complement');
+like("\t", qr/(?[ ! [ ^ \t ] ])/, 'can do basic complement');
+unlike("\r", qr/(?[ \t ])/, '\r doesn\'t match \t ');
+like("\r", qr/(?[ ! \t ])/, 'can do basic complement');
+like("0", qr/(?[ [:word:] & [:digit:] ])/, 'can do basic intersection');
+unlike("A", qr/(?[ [:word:] & [:digit:] ])/, 'can do basic intersection');
+like("0", qr/(?[[:word:]&[:digit:]])/, 'spaces around internal [] aren\'t required');
+
+like("a", qr/(?[ [a] | [b] ])/, '| means union');
+like("b", qr/(?[ [a] | [b] ])/, '| means union');
+unlike("c", qr/(?[ [a] | [b] ])/, '| means union');
+
+like("a", qr/(?[ [ab] ^ [bc] ])/, 'basic symmetric difference works');
+unlike("b", qr/(?[ [ab] ^ [bc] ])/, 'basic symmetric difference works');
+like("c", qr/(?[ [ab] ^ [bc] ])/, 'basic symmetric difference works');
+
+like("2", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
+unlike("a", qr/(?[ ( ( \pN & ( [a] + [2] ) ) ) ])/, 'Nesting parens and grouping');
+
+
+
+done_testing();
+
+1;