consisting of just a single code point; <*valuep> is set to that value
if the input is such.
- If <node_p> is non-null it signifies that the caller can accept any legal
- sequence. <*node_p> is set as follows:
+ If <node_p> is non-null it signifies that the caller can accept any other
+ legal sequence (i.e., one that isn't just a single code point). <*node_p>
+ is set as follows:
1) \N means not-a-NL: points to a newly created REG_ANY node;
2) \N{}: points to a new NOTHING node;
3) otherwise: points to a new EXACT node containing the resolved
string.
+ Note that FALSE is returned for single code point sequences if <valuep> is
+ null.
*/
STATIC bool
GET_RE_DEBUG_FLAGS;
- assert(node_p || valuep);
+ assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meaning does not */
* point, and is terminated by the brace */
has_multiple_chars = (endchar < endbrace);
- if (valuep && ! node_p && (! has_multiple_chars || in_char_class)) {
+ if (valuep && (! has_multiple_chars || in_char_class)) {
/* We only pay attention to the first char of
multichar strings being returned in char classes. I kinda wonder
if this makes sense as it does change the behaviour
}
RExC_parse = endbrace + 1;
}
- else if (! node_p) {
+ else if (! node_p || ! has_multiple_chars) {
/* Here, the input is legal, but not according to the caller's
* options. We fail without advancing the parse, so that the
}
break;
case 'N':
- /* Handle \N and \N{NAME} here and not below because it can be
- multicharacter. join_exact() will join them up later on.
- Also this makes sure that things like /\N{BLAH}+/ and
- \N{BLAH} being multi char Just Happen. dmq*/
+ /* Handle \N and \N{NAME} with multiple code points here and not
+ * below because it can be multicharacter. join_exact() will join
+ * them up later on. Also this makes sure that things like
+ * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
+ * The options to the grok function call cause it to fail if the
+ * sequence is just a single code point. We then go treat it as
+ * just another character in the current EXACT node, and hence it
+ * gets uniform treatment with all the other characters. The
+ * special treatment for quantifiers is not needed for such single
+ * character sequences */
++RExC_parse;
- grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE);
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ RExC_parse--;
+ goto defchar;
+ }
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
defchar: {
register STRLEN len = 0;
- register UV ender;
+ UV ender;
register char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
case 'g': case 'G': /* generic-backref, pos assertion */
case 'h': case 'H': /* HORIZWS */
case 'k': case 'K': /* named backref, keep marker */
- case 'N': /* named char sequence */
case 'p': case 'P': /* Unicode property */
case 'R': /* LNBREAK */
case 's': case 'S': /* space class */
ender = '\n';
p++;
break;
+ case 'N': /* Handle a single-code point named character. */
+ /* The options cause it to fail if the input is a multiple code
+ * point sequence. Those are handled in the switch() above
+ * */
+ RExC_parse = p + 1;
+ if (! grok_bslash_N(pRExC_state, NULL, &ender,
+ flagp, depth, FALSE))
+ {
+ RExC_parse = p = oldp;
+ goto loopdone;
+ }
+ p = RExC_parse;
+ break;
case 'r':
ender = '\r';
p++;
}
ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed");
+ $failed = 0;
+ for my $repeat (1 .. 300) {
+ my $string = $single x $repeat;
+ my $lhs = $string . "\N{LATIN SMALL LIGATURE FFI}";
+ if ($lhs !~ m/${string}ff\N{LATIN SMALL LETTER I}/i) {
+ $failed = $repeat;
+ last;
+ }
+ }
+ ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed");
+
+ $failed = 0;
+ for my $repeat (1 .. 300) {
+ my $string = $single x $repeat;
+ my $lhs = $string . "\N{LATIN SMALL LIGATURE FFL}";
+ if ($lhs !~ m/${string}ff\N{U+6c}/i) {
+ $failed = $repeat;
+ last;
+ }
+ }
+ ok(! $failed, "Matched multi-char fold across EXACTFish node boundaries; if failed, was at count $failed");
}
#
# figures it out.
\N{U+} - c - Invalid hexadecimal number
[\N{U+}] - c - Invalid hexadecimal number
-\N{U+4AG3} - c - Illegal hexadecimal digit
+\N{U+4AG3} - c - Invalid hexadecimal number
[\N{U+4AG3}] - c - Invalid hexadecimal number
abc\N{def - c - \\N{NAME} must be resolved by the lexer
# Verify works in single quotish context; regex compiler delivers slightly different msg
# \N{U+BEEF.BEAD} succeeds here, because can't completely hide it from the outside.
-\N{U+0xBEEF} - c - Illegal hexadecimal digit
+\N{U+0xBEEF} - c - Invalid hexadecimal number
\c` - c - \"\\c`\" is more clearly written simply as \"\\ \"
\c1 - c - \"\\c1\" is more clearly written simply as \"q\"
\cA \001 y $& \1