=head2 Script Runs
X<(*script_run:...)> X<(sr:...)>
+X<(*atomic_script_run:...)> X<(asr:...)>
A script run is basically a sequence of characters, all from the same
Unicode script (see L<perlunicode/Scripts>), such as Latin or Greek. In
matches, or all possibilities are exhausted. This can cause a lot of
backtracking, but generally, only malicious input will result in this,
though the slow down could cause a denial of service attack. If your
-needs permit, it is best to make the pattern atomic.
+needs permit, it is best to make the pattern atomic. This is so likely
+to be what you want, that instead of writing this:
(*script_run:(?>pattern))
+you can write either of these:
+
+ (*atomic_script_run:pattern)
+ (*asr:pattern)
+
(See L</C<(?E<gt>pattern)>>.)
In Taiwan, Japan, and Korea, it is common for text to have a mixture of
vFAIL("Unmatched (");
}
- if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
+ if (paren == 'r') { /* Atomic script run */
+ paren = '>';
+ goto parse_rest;
+ }
+ else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
char *start_verb = RExC_parse + 1;
STRLEN verb_len;
char *start_arg = NULL;
}
break;
case 'a':
- if (memEQs(start_verb, verb_len, "atomic")) {
+ if ( memEQs(start_verb, verb_len, "asr")
+ || memEQs(start_verb, verb_len, "atomic_script_run"))
+ {
+ paren = 'r'; /* Mnemonic: recursed run */
+ goto script_run;
+ }
+ else if (memEQs(start_verb, verb_len, "atomic")) {
paren = 't'; /* AtOMIC */
goto alpha_assertions;
}
if ( memEQs(start_verb, verb_len, "sr")
|| memEQs(start_verb, verb_len, "script_run"))
{
+ regnode * atomic;
+
paren = 's';
+ script_run:
+
/* This indicates Unicode rules. */
REQUIRE_UNI_RULES(flagp, NULL);
RExC_parse = start_arg;
+ if (RExC_in_script_run) {
+
+ /* Nested script runs are treated as no-ops, because
+ * if the nested one fails, the outer one must as
+ * well. It could fail sooner, and avoid (??{} with
+ * side effects, but that is explicitly documented as
+ * undefined behavior. */
+
+ ret = NULL;
+
+ if (paren == 's') {
+ paren = ':';
+ goto parse_rest;
+ }
+
+ /* But, the atomic part of a nested atomic script run
+ * isn't a no-op, but can be treated just like a '(?>'
+ * */
+ paren = '>';
+ goto parse_rest;
+ }
+
+ /* By doing this here, we avoid extra warnings for nested
+ * script runs */
if (PASS2) {
Perl_ck_warner_d(aTHX_
packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
}
- if (RExC_in_script_run) {
- paren = ':';
- ret = NULL;
+ if (paren == 's') {
+ /* Here, we're starting a new regular script run */
+ ret = reg_node(pRExC_state, SROPEN);
+ RExC_in_script_run = 1;
+ is_open = 1;
goto parse_rest;
}
- RExC_in_script_run = 1;
+
+ /* Here, we are starting an atomic script run. This is
+ * handled by recursing to deal with the atomic portion
+ * separately, enclosed in SROPEN ... SRCLOSE nodes */
ret = reg_node(pRExC_state, SROPEN);
- is_open = 1;
- goto parse_rest;
+ RExC_in_script_run = 1;
+
+ atomic = reg(pRExC_state, 'r', &flags, depth);
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return NULL;
+ }
+
+ REGTAIL(pRExC_state, ret, atomic);
+
+ REGTAIL(pRExC_state, atomic,
+ reg_node(pRExC_state, SRCLOSE));
+
+ RExC_in_script_run = 0;
+ return ret;
}
break;
if (paren == '>' || paren == 't') {
node = SUSPEND, flag = 0;
}
+
reginsert(pRExC_state, node,ret, depth+1);
Set_Node_Cur_Length(ret, parse_start);
Set_Node_Offset(ret, parse_start + 1);
# Since there's so few tests currently, we can afford to try each syntax on
# all of them
-foreach my $type ('script_run', 'sr') {
+foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
my $script_run;
eval '$script_run = qr/ ^ (*$type: .* ) $ /x;';
# Until fixed, this was skipping the '['
unlike("abc]c", qr/^ (*sr:a(*sr:[bc]*)c) $/x, "Doesn't skip parts of exact matches");
+ like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
+ unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
+
done_testing();