); \
regcpblow(cp)
+/* set the start and end positions of capture ix */
+#define CLOSE_CAPTURE(ix, s, e) \
+ rex->offs[ix].start = s; \
+ rex->offs[ix].end = e; \
+ if (ix > rex->lastparen) \
+ rex->lastparen = ix; \
+ rex->lastcloseparen = ix; \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)ix, \
+ (IV)rex->offs[ix].start, \
+ (IV)rex->offs[ix].end, \
+ (UV)rex->lastparen \
+ ))
+
#define UNWIND_PAREN(lp, lcp) \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)(lp), \
+ (UV)(rex->lastparen), \
+ (UV)(lcp) \
+ )); \
for (n = rex->lastparen; n > lp; n--) \
rex->offs[n].end = -1; \
rex->lastparen = n; \
return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
}
+ case GCB_Maybe_Emoji_NonBreak:
+
+ {
+
+ /* Do not break within emoji modifier sequences or emoji zwj sequences.
+ GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
+ */
+ U8 * temp_pos = (U8 *) curpos;
+ GCB_enum prev;
+
+ do {
+ prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == GCB_Extend);
+
+ return prev != GCB_XPG_XX;
+ }
+
default:
break;
}
script_run_begin = (U8 *) locinput;
break;
-/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE \
- rex->offs[n].start = rex->offs[n].start_tmp; \
- rex->offs[n].end = locinput - reginfo->strbeg; \
- DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
- "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
- depth, \
- PTR2UV(rex), \
- PTR2UV(rex->offs), \
- (UV)n, \
- (IV)rex->offs[n].start, \
- (IV)rex->offs[n].end \
- ))
case CLOSE: /* ) */
n = ARG(scan); /* which paren pair */
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
goto fake_end;
if ( OP(cursor)==CLOSE ){
n = ARG(cursor);
if ( n <= lastopen ) {
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
break;
}
if (ST.me->flags) {
/* emulate CLOSE: mark current A as captured */
- I32 paren = ST.me->flags;
+ U32 paren = (U32)ST.me->flags;
if (ST.count) {
- rex->offs[paren].start
- = HOPc(locinput, -ST.alen) - reginfo->strbeg;
- rex->offs[paren].end = locinput - reginfo->strbeg;
- if ((U32)paren > rex->lastparen)
- rex->lastparen = paren;
- rex->lastcloseparen = paren;
+ CLOSE_CAPTURE(paren,
+ HOPc(locinput, -ST.alen) - reginfo->strbeg,
+ locinput - reginfo->strbeg);
}
else
rex->offs[paren].end = -1;
#define CURLY_SETPAREN(paren, success) \
if (paren) { \
if (success) { \
- rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
- rex->offs[paren].end = locinput - reginfo->strbeg; \
- if (paren > rex->lastparen) \
- rex->lastparen = paren; \
- rex->lastcloseparen = paren; \
+ CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
+ locinput - reginfo->strbeg); \
} \
else { \
rex->offs[paren].end = -1; \
maxopenparen = ST.paren;
ST.min = ARG1(scan); /* min to match */
ST.max = ARG2(scan); /* max to match */
+ scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
+ /* handle the single-char capture called as a GOSUB etc */
if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
{
- ST.min=1;
- ST.max=1;
+ char *li = locinput;
+ if (!regrepeat(rex, &li, scan, reginfo, 1))
+ sayNO;
+ SET_locinput(li);
+ goto fake_end;
}
- scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
goto repeat;
case CURLY: /* /A{m,n}B/ where A is width 1 char */
}
NOT_REACHED; /* NOTREACHED */
- case CURLY_B_min_known_fail:
- /* failed to find B in a non-greedy match where c1,c2 valid */
+ case CURLY_B_min_fail:
+ /* failed to find B in a non-greedy match.
+ * Handles both cases where c1,c2 valid or not */
REGCP_UNWIND(ST.cp);
if (ST.paren) {
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
}
- /* Couldn't or didn't -- move forward. */
- ST.oldloc = locinput;
- if (utf8_target)
- locinput += UTF8SKIP(locinput);
- else
- locinput++;
- ST.count++;
- curly_try_B_min_known:
- /* find the next place where 'B' could work, then call B */
- {
+
+ if (ST.c1 == CHRTEST_VOID) {
+ /* failed -- move forward one */
+ char *li = locinput;
+ if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+ sayNO;
+ }
+ locinput = li;
+ ST.count++;
+ if (!( ST.count <= ST.max
+ /* count overflow ? */
+ || (ST.max == REG_INFTY && ST.count > 0))
+ )
+ sayNO;
+ }
+ else {
int n;
+ /* Couldn't or didn't -- move forward. */
+ ST.oldloc = locinput;
+ if (utf8_target)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
+ ST.count++;
+
+ curly_try_B_min_known:
+ /* find the next place where 'B' could work, then call B */
if (utf8_target) {
n = (ST.oldloc == locinput) ? 0 : 1;
if (ST.c1 == ST.c2) {
sayNO;
assert(n == REG_INFTY || locinput == li);
}
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
- NOT_REACHED; /* NOTREACHED */
- case CURLY_B_min_fail:
- /* failed to find B in a non-greedy match where c1,c2 invalid */
-
- REGCP_UNWIND(ST.cp);
- if (ST.paren) {
- UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
- }
- /* failed -- move forward one */
- {
- char *li = locinput;
- if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
- sayNO;
- }
- locinput = li;
- }
- {
- ST.count++;
- if (ST.count <= ST.max || (ST.max == REG_INFTY &&
- ST.count > 0)) /* count overflow ? */
- {
- curly_try_B_min:
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
- }
- }
- sayNO;
+ curly_try_B_min:
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
NOT_REACHED; /* NOTREACHED */
+
curly_try_B_max:
/* a successful greedy match: now try to match B */
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
{
bool could_match = locinput < reginfo->strend;
/* What code point is the digit '0' of the script run? */
UV zero_of_run = 0;
+#define SEEN_A_DIGIT (zero_of_run != 0)
+
SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
SCX_enum script_of_char = SCX_INVALID;
retval = FALSE;
break;
}
- if (zero_of_run > 0) {
+ if (SEEN_A_DIGIT) {
if (zero_of_run != '0') {
retval = FALSE;
break;
/* If is within the range [+0 .. +9] of the script's zero, it also is a
* digit in that script. We can skip the rest of this code for this
* character. */
- if (UNLIKELY( zero_of_run > 0
+ if (UNLIKELY( SEEN_A_DIGIT
&& cp >= zero_of_run
&& cp - zero_of_run <= 9))
{
/* But Common contains several sets of digits. Only the '0' set
* can be part of another script. */
- if (zero_of_run > 0 && zero_of_run != '0') {
+ if (SEEN_A_DIGIT && zero_of_run != '0') {
retval = FALSE;
break;
}
* one to use, and the character is in that sequence. Now that we know
* the script, we can use script_zeros[] to directly find which
* sequence the script uses, except in a few cases it returns 0 */
- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+ if (UNLIKELY(zero_of_run == 0 && script_of_char >= 0)) {
zero_of_run = script_zeros[script_of_char];
}
scripts_match:
/* Here, the script of the character is compatible with that of the
- * run. Either they match exactly, or one or both can be any of
- * several scripts, and the intersection is not empty. If the
- * character is not a decimal digit, we are done with it. Otherwise,
- * it could still fail if it is from a different set of 10 than seen
- * already (or we may not have seen any, and we need to set the
- * sequence). If we have determined a single script and that script
- * only has one set of digits (almost all scripts are like that), then
- * this isn't a problem, as any digit must come from the same sequence.
- * The only scripts that have multiple sequences have been constructed
- * to be 0 in 'script_zeros[]'.
+ * run. That means that in most cases, it continues the script run.
+ * Either it and the run match exactly, or one or both can be in any of
+ * several scripts, and the intersection is not empty. But if the
+ * character is a decimal digit, we need further handling. If we
+ * haven't seen a digit before, it would establish what set of 10 all
+ * must come from; and if we have established a set, we need to check
+ * that this is in it.
*
- * Here we check if it is a digit. */
+ * But there are cases we can rule out without having to look up if
+ * this is a digit:
+ * a. All instances of [0-9] have been dealt with earlier.
+ * b. The next digit encoded by Unicode is 1600 code points further
+ * on, so if the code point in this loop iteration is less than
+ * that, it isn't a digit.
+ * c. Most scripts that have digits have a single set of 10. If
+ * we've encountered a digit in such a script, 'zero_of_run' is
+ * set to the code point (call it z) whose numeric value is 0.
+ * If the code point in this loop iteration is in the range
+ * z..z+9, it is in the script's set of 10, and we've actually
+ * handled it earlier in this function and won't reach this
+ * point. But, code points in that script that aren't in that
+ * range can't be digits, so we don't have to look any such up.
+ * We can tell if this script is such a one by looking at
+ * 'script_zeros[]' for it. It is non-zero iff it has a single
+ * set of digits. This rule doesn't apply if we haven't narrowed
+ * down the possible scripts to a single one yet. Nor if the
+ * zero of the run is '0', as that also hasn't narrowed things
+ * down completely */
if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
- && ( ( zero_of_run == 0
- || ( ( script_of_char >= 0
- && script_zeros[script_of_char] == 0)
- || intersection))))
+ && ( intersection
+ || script_of_char < 0 /* Also implies an intersection */
+ || zero_of_run == '0'
+ || script_zeros[script_of_char] == 0))
{
SSize_t range_zero_index;
range_zero_index = _invlist_search(decimals_invlist, cp);
&& ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
{
UV range_zero = decimals_array[range_zero_index];
- if (zero_of_run) {
+ if (SEEN_A_DIGIT) {
if (zero_of_run != range_zero) {
retval = FALSE;
break;