locinput = (p); \
SET_nextchr
-
-#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
- if (!swash_ptr) { \
- U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
- swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
- 1, 0, invlist, &flags); \
- assert(swash_ptr); \
- } \
- } STMT_END
-
-/* If in debug mode, we test that a known character properly matches */
-#ifdef DEBUGGING
-# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
- property_name, \
- invlist, \
- utf8_char_in_property) \
- LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
- assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
-#else
-# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
- property_name, \
- invlist, \
- utf8_char_in_property) \
- LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
-#endif
-
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
); \
regcpblow(cp)
+/* set the start and end positions of capture ix */
+#define CLOSE_CAPTURE(ix, s, e) \
+ rex->offs[ix].start = s; \
+ rex->offs[ix].end = e; \
+ if (ix > rex->lastparen) \
+ rex->lastparen = ix; \
+ rex->lastcloseparen = ix; \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)ix, \
+ (IV)rex->offs[ix].start, \
+ (IV)rex->offs[ix].end, \
+ (UV)rex->lastparen \
+ ))
+
#define UNWIND_PAREN(lp, lcp) \
+ DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
+ "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
+ depth, \
+ PTR2UV(rex), \
+ PTR2UV(rex->offs), \
+ (UV)(lp), \
+ (UV)(rex->lastparen), \
+ (UV)(lcp) \
+ )); \
for (n = rex->lastparen; n > lp; n--) \
rex->offs[n].end = -1; \
rex->lastparen = n; \
#define DECL_TRIE_TYPE(scan) \
const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
- trie_utf8l, trie_flu8 } \
+ trie_utf8l, trie_flu8, trie_flu8_latin } \
trie_type = ((scan->flags == EXACT) \
? (utf8_target ? trie_utf8 : trie_plain) \
: (scan->flags == EXACTL) \
? trie_utf8_exactfa_fold \
: trie_latin_utf8_exactfa_fold) \
: (scan->flags == EXACTFLU8 \
- ? trie_flu8 \
+ ? (utf8_target \
+ ? trie_flu8 \
+ : trie_flu8_latin) \
: (utf8_target \
? trie_utf8_fold \
- : trie_latin_utf8_fold)))
+ : trie_latin_utf8_fold)))
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
+/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
+ * 'foldbuf+sizeof(foldbuf)' */
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
case trie_flu8: \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
- if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
- _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
} \
goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
case trie_utf8_fold: \
do_trie_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
} else { \
- len = UTF8SKIP(uc); \
- uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \
+ uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
flags); \
+ len = UTF8SKIP(uc); \
skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_flu8_latin: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ goto do_trie_latin_utf8_fold; \
case trie_latin_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
/* FALLTHROUGH */ \
case trie_latin_utf8_fold: \
+ do_trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
} \
/* FALLTHROUGH */ \
case trie_utf8: \
- uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
break; \
case trie_plain: \
uvc = (UV)*uc; \
#ifdef DEBUGGING
static IV
S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
- IV cp_out = Perl__invlist_search(invlist, cp_in);
+ IV cp_out = _invlist_search(invlist, cp_in);
assert(cp_out >= 0);
return cp_out;
}
case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
/* UTF-8ness doesn't matter, so use 0 */
REXEC_FBC_FIND_NEXT_SCAN(0,
- (char *) find_next_masked((U8 *) s, (U8 *) strend, ARG(c), FLAGS(c)));
+ (char *) find_next_masked((U8 *) s, (U8 *) strend,
+ (U8) ARG(c), FLAGS(c)));
break;
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
}
points[pointpos++ % maxlen]= uc;
if (foldlen || uc < (U8*)strend) {
- REXEC_TRIE_READ_CHAR(trie_type, trie,
- widecharmap, uc,
- uscan, len, uvc, charid, foldlen,
- foldbuf, uniflags);
+ REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+ (U8 *) strend, uscan, len, uvc,
+ charid, foldlen, foldbuf,
+ uniflags);
DEBUG_TRIE_EXECUTE_r({
dump_exec_pos( (char *)uc, c, strend,
real_start, s, utf8_target, 0);
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
if (c1 > 255) {
- /* Load the folds hash, if not already done */
- SV** listp;
- if (! PL_utf8_foldclosures) {
- _load_PL_utf8_foldclosures();
+ const unsigned int * remaining_folds_to_list;
+ unsigned int first_folds_to;
+
+ /* Look up what code points (besides c1) fold to c1; e.g.,
+ * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
+ Size_t folds_to_count = _inverse_folds(c1,
+ &first_folds_to,
+ &remaining_folds_to_list);
+ if (folds_to_count == 0) {
+ c2 = c1; /* there is only a single character that could
+ match */
}
-
- /* The fold closures data structure is a hash with the keys
- * being the UTF-8 of every character that is folded to, like
- * 'k', and the values each an array of all code points that
- * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
- * Multi-character folds are not included */
- if ((! (listp = hv_fetch(PL_utf8_foldclosures,
- (char *) pat,
- UTF8SKIP(pat),
- FALSE))))
- {
- /* Not found in the hash, therefore there are no folds
- * containing it, so there is only a single character that
- * could match */
- c2 = c1;
+ else if (folds_to_count != 1) {
+ /* If there aren't exactly two folds to this (itself and
+ * another), it is outside the scope of this function */
+ use_chrtest_void = TRUE;
}
- else { /* Does participate in folds */
- AV* list = (AV*) *listp;
- if (av_tindex_skip_len_mg(list) != 1) {
-
- /* If there aren't exactly two folds to this, it is
- * outside the scope of this function */
- use_chrtest_void = TRUE;
- }
- else { /* There are two. Get them */
- SV** c_p = av_fetch(list, 0, FALSE);
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c1 = SvUV(*c_p);
-
- c_p = av_fetch(list, 1, FALSE);
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c2 = SvUV(*c_p);
-
- /* Folds that cross the 255/256 boundary are forbidden
- * if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and
- * one is ASCIII. Since the pattern character is above
- * 255, and its only other match is below 256, the only
- * legal match will be to itself. We have thrown away
- * the original, so have to compute which is the one
- * above 255. */
- if ((c1 < 256) != (c2 < 256)) {
- if ((OP(text_node) == EXACTFL
- && ! IN_UTF8_CTYPE_LOCALE)
- || ((OP(text_node) == EXACTFAA
- || OP(text_node) == EXACTFAA_NO_TRIE)
- && (isASCII(c1) || isASCII(c2))))
- {
- if (c1 < 256) {
- c1 = c2;
- }
- else {
- c2 = c1;
- }
- }
- }
+ else { /* There are two. We already have one, get the other */
+ c2 = first_folds_to;
+
+ /* Folds that cross the 255/256 boundary are forbidden if
+ * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
+ * ASCIII. The only other match to c1 is c2, and since c1
+ * is above 255, c2 better be as well under these
+ * circumstances. If it isn't, it means the only legal
+ * match of c1 is itself. */
+ if ( c2 < 256
+ && ( ( OP(text_node) == EXACTFL
+ && ! IN_UTF8_CTYPE_LOCALE)
+ || (( OP(text_node) == EXACTFAA
+ || OP(text_node) == EXACTFAA_NO_TRIE)
+ && (isASCII(c1) || isASCII(c2)))))
+ {
+ c2 = c1;
}
}
}
return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
}
+ case GCB_Maybe_Emoji_NonBreak:
+
+ {
+
+ /* Do not break within emoji modifier sequences or emoji zwj sequences.
+ GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
+ */
+ U8 * temp_pos = (U8 *) curpos;
+ GCB_enum prev;
+
+ do {
+ prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == GCB_Extend);
+
+ return prev != GCB_XPG_XX;
+ }
+
default:
break;
}
*/
if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
depth, PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target
- && nextchr >= 0 /* guard against negative EOS value in nextchr */
+ && ! NEXTCHR_IS_EOS
&& UTF8_IS_ABOVE_LATIN1(nextchr)
&& scan->flags == EXACTL)
{
{
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "%smatched empty string...%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
depth, PL_colors[4], PL_colors[5])
);
if (!trie->jump)
break;
} else {
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
depth, PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
/* HERE */
PerlIO_printf( Perl_debug_log,
- "%*s%sState: %4" UVxf " Accepted: %c ",
+ "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
INDENT_CHARS(depth), "", PL_colors[4],
(UV)state, (accepted ? 'Y' : 'N'));
});
if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
I32 offset;
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
- uscan, len, uvc, charid, foldlen,
- foldbuf, uniflags);
+ (U8 *) reginfo->strend, uscan,
+ len, uvc, charid, foldlen,
+ foldbuf, uniflags);
charcount++;
if (foldlen>0)
ST.longfold = TRUE;
}
DEBUG_TRIE_EXECUTE_r(
Perl_re_printf( aTHX_
- "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
+ "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
charid, uvc, (UV)state, PL_colors[5] );
);
}
}
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "%sgot %" IVdf " possible matches%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
depth,
PL_colors[4], (IV)ST.accepted, PL_colors[5] );
);
while (foldlen) {
if (!--chars)
break;
- uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
- uniflags);
+ uvc = utf8n_to_uvchr(uscan, foldlen, &len,
+ uniflags);
uscan += len;
foldlen -= len;
}
? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
SV *sv= tmp ? sv_newmortal() : NULL;
- Perl_re_exec_indentf( aTHX_ "%sonly one match left, short-circuiting: #%d <%s>%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
depth, PL_colors[4],
ST.nextword,
tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
classnum = (_char_class_number) FLAGS(scan);
switch (classnum) {
default:
- if (! (to_complement
+ if (! (to_complement
^ cBOOL(_invlist_contains_cp(
PL_XPosix_ptrs[classnum],
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) reginfo->strend,
NULL)))))
- {
- sayNO;
- }
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_SPACE:
+ if (! (to_complement
+ ^ cBOOL(is_XPERLSPACE_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_BLANK:
+ if (! (to_complement
+ ^ cBOOL(is_HORIZWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_XDIGIT:
+ if (! (to_complement
+ ^ cBOOL(is_XDIGIT_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_VERTSPACE:
+ if (! (to_complement
+ ^ cBOOL(is_VERTWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
+ case _CC_ENUM_ASCII:
+ if (! to_complement) {
+ sayNO;
+ }
break;
- case _CC_ENUM_SPACE:
- if (! (to_complement
- ^ cBOOL(is_XPERLSPACE_high(locinput))))
- {
- sayNO;
- }
- break;
- case _CC_ENUM_BLANK:
- if (! (to_complement
- ^ cBOOL(is_HORIZWS_high(locinput))))
- {
- sayNO;
- }
- break;
- case _CC_ENUM_XDIGIT:
- if (! (to_complement
- ^ cBOOL(is_XDIGIT_high(locinput))))
- {
- sayNO;
- }
- break;
- case _CC_ENUM_VERTSPACE:
- if (! (to_complement
- ^ cBOOL(is_VERTWS_high(locinput))))
- {
- sayNO;
- }
- break;
- case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
- case _CC_ENUM_ASCII:
- if (! to_complement) {
- sayNO;
- }
- break;
}
locinput += UTF8SKIP(locinput);
}
rei = RXi_GET(re);
DEBUG_EXECUTE_r(
debug_start_match(re_sv, utf8_target, locinput,
- reginfo->strend, "Matching embedded");
+ reginfo->strend, "EVAL/GOSUB: Matching embedded");
);
startpoint = rei->program + 1;
EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
if (n > maxopenparen)
maxopenparen = n;
DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
- "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
+ "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
depth,
PTR2UV(rex),
PTR2UV(rex->offs),
script_run_begin = (U8 *) locinput;
break;
-/* XXX really need to log other places start/end are set too */
-#define CLOSE_CAPTURE \
- rex->offs[n].start = rex->offs[n].start_tmp; \
- rex->offs[n].end = locinput - reginfo->strbeg; \
- DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
- "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
- depth, \
- PTR2UV(rex), \
- PTR2UV(rex->offs), \
- (UV)n, \
- (IV)rex->offs[n].start, \
- (IV)rex->offs[n].end \
- ))
case CLOSE: /* ) */
n = ARG(scan); /* which paren pair */
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
goto fake_end;
if ( OP(cursor)==CLOSE ){
n = ARG(cursor);
if ( n <= lastopen ) {
- CLOSE_CAPTURE;
- if (n > rex->lastparen)
- rex->lastparen = n;
- rex->lastcloseparen = n;
+ CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
+ locinput - reginfo->strbeg);
if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
break;
}
ST.cache_mask = 0;
- DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: matched %ld out of %d..%d\n",
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
depth, (long)n, min, max)
);
/* If degenerate A matches "", assume A done. */
if (locinput == cur_curlyx->u.curlyx.lastloc) {
- DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: empty match detected, trying continuation...\n",
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
depth)
);
goto do_whilem_B_max;
Newxz(aux->poscache, size, char);
}
DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
- "%swhilem: Detected a super-linear match, switching on caching%s...\n",
+ "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
PL_colors[4], PL_colors[5])
);
}
mask = 1 << (offset % 8);
offset /= 8;
if (reginfo->info_aux->poscache[offset] & mask) {
- DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: (cache) already tried at this position...\n",
+ DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
depth)
);
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
NOT_REACHED; /* NOTREACHED */
+ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
+ /* FALLTHROUGH */
case WHILEM_A_pre_fail: /* just failed to match even minimal A */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen);
- /* FALLTHROUGH */
- case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
REGCP_UNWIND(ST.lastcp);
regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
- DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "whilem: failed, trying continuation...\n",
+ DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
depth)
);
do_whilem_B_max:
CACHEsayNO;
}
- DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "trying longer...\n", depth)
+ DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
);
/* Try grabbing another A and see if it helps. */
cur_curlyx->u.curlyx.lastloc = locinput;
+ ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
+ maxopenparen);
+ REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_min,
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
locinput);
if (ST.me->flags) {
/* emulate CLOSE: mark current A as captured */
- I32 paren = ST.me->flags;
+ U32 paren = (U32)ST.me->flags;
if (ST.count) {
- rex->offs[paren].start
- = HOPc(locinput, -ST.alen) - reginfo->strbeg;
- rex->offs[paren].end = locinput - reginfo->strbeg;
- if ((U32)paren > rex->lastparen)
- rex->lastparen = paren;
- rex->lastcloseparen = paren;
+ CLOSE_CAPTURE(paren,
+ HOPc(locinput, -ST.alen) - reginfo->strbeg,
+ locinput - reginfo->strbeg);
}
else
rex->offs[paren].end = -1;
#define CURLY_SETPAREN(paren, success) \
if (paren) { \
if (success) { \
- rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
- rex->offs[paren].end = locinput - reginfo->strbeg; \
- if (paren > rex->lastparen) \
- rex->lastparen = paren; \
- rex->lastcloseparen = paren; \
+ CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
+ locinput - reginfo->strbeg); \
} \
else { \
rex->offs[paren].end = -1; \
maxopenparen = ST.paren;
ST.min = ARG1(scan); /* min to match */
ST.max = ARG2(scan); /* max to match */
+ scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
+ /* handle the single-char capture called as a GOSUB etc */
if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
{
- ST.min=1;
- ST.max=1;
+ char *li = locinput;
+ if (!regrepeat(rex, &li, scan, reginfo, 1))
+ sayNO;
+ SET_locinput(li);
+ goto fake_end;
}
- scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+
goto repeat;
case CURLY: /* /A{m,n}B/ where A is width 1 char */
}
NOT_REACHED; /* NOTREACHED */
- case CURLY_B_min_known_fail:
- /* failed to find B in a non-greedy match where c1,c2 valid */
+ case CURLY_B_min_fail:
+ /* failed to find B in a non-greedy match.
+ * Handles both cases where c1,c2 valid or not */
REGCP_UNWIND(ST.cp);
if (ST.paren) {
UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
}
- /* Couldn't or didn't -- move forward. */
- ST.oldloc = locinput;
- if (utf8_target)
- locinput += UTF8SKIP(locinput);
- else
- locinput++;
- ST.count++;
- curly_try_B_min_known:
- /* find the next place where 'B' could work, then call B */
- {
+
+ if (ST.c1 == CHRTEST_VOID) {
+ /* failed -- move forward one */
+ char *li = locinput;
+ if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+ sayNO;
+ }
+ locinput = li;
+ ST.count++;
+ if (!( ST.count <= ST.max
+ /* count overflow ? */
+ || (ST.max == REG_INFTY && ST.count > 0))
+ )
+ sayNO;
+ }
+ else {
int n;
+ /* Couldn't or didn't -- move forward. */
+ ST.oldloc = locinput;
+ if (utf8_target)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
+ ST.count++;
+
+ curly_try_B_min_known:
+ /* find the next place where 'B' could work, then call B */
if (utf8_target) {
n = (ST.oldloc == locinput) ? 0 : 1;
if (ST.c1 == ST.c2) {
sayNO;
assert(n == REG_INFTY || locinput == li);
}
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
- NOT_REACHED; /* NOTREACHED */
-
- case CURLY_B_min_fail:
- /* failed to find B in a non-greedy match where c1,c2 invalid */
- REGCP_UNWIND(ST.cp);
- if (ST.paren) {
- UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
- }
- /* failed -- move forward one */
- {
- char *li = locinput;
- if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
- sayNO;
- }
- locinput = li;
- }
- {
- ST.count++;
- if (ST.count <= ST.max || (ST.max == REG_INFTY &&
- ST.count > 0)) /* count overflow ? */
- {
- curly_try_B_min:
- CURLY_SETPAREN(ST.paren, ST.count);
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
- PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
- }
- }
- sayNO;
+ curly_try_B_min:
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
NOT_REACHED; /* NOTREACHED */
+
curly_try_B_max:
/* a successful greedy match: now try to match B */
- if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
- goto fake_end;
{
bool could_match = locinput < reginfo->strend;
st->u.eval.prev_eval = cur_eval;
cur_eval = CUR_EVAL.prev_eval;
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "EVAL trying tail ... (cur_eval=%p)\n",
+ Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
depth, cur_eval););
if ( nochange_depth )
nochange_depth--;
if (locinput < reginfo->till) {
DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
- "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
+ "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
PL_colors[4],
(long)(locinput - startpos),
(long)(reginfo->till - startpos),
case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
DEBUG_EXECUTE_r(
- Perl_re_exec_indentf( aTHX_ "%ssubpattern success...%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n",
depth, PL_colors[4], PL_colors[5]));
sayYES; /* Success! */
sv_commit = ST.mark_name;
DEBUG_EXECUTE_r({
- Perl_re_exec_indentf( aTHX_ "%ssetting cutpoint to mark:%" SVf "...%s\n",
+ Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
depth,
PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
});
}
break;
- /* For these character classes, the knowledge of how to handle
- * every code point is compiled in to Perl via a macro. This
- * code is written for making the loops as tight as possible.
- * It could be refactored to save space instead */
- case _CC_ENUM_SPACE:
- while (hardcount < max
- && scan < loceol
- && (to_complement
- ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
- case _CC_ENUM_BLANK:
- while (hardcount < max
- && scan < loceol
- && (to_complement
- ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
- case _CC_ENUM_XDIGIT:
- while (hardcount < max
- && scan < loceol
- && (to_complement
- ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
- case _CC_ENUM_VERTSPACE:
- while (hardcount < max
- && scan < loceol
- && (to_complement
- ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
- case _CC_ENUM_CNTRL:
- while (hardcount < max
- && scan < loceol
- && (to_complement
- ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
+ /* For the classes below, the knowledge of how to handle
+ * every code point is compiled in to Perl via a macro.
+ * This code is written for making the loops as tight as
+ * possible. It could be refactored to save space instead.
+ * */
+
+ case _CC_ENUM_SPACE:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement
+ ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_BLANK:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement
+ ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_XDIGIT:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement
+ ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_VERTSPACE:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement
+ ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
+ case _CC_ENUM_CNTRL:
+ while (hardcount < max
+ && scan < loceol
+ && (to_complement
+ ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ break;
}
}
break;
PERL_ARGS_ASSERT__IS_GRAPHEME;
- /* Unassigned code points are forbidden */
+ if ( UNLIKELY(UNICODE_IS_SUPER(cp))
+ || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
+ {
+ /* These are considered graphemes */
+ return TRUE;
+ }
+
+ /* Otherwise, unassigned code points are forbidden */
if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
_invlist_search(PL_Assigned_invlist, cp))))
{
/* What code point is the digit '0' of the script run? */
UV zero_of_run = 0;
+#define SEEN_A_DIGIT (zero_of_run != 0)
+
SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
SCX_enum script_of_char = SCX_INVALID;
retval = FALSE;
break;
}
- if (zero_of_run > 0) {
+ if (SEEN_A_DIGIT) {
if (zero_of_run != '0') {
retval = FALSE;
break;
/* If is within the range [+0 .. +9] of the script's zero, it also is a
* digit in that script. We can skip the rest of this code for this
* character. */
- if (UNLIKELY( zero_of_run > 0
+ if (UNLIKELY( SEEN_A_DIGIT
&& cp >= zero_of_run
&& cp - zero_of_run <= 9))
{
/* But Common contains several sets of digits. Only the '0' set
* can be part of another script. */
- if (zero_of_run > 0 && zero_of_run != '0') {
+ if (SEEN_A_DIGIT && zero_of_run != '0') {
retval = FALSE;
break;
}
* one to use, and the character is in that sequence. Now that we know
* the script, we can use script_zeros[] to directly find which
* sequence the script uses, except in a few cases it returns 0 */
- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+ if (UNLIKELY(zero_of_run == 0 && script_of_char >= 0)) {
zero_of_run = script_zeros[script_of_char];
}
/* If there is only a single script in common, set to that.
* Otherwise, use the intersection going forward */
Safefree(intersection);
+ intersection = NULL;
if (intersection_len == 1) {
script_of_run = script_of_char = new_overlap[0];
Safefree(new_overlap);
+ new_overlap = NULL;
}
else {
intersection = new_overlap;
scripts_match:
/* Here, the script of the character is compatible with that of the
- * run. Either they match exactly, or one or both can be any of
- * several scripts, and the intersection is not empty. If the
- * character is not a decimal digit, we are done with it. Otherwise,
- * it could still fail if it is from a different set of 10 than seen
- * already (or we may not have seen any, and we need to set the
- * sequence). If we have determined a single script and that script
- * only has one set of digits (almost all scripts are like that), then
- * this isn't a problem, as any digit must come from the same sequence.
- * The only scripts that have multiple sequences have been constructed
- * to be 0 in 'script_zeros[]'.
+ * run. That means that in most cases, it continues the script run.
+ * Either it and the run match exactly, or one or both can be in any of
+ * several scripts, and the intersection is not empty. But if the
+ * character is a decimal digit, we need further handling. If we
+ * haven't seen a digit before, it would establish what set of 10 all
+ * must come from; and if we have established a set, we need to check
+ * that this is in it.
*
- * Here we check if it is a digit. */
+ * But there are cases we can rule out without having to look up if
+ * this is a digit:
+ * a. All instances of [0-9] have been dealt with earlier.
+ * b. The next digit encoded by Unicode is 1600 code points further
+ * on, so if the code point in this loop iteration is less than
+ * that, it isn't a digit.
+ * c. Most scripts that have digits have a single set of 10. If
+ * we've encountered a digit in such a script, 'zero_of_run' is
+ * set to the code point (call it z) whose numeric value is 0.
+ * If the code point in this loop iteration is in the range
+ * z..z+9, it is in the script's set of 10, and we've actually
+ * handled it earlier in this function and won't reach this
+ * point. But, code points in that script that aren't in that
+ * range can't be digits, so we don't have to look any such up.
+ * We can tell if this script is such a one by looking at
+ * 'script_zeros[]' for it. It is non-zero iff it has a single
+ * set of digits. This rule doesn't apply if we haven't narrowed
+ * down the possible scripts to a single one yet. Nor if the
+ * zero of the run is '0', as that also hasn't narrowed things
+ * down completely */
if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
- && ( ( zero_of_run == 0
- || ( ( script_of_char >= 0
- && script_zeros[script_of_char] == 0)
- || intersection))))
+ && ( intersection
+ || script_of_char < 0 /* Also implies an intersection */
+ || zero_of_run == '0'
+ || script_zeros[script_of_char] == 0))
{
SSize_t range_zero_index;
range_zero_index = _invlist_search(decimals_invlist, cp);
&& ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
{
UV range_zero = decimals_array[range_zero_index];
- if (zero_of_run) {
+ if (SEEN_A_DIGIT) {
if (zero_of_run != range_zero) {
retval = FALSE;
break;