STMT_START { \
while (s < strend) { \
CODE \
- s += ((UTF8) ? UTF8SKIP(s) : 1); \
+ s += ((UTF8) \
+ ? UTF8_SAFE_SKIP(s, reginfo->strend) \
+ : 1); \
} \
} STMT_END
#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
if (COND) { \
FBC_CHECK_AND_TRY \
- s += ((UTF8) ? UTF8SKIP(s) : 1); \
+ s += ((UTF8) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1);\
previous_occurrence_end = s; \
} \
else { \
* of the one we're looking for. Knowing that, we can see right away if the
* next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
* don't accept the 2nd and succeeding adjacent occurrences */
/* FBC_CHECK_AND_TRY: when 'doevery' is set, or 's' differs from
 * 'previous_occurrence_end', jump to 'got_it' if reginfo->intuit is true or
 * regtry() succeeds at 's'. The replacement definition additionally requires
 * s <= reginfo->strend before regtry() is called. */
-#define FBC_CHECK_AND_TRY \
- if ( ( doevery \
- || s != previous_occurrence_end) \
- && (reginfo->intuit || regtry(reginfo, &s))) \
- { \
- goto got_it; \
+#define FBC_CHECK_AND_TRY \
+ if ( ( doevery \
+ || s != previous_occurrence_end) \
+ && ( reginfo->intuit \
+ || (s <= reginfo->strend && regtry(reginfo, &s)))) \
+ { \
+ goto got_it; \
}
\
if (COND) { \
FBC_CHECK_AND_TRY \
- s += UTF8SKIP(s); \
+ s += UTF8_SAFE_SKIP(s, reginfo->strend); \
previous_occurrence_end = s; \
} \
else { \
* string (which should be zero length without having to look at the string
* contents) */
/* REXEC_FBC_TRYIT: jump to 'got_it' if reginfo->intuit is true, or if 's' is
 * not beyond reginfo->strend and regtry() succeeds at 's'. The expansion ends
 * without a semicolon. The change below only removes a redundant pair of
 * parentheses around the condition. */
#define REXEC_FBC_TRYIT \
- if ((reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s)))) \
+ if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
goto got_it
/* The only difference between the BOUND and NBOUND cases is that
case ANYOFH:
if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
+ }
+ break;
+
+ case ANYOFHb:
+ if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
+
+ /* We know what the first byte of any matched string should be */
U8 first_byte = FLAGS(c);
- if (first_byte) { /* We know what the first byte of any matched
- string should be */
- REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
+ REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
- }
- else {
- REXEC_FBC_CLASS_SCAN(TRUE,
- reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
- }
+ }
+ break;
+
+ case ANYOFHr:
+ if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ ( inRANGE((U8) NATIVE_UTF8_TO_I8(*s),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+ && reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)));
}
break;
{
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
}
break;
}
}
/* Didn't match. Try at the next position (if there is one) */
- s += (utf8_target) ? UTF8SKIP(s) : 1;
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
goto got_it;
}
before = after;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. Everything is a GCB except between CR and
/* And, since this is a bound, it can match after the final
* character in the string */
- if ((reginfo->intuit || regtry(reginfo, &s))) {
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
goto got_it;
}
break;
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
goto got_it;
}
before = after;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
}
}
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
goto got_it;
}
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
goto got_it;
}
before = after;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
/* Here we are at the final position in the target string. The SB
* value is always true here, so matches, depending on other
* constraints */
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
goto got_it;
}
if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
if (UNLIKELY(s >= reginfo->strend)) {
break;
}
}
previous = before;
before = after;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
}
else { /* Not utf8. */
}
}
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
goto got_it;
}
}
LEAVE;
goto got_it;
}
- s = HOPc(s,1);
+ if (s < reginfo->strend) {
+ s = HOPc(s,1);
+ }
DEBUG_TRIE_EXECUTE_r({
Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
});
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(reginfo, &s)) goto got_it;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, strend);
while (s < strend && *s == ch)
s += UTF8SKIP(s);
}
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
- || ( ANYOF_FLAGS(scan) != 0
- && ANYOF_FLAGS(scan) != (U8) *locinput)
+ || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8((U8) *locinput)
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFHb:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ANYOF_FLAGS(scan) != (U8) *locinput
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFHr:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
|| ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
}
break;
- case NREFFL: /* /\g{name}/il */
+ case REFFLN: /* /\g{name}/il */
{ /* The capture buffer cases. The ones beginning with N for the
named buffers just convert to the equivalent numbered and
pretend they were called as the corresponding numbered buffer
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_nref;
- case NREFFA: /* /\g{name}/iaa */
+ case REFFAN: /* /\g{name}/iaa */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
type = REFFA;
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_nref;
- case NREFFU: /* /\g{name}/iu */
+ case REFFUN: /* /\g{name}/iu */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
type = REFFU;
utf8_fold_flags = 0;
goto do_nref;
- case NREFF: /* /\g{name}/i */
+ case REFFN: /* /\g{name}/i */
folder = foldEQ;
fold_array = PL_fold;
type = REFF;
utf8_fold_flags = 0;
goto do_nref;
- case NREF: /* /\g{name}/ */
+ case REFN: /* /\g{name}/ */
type = REF;
folder = NULL;
fold_array = NULL;
sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
break;
- case NGROUPP: /* (?(<name>)) */
+ case GROUPPN: /* (?(<name>)) */
/* reg_check_named_buff_matched returns 0 for no match */
sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
break;
case ANYOFH:
if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
- if (ANYOF_FLAGS(p)) { /* If we know the first byte of what
- matches, we can avoid calling reginclass
- */
- while ( hardcount < max
- && scan < this_eol
- && (U8) *scan == ANYOF_FLAGS(p)
- && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
- TRUE))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
+ while ( hardcount < max
+ && scan < this_eol
+ && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFHb:
+ if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */
+
+ /* we know the first byte must be the FLAGS field */
+ while ( hardcount < max
+ && scan < this_eol
+ && (U8) *scan == ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+ TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
}
- else while ( hardcount < max
- && scan < this_eol
- && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ }
+ break;
+
+ case ANYOFHr:
+ if (utf8_target) { /* ANYOFHr only can match UTF-8 targets */
+ while ( hardcount < max
+ && scan < this_eol
+ && inRANGE((U8) NATIVE_UTF8_TO_I8(*scan),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+ && NATIVE_UTF8_TO_I8((U8) *scan) >= ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
{
scan += UTF8SKIP(scan);
hardcount++;
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
dVAR;
- const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
+ const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHr))
+ ? 0
+ : ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
}
/* If this character is potentially in the bitmap, check it */
- if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
+ if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if ((flags
regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
eval_state->rex = rex;
+ eval_state->sv = reginfo->sv;
if (reginfo->sv) {
/* Make $_ available to executed code. */
SAVE_DEFSV;
DEFSV_set(reginfo->sv);
}
+ /* will be dec'd by S_cleanup_regmatch_info_aux */
+ SvREFCNT_inc_NN(reginfo->sv);
if (!(mg = mg_find_mglob(reginfo->sv))) {
/* prepare for quick setting of pos */
}
PL_curpm = eval_state->curpm;
+ SvREFCNT_dec(eval_state->sv);
}
PL_regmatch_state = aux->old_regmatch_state;
&& !prog->substrs->data[i].substr) {
SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
if (! sv_utf8_downgrade(sv, TRUE)) {
+ SvREFCNT_dec_NN(sv);
return FALSE;
}
if (SvVALID(prog->substrs->data[i].utf8_substr)) {