#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
+static const char non_utf8_target_but_utf8_required[]
= "Can't match, because target string needs to be in UTF-8\n";
#endif
/* now look for the 'other' substring if defined */
- if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
- : prog->substrs->data[other_ix].substr)
+ if (prog->substrs->data[other_ix].utf8_substr
+ || prog->substrs->data[other_ix].substr)
{
/* Take into account the "other" substring. */
char *last, *last1;
do_other_substr:
other = &prog->substrs->data[other_ix];
+ if (!utf8_target && !other->substr) {
+ if (!to_byte_substr(prog)) {
+ NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
+ }
+ }
/* if "other" is anchored:
* we've previously found a floating substr starting at check_at.
} else { \
uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
flags); \
- len = UTF8SKIP(uc); \
+ len = UTF8_SAFE_SKIP(uc, uc_end); \
skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
previous_occurrence_end = s; \
}
+/* This differs from the above macros in that it is passed a single byte that
+ * is known to begin the next occurrence of the thing being looked for in 's'.
+ * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
+ * at that position.
+ *
+ * If 'byte' does not occur again before 'strend', 's' is set to 'strend' and
+ * the loop ends.  Otherwise 'COND' is evaluated with 's' pointing at the byte
+ * that was found; when it is true FBC_CHECK_AND_TRY is run and
+ * 'previous_occurrence_end' is updated to the position just past the
+ * character.
+ *
+ * Every advance of 's' is clamped with UTF8_SAFE_SKIP, so that a final
+ * character whose lead byte claims more bytes than actually remain cannot
+ * push 's' (and hence 'previous_occurrence_end') beyond 'strend'.  A
+ * successful memchr guarantees s < strend at that point, so the clamped skip
+ * is at least one byte and the loop always makes progress. */
+#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
+ while (s < strend) { \
+ s = (char *) memchr(s, byte, strend -s); \
+ if (s == NULL) { \
+ s = (char *) strend; \
+ break; \
+ } \
+ \
+ if (COND) { \
+ FBC_CHECK_AND_TRY \
+ s += UTF8_SAFE_SKIP(s, strend); \
+ previous_occurrence_end = s; \
+ } \
+ else { \
+ s += UTF8_SAFE_SKIP(s, strend); \
+ } \
+ }
+
+
/* The three macros below are slightly different versions of the same logic.
*
* The first is for /a and /aa when the target string is UTF-8. This can only
break;
case ANYOFH:
- if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+ if (utf8_target) { /* Can't possibly match a non-UTF-8 target */
+ U8 first_byte = FLAGS(c);
+
+ if (first_byte) { /* We know what the first byte of any matched
+ string should be */
+ REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ }
+ else {
+ REXEC_FBC_CLASS_SCAN(TRUE,
+ reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ }
+ }
break;
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
+ || ( ANYOF_FLAGS(scan) != 0
+ && ANYOF_FLAGS(scan) != (U8) *locinput)
|| ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
}
break;
}
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
}
break;
* having to worry about one being shorter than the
* other, since the first byte of each gives the
* length of the character) */
- if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
- && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+ if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
+ reginfo->strend))
+ && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
+ reginfo->strend)))
{
/* simulate B failing */
DEBUG_OPTIMISE_r(
n = (ST.oldloc == locinput) ? 0 : 1;
if (ST.c1 == ST.c2) {
/* set n to utf8_distance(oldloc, locinput) */
- while (locinput <= ST.maxpos
- && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+ while ( locinput <= ST.maxpos
+ && locinput < loceol
+ && memNE(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend)))
{
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput,
+ reginfo->strend);
n++;
}
}
else {
/* set n to utf8_distance(oldloc, locinput) */
- while (locinput <= ST.maxpos
- && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
- && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+ while ( locinput <= ST.maxpos
+ && locinput < loceol
+ && memNE(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend))
+ && memNE(locinput, ST.c2_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend)))
{
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
n++;
}
}
if (ST.c1 != CHRTEST_VOID && could_match) {
if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
{
- could_match = memEQ(locinput,
- ST.c1_utf8,
- UTF8SKIP(locinput))
- || memEQ(locinput,
- ST.c2_utf8,
- UTF8SKIP(locinput));
+ could_match = memEQ(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput,
+ reginfo->strend))
+ || memEQ(locinput, ST.c2_utf8,
+ UTF8_SAFE_SKIP(locinput,
+ reginfo->strend));
}
else {
- could_match = UCHARAT(locinput) == ST.c1
- || UCHARAT(locinput) == ST.c2;
+ could_match = UCHARAT(locinput) == ST.c1
+ || UCHARAT(locinput) == ST.c2;
}
}
if (ST.c1 == CHRTEST_VOID || could_match) {
PERL_UINT_FAST8_T back_count = scan->flags;
char * s;
- /* Lookbehind ends here */
- ST.end = locinput;
+ /* Lookbehind can look beyond the current position */
+ ST.end = loceol;
/* ... and starts at the first place in the input that is in
* the range of the possible start positions */
if (c1 == c2) {
while (scan < this_eol
&& hardcount < max
- && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+ && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+ loceol)))
{
- scan += UTF8SKIP(scan);
+ scan += UTF8SKIP(c1_utf8);
hardcount++;
}
}
else {
while (scan < this_eol
&& hardcount < max
- && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
- || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+ && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+ loceol))
+ || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
+ loceol))))
{
- scan += UTF8SKIP(scan);
+ scan += UTF8_SAFE_SKIP(scan, loceol);
hardcount++;
}
}
break;
case ANYOFH:
- if (utf8_target) while ( hardcount < max
- && scan < this_eol
- && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+ if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
+ if (ANYOF_FLAGS(p)) { /* If we know the first byte of what
+ matches, we can avoid calling reginclass
+ */
+ while ( hardcount < max
+ && scan < this_eol
+ && (U8) *scan == ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
TRUE))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else while ( hardcount < max
+ && scan < this_eol
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
}
break;
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
dVAR;
- const char flags = ANYOF_FLAGS(n);
+ const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;