#include "invlist_inline.h"
#include "unicode_constants.h"
-#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
- "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+static const char b_utf8_locale_required[] =
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
+ " Assuming a UTF-8 locale";
+
+#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
+ STMT_START { \
+ if (! IN_UTF8_CTYPE_LOCALE) { \
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
+ b_utf8_locale_required); \
+ } \
+ } STMT_END
-static const char utf8_locale_required[] =
+static const char sets_utf8_locale_required[] =
"Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
+ STMT_START { \
+ if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
+ sets_utf8_locale_required); \
+ } \
+ } STMT_END
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
-static const char* const non_utf8_target_but_utf8_required
+static const char non_utf8_target_but_utf8_required[]
= "Can't match, because target string needs to be in UTF-8\n";
#endif
goto target; \
} STMT_END
-#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
-
#ifndef STATIC
#define STATIC static
#endif
: (U8*)(pos + off))
#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
-#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
-#define NEXTCHR_IS_EOS (nextchr < 0)
-
-#define SET_nextchr \
- nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
-
-#define SET_locinput(p) \
- locinput = (p); \
- SET_nextchr
-
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
I32 p;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGCPPUSH;
{
UV i;
U32 paren;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGCPPOP;
* Ideally this could be replaced by just an array of function pointers
* to the C library functions that implement the macros this calls.
* However, to compile, the precise function signatures are required, and
- * these may vary from platform to to platform. To avoid having to figure
+ * these may vary from platform to platform. To avoid having to figure
* out what those all are on each platform, I (khw) am using this method,
* which adds an extra layer of function call overhead (unless the C
* optimizer strips it away). But we don't particularly care about
* rules, ignoring any locale. So use the Unicode function if this class
* requires an inversion list, and use the Unicode macro otherwise. */
- dVAR;
PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
span_word |= span_word << 4;
/* That reduces the problem to what this function solves */
- return s + _variant_byte_number(span_word);
+ return s + variant_byte_number(span_word);
#endif
masked &= PERL_VARIANTS_WORD_MASK;
/* This reduces the problem to that solved by this function */
- s += _variant_byte_number(masked);
+ s += variant_byte_number(masked);
return s;
} while (s + PERL_WORDSIZE <= send);
masked |= masked << 1;
masked |= masked << 2;
masked |= masked << 4;
- return s + _variant_byte_number(masked);
+ return s + variant_byte_number(masked);
#endif
RXi_GET_DECL(prog,progi);
regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
regmatch_info *const reginfo = &reginfo_buf;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_RE_INTUIT_START;
PERL_UNUSED_ARG(flags);
/* now look for the 'other' substring if defined */
- if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
- : prog->substrs->data[other_ix].substr)
+ if (prog->substrs->data[other_ix].utf8_substr
+ || prog->substrs->data[other_ix].substr)
{
/* Take into account the "other" substring. */
char *last, *last1;
do_other_substr:
other = &prog->substrs->data[other_ix];
+ if (!utf8_target && !other->substr) {
+ if (!to_byte_substr(prog)) {
+ NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
+ }
+ }
/* if "other" is anchored:
* we've previously found a floating substr starting at check_at.
const U8* const str = (U8*)STRING(progi->regstclass);
/* XXX this value could be pre-computed */
- const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
+ const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
? (reginfo->is_utf8_pat
- ? utf8_distance(str + STR_LEN(progi->regstclass), str)
- : STR_LEN(progi->regstclass))
+ ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
+ : (SSize_t)STR_LEN(progi->regstclass))
: 1);
char * endpos;
char *s;
} else { \
uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
flags); \
- len = UTF8SKIP(uc); \
+ len = UTF8_SAFE_SKIP(uc, uc_end); \
skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
case trie_utf8l: \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
- _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
} \
/* FALLTHROUGH */ \
case trie_utf8: \
dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
startpos, doutf8, depth)
-#define REXEC_FBC_SCAN(UTF8, CODE) \
+#define REXEC_FBC_UTF8_SCAN(CODE) \
STMT_START { \
while (s < strend) { \
CODE \
- s += ((UTF8) ? UTF8SKIP(s) : 1); \
+ s += UTF8_SAFE_SKIP(s, reginfo->strend); \
} \
} STMT_END
-#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
+#define REXEC_FBC_NON_UTF8_SCAN(CODE) \
STMT_START { \
while (s < strend) { \
- REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
+ CODE \
+ s++; \
+ } \
+ } STMT_END
+
+#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
+ STMT_START { \
+ while (s < strend) { \
+ REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
+ } \
+ } STMT_END
+
+#define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
+ STMT_START { \
+ while (s < strend) { \
+ REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
} \
} STMT_END
-#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
+#define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
if (COND) { \
FBC_CHECK_AND_TRY \
- s += ((UTF8) ? UTF8SKIP(s) : 1); \
+ s += UTF8_SAFE_SKIP(s, reginfo->strend); \
previous_occurrence_end = s; \
} \
else { \
- s += ((UTF8) ? UTF8SKIP(s) : 1); \
+ s += UTF8SKIP(s); \
}
-#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
- if (utf8_target) { \
- REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
+#define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
+ if (COND) { \
+ FBC_CHECK_AND_TRY \
+ s++; \
+ previous_occurrence_end = s; \
} \
else { \
- REXEC_FBC_CLASS_SCAN(0, COND); \
+ s++; \
}
/* We keep track of where the next character should start after an occurrence
* of the one we're looking for. Knowing that, we can see right away if the
* next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
* don't accept the 2nd and succeeding adjacent occurrences */
-#define FBC_CHECK_AND_TRY \
- if ( ( doevery \
- || s != previous_occurrence_end) \
- && (reginfo->intuit || regtry(reginfo, &s))) \
- { \
- goto got_it; \
+#define FBC_CHECK_AND_TRY \
+ if ( ( doevery \
+ || s != previous_occurrence_end) \
+ && ( reginfo->intuit \
+ || (s <= reginfo->strend && regtry(reginfo, &s)))) \
+ { \
+ goto got_it; \
}
-/* This differs from the above macros in that it calls a function which returns
- * the next occurrence of the thing being looked for in 's'; and 'strend' if
- * there is no such occurrence. */
-#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
+/* These differ from the above macros in that they call a function which
+ * returns the next occurrence of the thing being looked for in 's'; and
+ * 'strend' if there is no such occurrence. */
+#define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
+ while (s < strend) { \
+ s = (f); \
+ if (s >= strend) { \
+ break; \
+ } \
+ \
+ FBC_CHECK_AND_TRY \
+ s += UTF8SKIP(s); \
+ previous_occurrence_end = s; \
+ }
+
+#define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
while (s < strend) { \
s = (f); \
if (s >= strend) { \
} \
\
FBC_CHECK_AND_TRY \
- s += (UTF8) ? UTF8SKIP(s) : 1; \
+ s++; \
previous_occurrence_end = s; \
}
-/* The three macros below are slightly different versions of the same logic.
+/* This differs from the above macros in that it is passed a single byte that
+ * is known to begin the next occurrence of the thing being looked for in 's'.
+ * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
+ * at that position. */
+#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
+ while (s < strend) { \
+ s = (char *) memchr(s, byte, strend -s); \
+ if (s == NULL) { \
+ s = (char *) strend; \
+ break; \
+ } \
+ \
+ if (COND) { \
+ FBC_CHECK_AND_TRY \
+ s += UTF8_SAFE_SKIP(s, reginfo->strend); \
+ previous_occurrence_end = s; \
+ } \
+ else { \
+ s += UTF8SKIP(s); \
+ } \
+ }
+
+/* The four macros below are slightly different versions of the same logic.
*
* The first is for /a and /aa when the target string is UTF-8. This can only
- * match ascii, but it must advance based on UTF-8. The other two handle the
- * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
- * for the boundary (or non-boundary) between a word and non-word character.
- * The utf8 and non-utf8 cases have the same logic, but the details must be
- * different. Find the "wordness" of the character just prior to this one, and
- * compare it with the wordness of this one. If they differ, we have a
- * boundary. At the beginning of the string, pretend that the previous
+ * match ascii, but it must advance based on UTF-8. The other three handle
+ * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
+ * looking for the boundary (or non-boundary) between a word and non-word
+ * character. The utf8 and non-utf8 cases have the same logic, but the details
+ * must be different. Find the "wordness" of the character just prior to this
+ * one, and compare it with the wordness of this one. If they differ, we have
+ * a boundary. At the beginning of the string, pretend that the previous
* character was a new-line.
*
* All these macros uncleanly have side-effects with each other and outside
* see if this tentative match actually works, and if so, to quit the loop
* here. And vice-versa if we are looking for a non-boundary.
*
- * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
- * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
+ * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
+ * REXEC_FBC_NON_UTF8_SCAN loops is a loop invariant, a bool giving the return of
* TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
* at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
* TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
tmp = TEST_NON_UTF8(tmp); \
- REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
+ REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
tmp = !tmp; \
IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
* TEST_UTF8 is a macro that for the same input code points returns identically
- * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
+ * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
+ * end pointer as well) */
#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
if (s == reginfo->strbeg) { \
tmp = '\n'; \
0, UTF8_ALLOW_DEFAULT); \
} \
tmp = TEST_UV(tmp); \
- REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
+ REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
tmp = !tmp; \
IF_SUCCESS; \
} \
);
-/* Like the above two macros. UTF8_CODE is the complete code for handling
- * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
- * macros below */
-#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
- if (utf8_target) { \
- UTF8_CODE \
- } \
- else { /* Not utf8 */ \
- tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
- tmp = TEST_NON_UTF8(tmp); \
- REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
- if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
- IF_SUCCESS; \
- tmp = !tmp; \
- } \
- else { \
- IF_FAIL; \
- } \
- ); \
- } \
+/* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
+ * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
+ * set-up by the FBC_BOUND, etc macros below */
+#define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+ UTF8_CODE; \
/* Here, things have been set up by the previous code so that tmp is the \
- * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
- * utf8ness of the target). We also have to check if this matches against \
- * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
- * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
- * string */ \
+ * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
+ * against the EOS, which we treat as a \n */ \
if (tmp == ! TEST_NON_UTF8('\n')) { \
IF_SUCCESS; \
} \
IF_FAIL; \
}
+/* Same as the macro above, but the target isn't UTF-8 */
+#define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
+ tmp = TEST_NON_UTF8(tmp); \
+ REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
+ if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
+ IF_SUCCESS; \
+ tmp = !tmp; \
+ } \
+ else { \
+ IF_FAIL; \
+ } \
+ ); \
+ /* Here, things have been set up by the previous code so that tmp is \
+ * the return of TEST_NON_UTF8(s-1). We also have to check if this \
+ * matches against the EOS, which we treat as a \n */ \
+ if (tmp == ! TEST_NON_UTF8('\n')) { \
+ IF_SUCCESS; \
+ } \
+ else { \
+ IF_FAIL; \
+ }
+
/* This is the macro to use when we want to see if something that looks like it
- * could match, actually does, and if so exits the loop */
-#define REXEC_FBC_TRYIT \
- if ((reginfo->intuit || regtry(reginfo, &s))) \
+ * could match, actually does, and if so exits the loop. It needs to be used
+ * only for bounds checking macros, as it allows for matching beyond the end of
+ * string (which should be zero length without having to look at the string
+ * contents) */
+#define REXEC_FBC_TRYIT \
+ if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
goto got_it
/* The only difference between the BOUND and NBOUND cases is that
* The TEST_FOO parameters are for operating on different forms of input, but
* all should be ones that return identically for the same underlying code
* points */
-#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
- FBC_BOUND_COMMON( \
- FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
+
+#define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+ FBC_BOUND_COMMON_UTF8( \
+ FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
-#define FBC_BOUND_A(TEST_NON_UTF8) \
- FBC_BOUND_COMMON( \
- FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
- TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+#define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_UTF8( \
+ FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
+ TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
+ FBC_BOUND_COMMON_UTF8( \
+ FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
+ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
-#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
- FBC_BOUND_COMMON( \
- FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
- TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+#define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
-#define FBC_NBOUND_A(TEST_NON_UTF8) \
- FBC_BOUND_COMMON( \
- FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
+#define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_UTF8( \
+ FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+#define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
+ FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+
#ifdef DEBUGGING
static IV
S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
const char *strend, regmatch_info *reginfo)
{
- dVAR;
/* TRUE if x+ need not match at just the 1st pos of run of x's */
const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
PERL_ARGS_ASSERT_FIND_BYCLASS;
- /* We know what class it must start with. */
- switch (OP(c)) {
- case ANYOFPOSIXL:
- case ANYOFL:
+ /* We know what class it must start with. The case statements below have
+ * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
+ * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
+ * ('p8' and 'pb'). */
+ switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
+
+ case ANYOFPOSIXL_t8_pb:
+ case ANYOFPOSIXL_t8_p8:
+ case ANYOFL_t8_pb:
+ case ANYOFL_t8_p8:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
- if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
- }
+ /* FALLTHROUGH */
+
+ case ANYOFD_t8_pb:
+ case ANYOFD_t8_p8:
+ case ANYOF_t8_pb:
+ case ANYOF_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
+ break;
+
+ case ANYOFPOSIXL_tb_pb:
+ case ANYOFPOSIXL_tb_p8:
+ case ANYOFL_tb_pb:
+ case ANYOFL_tb_p8:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
/* FALLTHROUGH */
- case ANYOFD:
- case ANYOF:
- if (utf8_target) {
- REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
- reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
- }
- else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+
+ case ANYOFD_tb_pb:
+ case ANYOFD_tb_p8:
+ case ANYOF_tb_pb:
+ case ANYOF_tb_p8:
+ if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
/* We know that s is in the bitmap range since the target isn't
* UTF-8, so what happens for out-of-range values is not relevant,
* so exclude that from the flags */
- REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
+ 0));
}
else {
- REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
}
break;
- case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
- /* UTF-8ness doesn't matter, so use 0 */
- REXEC_FBC_FIND_NEXT_SCAN(0,
- (char *) find_next_masked((U8 *) s, (U8 *) strend,
- (U8) ARG(c), FLAGS(c)));
+ case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
+ case ANYOFM_tb_p8:
+ REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
+ (char *) find_next_masked((U8 *) s, (U8 *) strend,
+ (U8) ARG(c), FLAGS(c)));
break;
- case NANYOFM:
- REXEC_FBC_FIND_NEXT_SCAN(0,
- (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
- (U8) ARG(c), FLAGS(c)));
+ case ANYOFM_t8_pb:
+ case ANYOFM_t8_p8:
+ /* UTF-8ness doesn't matter for correctness, because this node only
+ * matches UTF-8 invariants. But we use the UTF-8 scan anyway for
+ * performance reasons, as otherwise we would have to examine all the
+ * continuation characters */
+ REXEC_FBC_UTF8_FIND_NEXT_SCAN(
+ (char *) find_next_masked((U8 *) s, (U8 *) strend,
+ (U8) ARG(c), FLAGS(c)));
break;
- case ANYOFH:
- if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
- reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+ case NANYOFM_tb_pb:
+ case NANYOFM_tb_p8:
+ REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
+ (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
+ (U8) ARG(c), FLAGS(c)));
break;
- case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
- assert(! is_utf8_pat);
- /* FALLTHROUGH */
- case EXACTFAA:
- if (is_utf8_pat) {
- utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
- |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE;
- goto do_exactf_utf8;
+ case NANYOFM_t8_pb:
+ case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
+ variants. */
+ REXEC_FBC_UTF8_FIND_NEXT_SCAN(
+ (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
+ (U8) ARG(c), FLAGS(c)));
+ break;
+
+ /* These nodes all require at least one code point to be in UTF-8 to
+ * match */
+ case ANYOFH_tb_pb:
+ case ANYOFH_tb_p8:
+ case ANYOFHb_tb_pb:
+ case ANYOFHb_tb_p8:
+ case ANYOFHr_tb_pb:
+ case ANYOFHr_tb_p8:
+ case ANYOFHs_tb_pb:
+ case ANYOFHs_tb_p8:
+ case EXACTFLU8_tb_pb:
+ case EXACTFLU8_tb_p8:
+ case EXACTFU_REQ8_tb_pb:
+ case EXACTFU_REQ8_tb_p8:
+ break;
+
+ case ANYOFH_t8_pb:
+ case ANYOFH_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
+ break;
+
+ case ANYOFHb_t8_pb:
+ case ANYOFHb_t8_p8:
+ {
+ /* We know what the first byte of any matched string should be. */
+ U8 first_byte = FLAGS(c);
+
+ REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
+ reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
}
- else if (utf8_target) {
+ break;
- /* Here, and elsewhere in this file, the reason we can't consider a
- * non-UTF-8 pattern already folded in the presence of a UTF-8
- * target is because any MICRO SIGN in the pattern won't be folded.
- * Since the fold of the MICRO SIGN requires UTF-8 to represent, we
- * can consider a non-UTF-8 pattern folded when matching a
- * non-UTF-8 target */
- utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
- goto do_exactf_utf8;
+ case ANYOFHr_t8_pb:
+ case ANYOFHr_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ ( inRANGE(NATIVE_UTF8_TO_I8(*s),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
+ && reginclass(prog, c, (U8*)s, (U8*) strend,
+ 1 /* is utf8 */)));
+ break;
+
+ case ANYOFHs_t8_pb:
+ case ANYOFHs_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ ( strend -s >= FLAGS(c)
+ && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
+ && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
+ break;
+
+ case ANYOFR_tb_pb:
+ case ANYOFR_tb_p8:
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ break;
+
+ case ANYOFR_t8_pb:
+ case ANYOFR_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+ (U8 *) strend,
+ NULL),
+ ANYOFRbase(c), ANYOFRdelta(c))));
+ break;
+
+ case ANYOFRb_tb_pb:
+ case ANYOFRb_tb_p8:
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
+ ANYOFRbase(c), ANYOFRdelta(c)));
+ break;
+
+ case ANYOFRb_t8_pb:
+ case ANYOFRb_t8_p8:
+ { /* We know what the first byte of any matched string should be */
+ U8 first_byte = FLAGS(c);
+
+ REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
+ withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+ (U8 *) strend,
+ NULL),
+ ANYOFRbase(c), ANYOFRdelta(c)));
}
+ break;
+
+ case EXACTFAA_tb_pb:
/* Latin1 folds are not affected by /a, except it excludes the sharp s,
* which these functions don't handle anyway */
folder = foldEQ_latin1_s2_folded;
goto do_exactf_non_utf8;
- case EXACTF: /* This node only generated for non-utf8 patterns */
- assert(! is_utf8_pat);
- if (utf8_target) {
- goto do_exactf_utf8;
- }
+ case EXACTF_tb_pb:
fold_array = PL_fold;
folder = foldEQ;
goto do_exactf_non_utf8;
- case EXACTFL:
+ case EXACTFL_tb_pb:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
+
+ if (IN_UTF8_CTYPE_LOCALE) {
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_exactf_utf8;
}
+
fold_array = PL_fold_locale;
folder = foldEQ_locale;
goto do_exactf_non_utf8;
- case EXACTFUP: /* Problematic even though pattern isn't UTF-8. Use
- full functionality normally not done except for
- UTF-8 */
- assert(! is_utf8_pat);
- goto do_exactf_utf8;
-
- case EXACTFLU8:
- if (! utf8_target) { /* All code points in this node require
- UTF-8 to express. */
- break;
- }
- utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
- | FOLDEQ_S2_FOLDS_SANE;
- goto do_exactf_utf8;
-
- case EXACTFU_ONLY8:
- if (! utf8_target) {
- break;
- }
- assert(is_utf8_pat);
- utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
- goto do_exactf_utf8;
-
- case EXACTFU:
- if (is_utf8_pat || utf8_target) {
- utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
- goto do_exactf_utf8;
- }
-
- /* Any 'ss' in the pattern should have been replaced by regcomp,
- * so we don't have to worry here about this single special case
- * in the Latin1 range */
+ case EXACTFU_tb_pb:
+ /* Any 'ss' in the pattern should have been replaced by regcomp, so we
+ * don't have to worry here about this single special case in the
+ * Latin1 range */
fold_array = PL_fold_latin1;
folder = foldEQ_latin1_s2_folded;
/* FALLTHROUGH */
- do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
- are no glitches with fold-length differences
- between the target string and pattern */
+ do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
+ are no glitches with fold-length differences
+ between the target string and pattern */
- /* The idea in the non-utf8 EXACTF* cases is to first find the
- * first character of the EXACTF* node and then, if necessary,
+ /* The idea in the non-utf8 EXACTF* cases is to first find the first
+ * character of the EXACTF* node and then, if necessary,
* case-insensitively compare the full text of the node. c1 is the
* first character. c2 is its fold. This logic will not work for
- * Unicode semantics and the german sharp ss, which hence should
- * not be compiled into a node that gets here. */
- pat_string = STRING(c);
- ln = STR_LEN(c); /* length to match in octets/bytes */
-
- /* We know that we have to match at least 'ln' bytes (which is the
- * same as characters, since not utf8). If we have to match 3
- * characters, and there are only 2 availabe, we know without
- * trying that it will fail; so don't start a match past the
- * required minimum number from the far end */
+ * Unicode semantics and the german sharp ss, which hence should not be
+ * compiled into a node that gets here. */
+ pat_string = STRINGs(c);
+ ln = STR_LENs(c); /* length to match in octets/bytes */
+
+ /* We know that we have to match at least 'ln' bytes (which is the same
+ * as characters, since not utf8). If we have to match 3 characters,
+ * and there are only 2 available, we know without trying that it will
+ * fail; so don't start a match past the required minimum number from
+ * the far end */
e = HOP3c(strend, -((SSize_t)ln), s);
if (e < s)
break;
}
break;
- do_exactf_utf8:
- {
- unsigned expansion;
-
- /* If one of the operands is in utf8, we can't use the simpler folding
- * above, due to the fact that many different characters can have the
- * same fold, or portion of a fold, or different- length fold */
- pat_string = STRING(c);
- ln = STR_LEN(c); /* length to match in octets/bytes */
- pat_end = pat_string + ln;
- lnc = is_utf8_pat /* length to match in characters */
- ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
- : ln;
-
- /* We have 'lnc' characters to match in the pattern, but because of
- * multi-character folding, each character in the target can match
- * up to 3 characters (Unicode guarantees it will never exceed
- * this) if it is utf8-encoded; and up to 2 if not (based on the
- * fact that the Latin 1 folds are already determined, and the
- * only multi-char fold in that range is the sharp-s folding to
- * 'ss'. Thus, a pattern character can match as little as 1/3 of a
- * string character. Adjust lnc accordingly, rounding up, so that
- * if we need to match at least 4+1/3 chars, that really is 5. */
- expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
- lnc = (lnc + expansion - 1) / expansion;
-
- /* As in the non-UTF8 case, if we have to match 3 characters, and
- * only 2 are left, it's guaranteed to fail, so don't start a
- * match that would require us to go beyond the end of the string
- */
- e = HOP3c(strend, -((SSize_t)lnc), s);
-
- /* XXX Note that we could recalculate e to stop the loop earlier,
- * as the worst case expansion above will rarely be met, and as we
- * go along we would usually find that e moves further to the left.
- * This would happen only after we reached the point in the loop
- * where if there were no expansion we should fail. Unclear if
- * worth the expense */
-
- while (s <= e) {
- char *my_strend= (char *)strend;
- if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
- pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
- && (reginfo->intuit || regtry(reginfo, &s)) )
- {
- goto got_it;
- }
- s += (utf8_target) ? UTF8SKIP(s) : 1;
- }
- break;
- }
+ case EXACTFAA_tb_p8:
+ case EXACTFAA_t8_p8:
+ utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
+ |FOLDEQ_S2_ALREADY_FOLDED
+ |FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf_utf8;
- case BOUNDL:
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (FLAGS(c) != TRADITIONAL_BOUND) {
- if (! IN_UTF8_CTYPE_LOCALE) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
- B_ON_NON_UTF8_LOCALE_IS_WRONG);
- }
- goto do_boundu;
- }
+ case EXACTFAA_NO_TRIE_tb_pb:
+ case EXACTFAA_NO_TRIE_t8_pb:
+ case EXACTFAA_t8_pb:
- FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
- break;
+ /* Here, and elsewhere in this file, the reason we can't consider a
+ * non-UTF-8 pattern already folded in the presence of a UTF-8 target
+ * is because any MICRO SIGN in the pattern won't be folded. Since the
+ * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
+ * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
+ utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+ goto do_exactf_utf8;
- case NBOUNDL:
+ case EXACTFL_tb_p8:
+ case EXACTFL_t8_pb:
+ case EXACTFL_t8_p8:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (FLAGS(c) != TRADITIONAL_BOUND) {
- if (! IN_UTF8_CTYPE_LOCALE) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
- B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ utf8_fold_flags = FOLDEQ_LOCALE;
+ goto do_exactf_utf8;
+
+ case EXACTFLU8_t8_pb:
+ case EXACTFLU8_t8_p8:
+ utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf_utf8;
+
+ case EXACTFU_REQ8_t8_p8:
+ utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ goto do_exactf_utf8;
+
+ case EXACTFU_tb_p8:
+ case EXACTFU_t8_pb:
+ case EXACTFU_t8_p8:
+ utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ goto do_exactf_utf8;
+
+ /* The following are problematic even though pattern isn't UTF-8. Use
+ * full functionality normally not done except for UTF-8. */
+ case EXACTF_t8_pb:
+ case EXACTFUP_tb_pb:
+ case EXACTFUP_t8_pb:
+
+ do_exactf_utf8:
+ {
+ unsigned expansion;
+
+ /* If one of the operands is in utf8, we can't use the simpler
+ * folding above, due to the fact that many different characters
+ * can have the same fold, or portion of a fold, or different-
+ * length fold */
+ pat_string = STRINGs(c);
+ ln = STR_LENs(c); /* length to match in octets/bytes */
+ pat_end = pat_string + ln;
+ lnc = is_utf8_pat /* length to match in characters */
+ ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
+ : ln;
+
+ /* We have 'lnc' characters to match in the pattern, but because of
+ * multi-character folding, each character in the target can match
+ * up to 3 characters (Unicode guarantees it will never exceed
+ * this) if it is utf8-encoded; and up to 2 if not (based on the
+ * fact that the Latin 1 folds are already determined, and the only
+ * multi-char fold in that range is the sharp-s folding to 'ss').
+ * Thus, a pattern character can match as little as 1/3 of a string
+ * character. Adjust lnc accordingly, rounding up, so that if we
+ * need to match at least 4+1/3 chars, that really is 5. */
+ expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
+ lnc = (lnc + expansion - 1) / expansion;
+
+ /* As in the non-UTF8 case, if we have to match 3 characters, and
+ * only 2 are left, it's guaranteed to fail, so don't start a match
+ * that would require us to go beyond the end of the string */
+ e = HOP3c(strend, -((SSize_t)lnc), s);
+
+ /* XXX Note that we could recalculate e to stop the loop earlier,
+ * as the worst case expansion above will rarely be met, and as we
+ * go along we would usually find that e moves further to the left.
+ * This would happen only after we reached the point in the loop
+ * where if there were no expansion we should fail. Unclear if
+ * worth the expense */
+
+ while (s <= e) {
+ char *my_strend= (char *)strend;
+ if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
+ pat_string, NULL, ln, is_utf8_pat,
+ utf8_fold_flags)
+ && (reginfo->intuit || regtry(reginfo, &s)) )
+ {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
}
- goto do_nboundu;
}
-
- FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
break;
- case BOUND: /* regcomp.c makes sure that this only has the traditional \b
- meaning */
+ case BOUNDA_tb_pb:
+ case BOUNDA_tb_p8:
+ case BOUND_tb_pb: /* /d without utf8 target is /a */
+ case BOUND_tb_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
assert(FLAGS(c) == TRADITIONAL_BOUND);
- FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
+ FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
break;
- case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
- meaning */
+ case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
+ case BOUNDA_t8_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
assert(FLAGS(c) == TRADITIONAL_BOUND);
- FBC_BOUND_A(isWORDCHAR_A);
+ FBC_BOUND_A_UTF8(isWORDCHAR_A);
break;
- case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
- meaning */
+ case NBOUNDA_tb_pb:
+ case NBOUNDA_tb_p8:
+ case NBOUND_tb_pb: /* /d without utf8 target is /a */
+ case NBOUND_tb_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
assert(FLAGS(c) == TRADITIONAL_BOUND);
- FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
+ FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
break;
- case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
- meaning */
+ case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
+ case NBOUNDA_t8_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
assert(FLAGS(c) == TRADITIONAL_BOUND);
- FBC_NBOUND_A(isWORDCHAR_A);
+ FBC_NBOUND_A_UTF8(isWORDCHAR_A);
break;
- case NBOUNDU:
+ case NBOUNDU_tb_pb:
+ case NBOUNDU_tb_p8:
if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
- FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
+ FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
+ break;
+ }
+
+ to_complement = 1;
+ goto do_boundu_non_utf8;
+
+ case NBOUNDL_tb_pb:
+ case NBOUNDL_tb_p8:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
break;
}
- do_nboundu:
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
to_complement = 1;
- /* FALLTHROUGH */
+ goto do_boundu_non_utf8;
- case BOUNDU:
- do_boundu:
- switch((bound_type) FLAGS(c)) {
- case TRADITIONAL_BOUND:
- FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
- break;
- case GCB_BOUND:
- if (s == reginfo->strbeg) {
- if (reginfo->intuit || regtry(reginfo, &s))
- {
- goto got_it;
- }
+ case BOUNDL_tb_pb:
+ case BOUNDL_tb_p8:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
+ break;
+ }
- /* Didn't match. Try at the next position (if there is one) */
- s += (utf8_target) ? UTF8SKIP(s) : 1;
- if (UNLIKELY(s >= reginfo->strend)) {
- break;
- }
- }
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
- if (utf8_target) {
- GCB_enum before = getGCB_VAL_UTF8(
- reghop3((U8*)s, -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
- while (s < strend) {
- GCB_enum after = getGCB_VAL_UTF8((U8*) s,
- (U8*) reginfo->strend);
- if ( (to_complement ^ isGCB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- utf8_target))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- before = after;
- s += UTF8SKIP(s);
- }
- }
- else { /* Not utf8. Everything is a GCB except between CR and
- LF */
- while (s < strend) {
- if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
- || UCHARAT(s) != '\n'))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- s++;
- }
- }
+ goto do_boundu_non_utf8;
+
+ case BOUNDU_tb_pb:
+ case BOUNDU_tb_p8:
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
+ break;
+ }
+
+ do_boundu_non_utf8:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s))
+ {
+ goto got_it;
+ }
+
+ /* Didn't match. Try at the next position (if there is one) */
+ s++;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
- /* And, since this is a bound, it can match after the final
- * character in the string */
- if ((reginfo->intuit || regtry(reginfo, &s))) {
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND: /* Should have already been handled */
+ assert(0);
+ break;
+
+ case GCB_BOUND:
+ /* Not utf8. Everything is a GCB except between CR and LF */
+ while (s < strend) {
+ if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
goto got_it;
}
- break;
+ s++;
+ }
- case LB_BOUND:
- if (s == reginfo->strbeg) {
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ break;
+
+ case LB_BOUND:
+ {
+ LB_enum before = getLB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ LB_enum after = getLB_VAL_CP((U8) *s);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 0 /* target not utf8 */ )
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
- if (UNLIKELY(s >= reginfo->strend)) {
- break;
- }
+ before = after;
+ s++;
}
+ }
- if (utf8_target) {
- LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
- -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
- while (s < strend) {
- LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
- if (to_complement ^ isLB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target)
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- before = after;
- s += UTF8SKIP(s);
- }
- }
- else { /* Not utf8. */
- LB_enum before = getLB_VAL_CP((U8) *(s -1));
- while (s < strend) {
- LB_enum after = getLB_VAL_CP((U8) *s);
- if (to_complement ^ isLB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target)
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- before = after;
- s++;
- }
- }
+ break;
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
+ case SB_BOUND:
+ {
+ SB_enum before = getSB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ SB_enum after = getSB_VAL_CP((U8) *s);
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 0 /* target not utf8 */ ))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
}
+ }
- break;
+ break;
- case SB_BOUND:
- if (s == reginfo->strbeg) {
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ case WB_BOUND:
+ {
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ WB_enum after = getWB_VAL_CP((U8) *s);
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 0 /* target not utf8 */ ))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
- if (UNLIKELY(s >= reginfo->strend)) {
- break;
- }
+ previous = before;
+ before = after;
+ s++;
}
+ }
+ }
- if (utf8_target) {
- SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
- -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
- while (s < strend) {
- SB_enum after = getSB_VAL_UTF8((U8*) s,
- (U8*) reginfo->strend);
- if ((to_complement ^ isSB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- before = after;
- s += UTF8SKIP(s);
- }
- }
- else { /* Not utf8. */
- SB_enum before = getSB_VAL_CP((U8) *(s -1));
- while (s < strend) {
- SB_enum after = getSB_VAL_CP((U8) *s);
- if ((to_complement ^ isSB(before,
+ /* Here we are at the final position in the target string, which is a
+ * boundary by definition, so matches, depending on other constraints.
+ * */
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+
+ break;
+
+ case BOUNDL_t8_pb:
+ case BOUNDL_t8_p8:
+ /* \b or \b{...} under /l with a UTF-8 target. The traditional \b is
+ * scanned directly with the locale-aware word-character tests; any
+ * other bound type is handed to the shared UTF-8 boundary scanner. */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
+ isWORDCHAR_LC_utf8_safe);
+ break;
+ }
+
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
+
+ /* This is the non-negated form, so 'to_complement' must not be set
+ * here; only the NBOUNDL cases set it. Setting it would make the
+ * scan below look for non-boundaries, exactly as NBOUNDL does. */
+ goto do_boundu_utf8;
+
+ case NBOUNDL_t8_pb:
+ case NBOUNDL_t8_p8:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
+ isWORDCHAR_LC_utf8_safe);
+ break;
+ }
+
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
+
+ to_complement = 1;
+ goto do_boundu_utf8;
+
+ case NBOUND_t8_pb:
+ case NBOUND_t8_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ /* FALLTHROUGH */
+
+ case NBOUNDU_t8_pb:
+ case NBOUNDU_t8_p8:
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
+ isWORDCHAR_utf8_safe);
+ break;
+ }
+
+ to_complement = 1;
+ goto do_boundu_utf8;
+
+ case BOUND_t8_pb:
+ case BOUND_t8_p8:
+ /* regcomp.c makes sure that these only have the traditional \b
+ * meaning. */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
+ /* FALLTHROUGH */
+
+ case BOUNDU_t8_pb:
+ case BOUNDU_t8_p8:
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
+ break;
+ }
+
+ do_boundu_utf8:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s))
+ {
+ goto got_it;
+ }
+
+ /* Didn't match. Try at the next position (if there is one) */
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND: /* Should have already been handled */
+ assert(0);
+ break;
+
+ case GCB_BOUND:
+ {
+ GCB_enum before = getGCB_VAL_UTF8(
+ reghop3((U8*)s, -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ( (to_complement ^ isGCB(before,
after,
(U8*) reginfo->strbeg,
(U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- before = after;
- s++;
+ 1 /* target is utf8 */ ))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
}
+ before = after;
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
+ }
+ break;
- /* Here are at the final position in the target string. The SB
- * value is always true here, so matches, depending on other
- * constraints */
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
+ case LB_BOUND:
+ {
+ LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ LB_enum after = getLB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 1 /* target is utf8 */ )
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
+ }
- break;
+ break;
- case WB_BOUND:
- if (s == reginfo->strbeg) {
- if (reginfo->intuit || regtry(reginfo, &s)) {
+ case SB_BOUND:
+ {
+ SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ SB_enum after = getSB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 1 /* target is utf8 */ ))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
goto got_it;
}
- s += (utf8_target) ? UTF8SKIP(s) : 1;
- if (UNLIKELY(s >= reginfo->strend)) {
- break;
- }
+ before = after;
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
+ }
- if (utf8_target) {
- /* We are at a boundary between char_sub_0 and char_sub_1.
- * We also keep track of the value for char_sub_-1 as we
- * loop through the line. Context may be needed to make a
- * determination, and if so, this can save having to
- * recalculate it */
- WB_enum previous = WB_UNKNOWN;
- WB_enum before = getWB_VAL_UTF8(
- reghop3((U8*)s,
- -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
- while (s < strend) {
- WB_enum after = getWB_VAL_UTF8((U8*) s,
- (U8*) reginfo->strend);
- if ((to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- previous = before;
- before = after;
- s += UTF8SKIP(s);
- }
- }
- else { /* Not utf8. */
- WB_enum previous = WB_UNKNOWN;
- WB_enum before = getWB_VAL_CP((U8) *(s -1));
- while (s < strend) {
- WB_enum after = getWB_VAL_CP((U8) *s);
- if ((to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
- && (reginfo->intuit || regtry(reginfo, &s)))
- {
- goto got_it;
- }
- previous = before;
- before = after;
- s++;
+ break;
+
+ case WB_BOUND:
+ {
+ /* We are at a boundary between char_sub_0 and char_sub_1.
+ * We also keep track of the value for char_sub_-1 as we
+ * loop through the line. Context may be needed to make a
+ * determination, and if so, this can save having to
+ * recalculate it */
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_UTF8(
+ reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ WB_enum after = getWB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ 1 /* target is utf8 */ ))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
}
+ previous = before;
+ before = after;
+ s += UTF8_SAFE_SKIP(s, reginfo->strend);
}
+ }
+ }
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
+ /* Here we are at the final position in the target string, which is a
+ * boundary by definition, so matches, depending on other constraints.
+ * */
+
+ if ( reginfo->intuit
+ || (s <= reginfo->strend && regtry(reginfo, &s)))
+ {
+ goto got_it;
}
break;
- case LNBREAK:
- REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
- is_LNBREAK_latin1_safe(s, strend)
- );
+ case LNBREAK_t8_pb:
+ case LNBREAK_t8_p8:
+ REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
+ break;
+
+ case LNBREAK_tb_pb:
+ case LNBREAK_tb_p8:
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
break;
- /* The argument to all the POSIX node types is the class number to pass to
- * _generic_isCC() to build a mask for searching in PL_charclass[] */
+ /* The argument to all the POSIX node types is the class number to pass
+ * to _generic_isCC() to build a mask for searching in PL_charclass[] */
- case NPOSIXL:
+ case NPOSIXL_t8_pb:
+ case NPOSIXL_t8_p8:
to_complement = 1;
/* FALLTHROUGH */
- case POSIXL:
+ case POSIXL_t8_pb:
+ case POSIXL_t8_p8:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
- to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
+ (U8 *) strend)));
break;
- case NPOSIXD:
+ case NPOSIXL_tb_pb:
+ case NPOSIXL_tb_p8:
to_complement = 1;
/* FALLTHROUGH */
- case POSIXD:
- if (utf8_target) {
- goto posix_utf8;
- }
- goto posixa;
-
- case NPOSIXA:
- if (utf8_target) {
- /* The complement of something that matches only ASCII matches all
- * non-ASCII, plus everything in ASCII that isn't in the class. */
- REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
- || ! _generic_isCC_A(*s, FLAGS(c)));
- break;
- }
+ case POSIXL_tb_pb:
+ case POSIXL_tb_p8:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
+ break;
- to_complement = 1;
- goto posixa;
+ case NPOSIXA_t8_pb:
+ case NPOSIXA_t8_p8:
+ /* The complement of something that matches only ASCII matches all
+ * non-ASCII, plus everything in ASCII that isn't in the class. */
+ REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
+ || ! _generic_isCC_A(*s, FLAGS(c)));
+ break;
- case POSIXA:
+ case POSIXA_t8_pb:
+ case POSIXA_t8_p8:
/* Don't need to worry about utf8, as it can match only a single
* byte invariant character. But we do anyway for performance reasons,
* as otherwise we would have to examine all the continuation
* characters */
- if (utf8_target) {
- REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
- break;
- }
+ REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
+ break;
+
+ case NPOSIXD_tb_pb:
+ case NPOSIXD_tb_p8:
+ case NPOSIXA_tb_pb:
+ case NPOSIXA_tb_p8:
+ to_complement = 1;
+ /* FALLTHROUGH */
- posixa:
- REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
+ case POSIXD_tb_pb:
+ case POSIXD_tb_p8:
+ case POSIXA_tb_pb:
+ case POSIXA_tb_p8:
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
break;
- case NPOSIXU:
+ case NPOSIXU_tb_pb:
+ case NPOSIXU_tb_p8:
to_complement = 1;
/* FALLTHROUGH */
- case POSIXU:
- if (! utf8_target) {
- REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
+ case POSIXU_tb_pb:
+ case POSIXU_tb_p8:
+ REXEC_FBC_NON_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(_generic_isCC(*s,
FLAGS(c))));
- }
- else {
+ break;
- posix_utf8:
- classnum = (_char_class_number) FLAGS(c);
- switch (classnum) {
- default:
- REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
+ case NPOSIXD_t8_pb:
+ case NPOSIXD_t8_p8:
+ case NPOSIXU_t8_pb:
+ case NPOSIXU_t8_p8:
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXD_t8_pb:
+ case POSIXD_t8_p8:
+ case POSIXU_t8_pb:
+ case POSIXU_t8_p8:
+ classnum = (_char_class_number) FLAGS(c);
+ switch (classnum) {
+ default:
+ REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(_invlist_contains_cp(
- PL_XPosix_ptrs[classnum],
- utf8_to_uvchr_buf((U8 *) s,
+ PL_XPosix_ptrs[classnum],
+ utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL))));
- break;
- case _CC_ENUM_SPACE:
- REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
+ break;
+
+ case _CC_ENUM_SPACE:
+ REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
- break;
+ break;
- case _CC_ENUM_BLANK:
- REXEC_FBC_CLASS_SCAN(1,
+ case _CC_ENUM_BLANK:
+ REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
- break;
+ break;
- case _CC_ENUM_XDIGIT:
- REXEC_FBC_CLASS_SCAN(1,
- to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
- break;
+ case _CC_ENUM_XDIGIT:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
+ break;
- case _CC_ENUM_VERTSPACE:
- REXEC_FBC_CLASS_SCAN(1,
- to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
- break;
+ case _CC_ENUM_VERTSPACE:
+ REXEC_FBC_UTF8_CLASS_SCAN(
+ to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
+ break;
- case _CC_ENUM_CNTRL:
- REXEC_FBC_CLASS_SCAN(1,
+ case _CC_ENUM_CNTRL:
+ REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
- break;
- }
+ break;
}
break;
- case AHOCORASICKC:
- case AHOCORASICK:
+ case AHOCORASICKC_tb_pb:
+ case AHOCORASICKC_tb_p8:
+ case AHOCORASICKC_t8_pb:
+ case AHOCORASICKC_t8_p8:
+ case AHOCORASICK_tb_pb:
+ case AHOCORASICK_tb_p8:
+ case AHOCORASICK_t8_pb:
+ case AHOCORASICK_t8_p8:
{
DECL_TRIE_TYPE(c);
/* what trie are we using right now */
reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
- reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
+ reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
const char *last_start = strend - trie->minlen;
U8 *bitmap=NULL;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
/* We can't just allocate points here. We need to wrap it in
* an SV so it gets freed properly if there is a croak while
if( state==1 ) {
if ( bitmap ) {
DEBUG_TRIE_EXECUTE_r(
- if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
- dump_exec_pos( (char *)uc, c, strend, real_start,
+ if ( uc <= (U8*)last_start
+ && !BITMAP_TEST(bitmap,*uc) )
+ {
+ dump_exec_pos( (char *)uc, c, strend,
+ real_start,
(char *)uc, utf8_target, 0 );
Perl_re_printf( aTHX_
" Scanning for legal start char...\n");
}
);
if (utf8_target) {
- while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ while ( uc <= (U8*)last_start
+ && !BITMAP_TEST(bitmap,*uc) )
+ {
uc += UTF8SKIP(uc);
}
} else {
- while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ while ( uc <= (U8*)last_start
+ && ! BITMAP_TEST(bitmap,*uc) )
+ {
uc++;
}
}
}
if ( word ) {
- U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
+ U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
+ % maxlen ];
if (!leftmost || lpos < leftmost) {
DEBUG_r(accepted_word=word);
leftmost= lpos;
DEBUG_TRIE_EXECUTE_r({
if (failed)
- dump_exec_pos( (char *)uc, c, strend, real_start,
+ dump_exec_pos((char *)uc, c, strend, real_start,
s, utf8_target, 0 );
Perl_re_printf( aTHX_
"%sState: %4" UVxf ", word=%" UVxf,
}
}
if ( aho->states[ state ].wordnum ) {
- U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
+ U8 *lpos = points[ (pointpos
+ - trie->wordinfo[aho->states[ state ]
+ .wordnum].len) % maxlen ];
if (!leftmost || lpos < leftmost) {
DEBUG_r(accepted_word=aho->states[ state ].wordnum);
leftmost = lpos;
if (leftmost) {
s = (char*)leftmost;
DEBUG_TRIE_EXECUTE_r({
- Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
+ Perl_re_printf( aTHX_ "Matches word #%" UVxf
+ " at position %" IVdf ". Trying full"
+ " pattern...\n",
(UV)accepted_word, (IV)(s - real_start)
);
});
LEAVE;
goto got_it;
}
- s = HOPc(s,1);
+ if (s < reginfo->strend) {
+ s = HOPc(s,1);
+ }
DEBUG_TRIE_EXECUTE_r({
- Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
+ Perl_re_printf( aTHX_
+ "Pattern failed. Looking for new start"
+ " point...\n");
});
} else {
DEBUG_TRIE_EXECUTE_r(
LEAVE;
}
break;
- default:
+
+ case EXACTFU_REQ8_t8_pb:
+ case EXACTFUP_tb_p8:
+ case EXACTFUP_t8_p8:
+ case EXACTF_tb_p8:
+ case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
+ case EXACTFAA_NO_TRIE_tb_p8:
+ case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
+ patterns */
+ assert(0);
+
+ default:
Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
- }
+ } /* End of switch on node type */
+
return 0;
+
got_it:
return s;
}
regmatch_info *const reginfo = ®info_buf;
regexp_paren_pair *swap = NULL;
I32 oldsave;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGEXEC_FLAGS;
PERL_UNUSED_ARG(data);
if (!startpos ||
((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
{
- DEBUG_r(Perl_re_printf( aTHX_
+ DEBUG_GPOS_r(Perl_re_printf( aTHX_
"fail: ganch-gofs before earliest possible start\n"));
return 0;
}
minlen = prog->minlen;
if ((startpos + minlen) > strend || startpos < strbeg) {
- DEBUG_r(Perl_re_printf( aTHX_
- "Regex match can't succeed, so not even tried\n"));
+ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
+ "Regex match can't succeed, so not even tried\n"));
return 0;
}
RXp_MATCH_UTF8_set(prog, utf8_target);
prog->offs[0].start = s - strbeg;
prog->offs[0].end = utf8_target
- ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
+ ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
: s - strbeg + prog->minlenret;
if ( !(flags & REXEC_NOT_FIRST) )
S_reg_set_capture_string(aTHX_ rx,
we switch it back; otherwise we leave it swapped.
*/
swap = prog->offs;
- /* do we need a save destructor here for eval dies? */
+ /* avoid leak if we die, or clean up anyway if match completes */
+ SAVEFREEPV(swap);
Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
"rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
to_utf8_substr(prog);
}
ch = SvPVX_const(prog->anchored_utf8)[0];
- REXEC_FBC_SCAN(0, /* 0=>not-utf8 */
+ REXEC_FBC_UTF8_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(reginfo, &s)) goto got_it;
- s += UTF8SKIP(s);
+ s += UTF8_SAFE_SKIP(s, strend);
while (s < strend && *s == ch)
s += UTF8SKIP(s);
}
}
}
ch = SvPVX_const(prog->anchored_substr)[0];
- REXEC_FBC_SCAN(0, /* 0=>not-utf8 */
+ REXEC_FBC_NON_UTF8_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(reginfo, &s)) goto got_it;
goto phooey;
}
- DEBUG_BUFFERS_r(
- if (swap)
- Perl_re_exec_indentf( aTHX_
- "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
- 0,
- PTR2UV(prog),
- PTR2UV(swap)
- );
- );
- Safefree(swap);
-
/* clean up; this will trigger destructors that will free all slabs
* above the current one, and cleanup the regmatch_info_aux
* and regmatch_info_aux_eval sructs */
DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
PL_colors[4], PL_colors[5]));
- /* clean up; this will trigger destructors that will free all slabs
- * above the current one, and cleanup the regmatch_info_aux
- * and regmatch_info_aux_eval sructs */
-
- LEAVE_SCOPE(oldsave);
-
if (swap) {
- /* we failed :-( roll it back */
+ /* we failed :-( roll it back.
+ * Since the swap buffer will be freed on scope exit which follows
+ * shortly, restore the old captures by copying 'swap's original
+ * data to the new offs buffer
+ */
DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
- "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
+ "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
0,
PTR2UV(prog),
PTR2UV(prog->offs),
PTR2UV(swap)
));
- Safefree(prog->offs);
- prog->offs = swap;
+
+ Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
}
+
+ /* clean up; this will trigger destructors that will free all slabs
+ * above the current one, and cleanup the regmatch_info_aux
+ * and regmatch_info_aux_eval structs */
+
+ LEAVE_SCOPE(oldsave);
+
return 0;
}
U32 depth = 0; /* used by REGCP_SET */
#endif
RXi_GET_DECL(prog,progi);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGTRY;
result = regmatch(reginfo, *startposp, progi->program + 1);
if (result != -1) {
prog->offs[0].end = result;
- return 1;
- }
- if (reginfo->cutpoint)
- *startposp= reginfo->cutpoint;
- REGCP_UNWIND(lastcp);
- return 0;
-}
-
-
-#define sayYES goto yes
-#define sayNO goto no
-#define sayNO_SILENT goto no_silent
-
-/* we dont use STMT_START/END here because it leads to
- "unreachable code" warnings, which are bogus, but distracting. */
-#define CACHEsayNO \
- if (ST.cache_mask) \
- reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
- sayNO
-
-/* this is used to determine how far from the left messages like
- 'failed...' are printed in regexec.c. It should be set such that
- messages are inline with the regop output that created them.
-*/
-#define REPORT_CODE_OFF 29
-#define INDENT_CHARS(depth) ((int)(depth) % 20)
-#ifdef DEBUGGING
-int
-Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
-{
- va_list ap;
- int result;
- PerlIO *f= Perl_debug_log;
- PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
- va_start(ap, depth);
- PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
- result = PerlIO_vprintf(f, fmt, ap);
- va_end(ap);
- return result;
-}
-#endif /* DEBUGGING */
-
-
-#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
-#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
-#define CHRTEST_NOT_A_CP_1 -999
-#define CHRTEST_NOT_A_CP_2 -998
-
-/* grab a new slab and return the first slot in it */
-
-STATIC regmatch_state *
-S_push_slab(pTHX)
-{
- regmatch_slab *s = PL_regmatch_slab->next;
- if (!s) {
- Newx(s, 1, regmatch_slab);
- s->prev = PL_regmatch_slab;
- s->next = NULL;
- PL_regmatch_slab->next = s;
- }
- PL_regmatch_slab = s;
- return SLAB_FIRST(s);
-}
-
-
-/* push a new state then goto it */
-
-#define PUSH_STATE_GOTO(state, node, input) \
- pushinput = input; \
- scan = node; \
- st->resume_state = state; \
- goto push_state;
-
-/* push a new state with success backtracking, then goto it */
-
-#define PUSH_YES_STATE_GOTO(state, node, input) \
- pushinput = input; \
- scan = node; \
- st->resume_state = state; \
- goto push_yes_state;
-
-
-
-
-/*
-
-regmatch() - main matching routine
-
-This is basically one big switch statement in a loop. We execute an op,
-set 'next' to point the next op, and continue. If we come to a point which
-we may need to backtrack to on failure such as (A|B|C), we push a
-backtrack state onto the backtrack stack. On failure, we pop the top
-state, and re-enter the loop at the state indicated. If there are no more
-states to pop, we return failure.
-
-Sometimes we also need to backtrack on success; for example /A+/, where
-after successfully matching one A, we need to go back and try to
-match another one; similarly for lookahead assertions: if the assertion
-completes successfully, we backtrack to the state just before the assertion
-and then carry on. In these cases, the pushed state is marked as
-'backtrack on success too'. This marking is in fact done by a chain of
-pointers, each pointing to the previous 'yes' state. On success, we pop to
-the nearest yes state, discarding any intermediate failure-only states.
-Sometimes a yes state is pushed just to force some cleanup code to be
-called at the end of a successful match or submatch; e.g. (??{$re}) uses
-it to free the inner regex.
-
-Note that failure backtracking rewinds the cursor position, while
-success backtracking leaves it alone.
-
-A pattern is complete when the END op is executed, while a subpattern
-such as (?=foo) is complete when the SUCCESS op is executed. Both of these
-ops trigger the "pop to last yes state if any, otherwise return true"
-behaviour.
-
-A common convention in this function is to use A and B to refer to the two
-subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
-the subpattern to be matched possibly multiple times, while B is the entire
-rest of the pattern. Variable and state names reflect this convention.
-
-The states in the main switch are the union of ops and failure/success of
-substates associated with with that op. For example, IFMATCH is the op
-that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
-'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
-successfully matched A and IFMATCH_A_fail is a state saying that we have
-just failed to match A. Resume states always come in pairs. The backtrack
-state we push is marked as 'IFMATCH_A', but when that is popped, we resume
-at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
-on success or failure.
-
-The struct that holds a backtracking state is actually a big union, with
-one variant for each major type of op. The variable st points to the
-top-most backtrack struct. To make the code clearer, within each
-block of code we #define ST to alias the relevant union.
-
-Here's a concrete example of a (vastly oversimplified) IFMATCH
-implementation:
-
- switch (state) {
- ....
-
-#define ST st->u.ifmatch
-
- case IFMATCH: // we are executing the IFMATCH op, (?=A)B
- ST.foo = ...; // some state we wish to save
- ...
- // push a yes backtrack state with a resume value of
- // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
- // first node of A:
- PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
- // NOTREACHED
-
- case IFMATCH_A: // we have successfully executed A; now continue with B
- next = B;
- bar = ST.foo; // do something with the preserved value
- break;
-
- case IFMATCH_A_fail: // A failed, so the assertion failed
- ...; // do some housekeeping, then ...
- sayNO; // propagate the failure
-
-#undef ST
-
- ...
- }
-
-For any old-timers reading this who are familiar with the old recursive
-approach, the code above is equivalent to:
-
- case IFMATCH: // we are executing the IFMATCH op, (?=A)B
- {
- int foo = ...
- ...
- if (regmatch(A)) {
- next = B;
- bar = foo;
- break;
- }
- ...; // do some housekeeping, then ...
- sayNO; // propagate the failure
+ return 1;
}
+ if (reginfo->cutpoint)
+ *startposp= reginfo->cutpoint;
+ REGCP_UNWIND(lastcp);
+ return 0;
+}
-The topmost backtrack state, pointed to by st, is usually free. If you
-want to claim it, populate any ST.foo fields in it with values you wish to
-save, then do one of
-
- PUSH_STATE_GOTO(resume_state, node, newinput);
- PUSH_YES_STATE_GOTO(resume_state, node, newinput);
-
-which sets that backtrack state's resume value to 'resume_state', pushes a
-new free entry to the top of the backtrack stack, then goes to 'node'.
-On backtracking, the free slot is popped, and the saved state becomes the
-new free state. An ST.foo field in this new top state can be temporarily
-accessed to retrieve values, but once the main loop is re-entered, it
-becomes available for reuse.
-
-Note that the depth of the backtrack stack constantly increases during the
-left-to-right execution of the pattern, rather than going up and down with
-the pattern nesting. For example the stack is at its maximum at Z at the
-end of the pattern, rather than at X in the following:
-
- /(((X)+)+)+....(Y)+....Z/
-
-The only exceptions to this are lookahead/behind assertions and the cut,
-(?>A), which pop all the backtrack states associated with A before
-continuing.
-
-Backtrack state structs are allocated in slabs of about 4K in size.
-PL_regmatch_state and st always point to the currently active state,
-and PL_regmatch_slab points to the slab currently containing
-PL_regmatch_state. The first time regmatch() is called, the first slab is
-allocated, and is never freed until interpreter destruction. When the slab
-is full, a new one is allocated and chained to the end. At exit from
-regmatch(), slabs allocated since entry are freed.
-
+/* this is used to determine how far from the left messages like
+ 'failed...' are printed in regexec.c. It should be set such that
+ messages are inline with the regop output that created them.
*/
-
-
-#define DEBUG_STATE_pp(pp) \
- DEBUG_STATE_r({ \
- DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
- Perl_re_printf( aTHX_ \
- "%*s" pp " %s%s%s%s%s\n", \
- INDENT_CHARS(depth), "", \
- PL_reg_name[st->resume_state], \
- ((st==yes_state||st==mark_state) ? "[" : ""), \
- ((st==yes_state) ? "Y" : ""), \
- ((st==mark_state) ? "M" : ""), \
- ((st==yes_state||st==mark_state) ? "]" : "") \
- ); \
- });
+#define REPORT_CODE_OFF 29 /* width of the blank left margin printed before a trace message */
+#define INDENT_CHARS(depth) ((int)(depth) % 20) /* indent width for a given depth; wraps back to 0 every 20 levels */
+#ifdef DEBUGGING
+/* Write one piece of regex-execution trace output to Perl_debug_log.
+ *
+ * A fixed prefix is printed first: REPORT_CODE_OFF blank columns, the
+ * backtrack depth between '|' bars, and INDENT_CHARS(depth) blanks of
+ * indentation.  The caller's printf-style 'fmt' and its variadic arguments
+ * are then formatted after that prefix.
+ *
+ * Returns the value returned by PerlIO_vprintf() for the caller's message;
+ * the prefix is not included in that value. */
+int
+Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
+{
+    PerlIO * const log_fh = Perl_debug_log;
+    va_list args;
+    int ret;
+
+    PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
+
+    /* margin, depth counter, then depth-based indentation */
+    PerlIO_printf(log_fh, "%*s|%4" UVuf "| %*s",
+                  REPORT_CODE_OFF, "",
+                  (UV)depth,
+                  INDENT_CHARS(depth), "");
+
+    /* now the caller's own message */
+    va_start(args, depth);
+    ret = PerlIO_vprintf(log_fh, fmt, args);
+    va_end(args);
+
+    return ret;
+}
+#endif /* DEBUGGING */
+/* grab a new slab and return the first slot in it */
-#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
+STATIC regmatch_state *
+S_push_slab(pTHX)
+{
+    /* Advance PL_regmatch_slab to the slab chained after the current one,
+     * allocating and linking a fresh slab if the chain ends here, and hand
+     * back the first state slot of that slab. */
+    regmatch_slab *slab = PL_regmatch_slab->next;
+
+    if (slab == NULL) {
+        /* end of the chain: append a new slab after the current one */
+        Newx(slab, 1, regmatch_slab);
+        slab->next = NULL;
+        slab->prev = PL_regmatch_slab;
+        PL_regmatch_slab->next = slab;
+    }
+
+    PL_regmatch_slab = slab;
+    return SLAB_FIRST(slab);
+}
#ifdef DEBUGGING
return 0;
}
+#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
+#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
+#define CHRTEST_NOT_A_CP_1 -999 /* initial value of c1 in S_setup_EXACTISH_ST_c1_c2(); not a code point */
+#define CHRTEST_NOT_A_CP_2 -998 /* initial value of c2 there; differs from _1, presumably so unset c1 and c2 never compare equal -- TODO confirm */
static bool
S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
{
- /* This function determines if there are one or two characters that match
- * the first character of the passed-in EXACTish node <text_node>, and if
- * so, returns them in the passed-in pointers.
+ /* This function determines if there are zero, one, two, or more characters
+ * that match the first character of the passed-in EXACTish node
+ * <text_node>, and if there are one or two, it returns them in the
+ * passed-in pointers.
*
* If it determines that no possible character in the target string can
* match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
UV c1 = (UV)CHRTEST_NOT_A_CP_1;
UV c2 = (UV)CHRTEST_NOT_A_CP_2;
bool use_chrtest_void = FALSE;
- const bool is_utf8_pat = reginfo->is_utf8_pat;
+ const bool utf8_pat = reginfo->is_utf8_pat;
/* Used when we have both utf8 input and utf8 output, to avoid converting
* to/from code points */
bool utf8_has_been_setup = FALSE;
- dVAR;
U8 *pat = (U8*)STRING(text_node);
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ const U8 op = OP(text_node);
- if ( OP(text_node) == EXACT
- || OP(text_node) == EXACT_ONLY8
- || OP(text_node) == EXACTL)
- {
+ if (! isEXACTFish(OP(text_node))) {
/* In an exact node, only one thing can be matched, that first
* character. If both the pat and the target are UTF-8, we can just
* copy the input to the output, avoiding finding the code point of
* that character */
- if (!is_utf8_pat) {
+ if (! utf8_pat) {
+ assert(! isEXACT_REQ8(OP(text_node)));
c2 = c1 = *pat;
}
else if (utf8_target) {
Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
utf8_has_been_setup = TRUE;
}
+ else if (isEXACT_REQ8(OP(text_node))) {
+ return FALSE; /* Can only match UTF-8 target */
+ }
else {
c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
}
}
else { /* an EXACTFish node */
- U8 *pat_end = pat + STR_LEN(text_node);
+ U8 *pat_end = pat + STR_LENs(text_node);
/* An EXACTFL node has at least some characters unfolded, because what
* they match is not known until now. So, now is the time to fold
* fold. But, in such a pattern only locale-problematic characters
* aren't folded, so we can skip this completely if the first character
* in the node isn't one of the tricky ones */
- if (OP(text_node) == EXACTFL) {
+ if (op == EXACTFL) {
- if (! is_utf8_pat) {
+ if (! utf8_pat) {
if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
{
folded[0] = folded[1] = 's';
}
}
- if ( ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
- || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
+ if ( ( utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
+ || (!utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
{
/* Multi-character folds require more context to sort out. Also
* PL_utf8_foldclosures used below doesn't handle them, so have to
use_chrtest_void = TRUE;
}
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
- c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
+ c1 = utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
if ( UNLIKELY(PL_in_utf8_turkic_locale)
- && OP(text_node) == EXACTFL
+ && op == EXACTFL
&& UNLIKELY( c1 == 'i' || c1 == 'I'
|| c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
|| c1 == LATIN_SMALL_LETTER_DOTLESS_I))
}
}
else if (c1 > 255) {
- const unsigned int * remaining_folds;
- unsigned int first_fold;
+ const U32 * remaining_folds;
+ U32 first_fold;
/* Look up what code points (besides c1) fold to c1; e.g.,
* [ 'K', KELVIN_SIGN ] both fold to 'k'. */
* circumstances. If it isn't, it means the only legal
* match of c1 is itself. */
if ( c2 < 256
- && ( ( OP(text_node) == EXACTFL
+ && ( ( op == EXACTFL
&& ! IN_UTF8_CTYPE_LOCALE)
- || (( OP(text_node) == EXACTFAA
- || OP(text_node) == EXACTFAA_NO_TRIE)
+ || (( op == EXACTFAA
+ || op == EXACTFAA_NO_TRIE)
&& (isASCII(c1) || isASCII(c2)))))
{
c2 = c1;
else /* Here, c1 is <= 255 */
if ( utf8_target
&& HAS_NONLATIN1_FOLD_CLOSURE(c1)
- && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
- && ( ( OP(text_node) != EXACTFAA
- && OP(text_node) != EXACTFAA_NO_TRIE)
+ && ( ! (op == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
+ && ( ( op != EXACTFAA
+ && op != EXACTFAA_NO_TRIE)
|| ! isASCII(c1)))
{
/* Here, there could be something above Latin1 in the target
}
else { /* Here nothing above Latin1 can fold to the pattern
character */
- switch (OP(text_node)) {
+ switch (op) {
case EXACTFL: /* /l rules */
c2 = PL_fold_locale[c1];
case EXACTF: /* This node only generated for non-utf8
patterns */
- assert(! is_utf8_pat);
+ assert(! utf8_pat);
if (! utf8_target) { /* /d rules */
c2 = PL_fold[c1];
break;
* EXACTFAA as nothing in Latin1 folds to ASCII */
case EXACTFAA_NO_TRIE: /* This node only generated for
non-utf8 patterns */
- assert(! is_utf8_pat);
+ assert(! utf8_pat);
/* FALLTHROUGH */
case EXACTFAA:
case EXACTFUP:
case EXACTFU:
c2 = PL_fold_latin1[c1];
break;
+ case EXACTFU_REQ8:
+ return FALSE;
+ NOT_REACHED; /* NOTREACHED */
default:
- Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ Perl_croak(aTHX_ "panic: Unexpected op %u", op);
NOT_REACHED; /* NOTREACHED */
}
}
S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
- * between the inputs. See http://www.unicode.org/reports/tr29/. */
+ * between the inputs. See https://www.unicode.org/reports/tr29/. */
PERL_ARGS_ASSERT_ISGCB;
}
while (prev == GCB_Extend);
- return prev != GCB_XPG_XX;
+ return prev != GCB_ExtPict_XX;
}
default:
STATIC GCB_enum
S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- dVAR;
GCB_enum gcb;
PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
STATIC LB_enum
S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
{
- dVAR;
LB_enum lb;
STATIC LB_enum
S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- dVAR;
LB_enum lb;
PERL_ARGS_ASSERT_BACKUP_ONE_LB;
const bool utf8_target)
{
/* returns a boolean indicating if there is a Sentence Boundary Break
- * between the inputs. See http://www.unicode.org/reports/tr29/ */
+ * between the inputs. See https://www.unicode.org/reports/tr29/ */
U8 * lpos = (U8 *) curpos;
bool has_para_sep = FALSE;
STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
{
- dVAR;
SB_enum sb;
PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- dVAR;
SB_enum sb;
PERL_ARGS_ASSERT_BACKUP_ONE_SB;
const bool utf8_target,
const bool skip_Extend_Format)
{
- dVAR;
WB_enum wb;
PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
STATIC WB_enum
S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- dVAR;
WB_enum wb;
PERL_ARGS_ASSERT_BACKUP_ONE_WB;
} while (wb == WB_Extend || wb == WB_Format);
}
- return wb;
-}
+ return wb;
+}
+
+/* Macros for regmatch(), using its internal variables (locinput, nextbyte and
+ * reginfo).
+ *
+ * SET_nextchr reloads nextbyte from the byte at locinput, or stores the
+ * NEXTCHR_EOS sentinel when locinput is at or beyond reginfo->strend.
+ * SET_locinput(p) moves locinput to p and then does SET_nextchr; it expands
+ * to two statements, so it must not be used as the unbraced body of an
+ * if/else. */
+#define NEXTCHR_EOS -10 /* sentinel stored in nextbyte once locinput has fallen off the end */
+#define NEXTCHR_IS_EOS (nextbyte < 0)
+
+#define SET_nextchr \
+ nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
+
+#define SET_locinput(p) \
+ locinput = (p); \
+ SET_nextchr
+
+#define sayYES goto yes
+#define sayNO goto no
+#define sayNO_SILENT goto no_silent
+
+/* CACHEsayNO: if ST.cache_mask is set, first OR it into
+ * reginfo->info_aux->poscache[ST.cache_offset], then fail via sayNO.
+ *
+ * We don't use STMT_START/END here because it leads to
+ * "unreachable code" warnings, which are bogus, but distracting.  As a
+ * consequence this expands to an 'if' statement followed by a bare goto, so
+ * it must not be used as the unbraced body of another if/else. */
+#define CACHEsayNO \
+ if (ST.cache_mask) \
+ reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
+ sayNO
+/* The EVAL_CLOSE_PAREN_* macros read and write st->u.eval.close_paren, which
+ * holds its value biased by one: SET stores (expr) + 1 and CLEAR stores 0, so
+ * a stored 0 means that nothing is set.
+ *
+ * EVAL_CLOSE_PAREN_IS(st, expr) is true when st is non-NULL, something is
+ * set, and the stored value corresponds to expr.  EVAL_CLOSE_PAREN_IS_TRUE()
+ * is the same test but additionally requires expr itself to be non-zero.
+ * These evaluate st (and, for IS_TRUE, expr) more than once, so pass
+ * arguments without side effects. */
+#define EVAL_CLOSE_PAREN_IS(st,expr) \
+( \
+ ( ( st ) ) && \
+ ( ( st )->u.eval.close_paren ) && \
+ ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
+)
+
+#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
+( \
+ ( ( st ) ) && \
+ ( ( st )->u.eval.close_paren ) && \
+ ( ( expr ) ) && \
+ ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
+)
+
+
+#define EVAL_CLOSE_PAREN_SET(st,expr) \
+ (st)->u.eval.close_paren = ( (expr) + 1 )
+
+#define EVAL_CLOSE_PAREN_CLEAR(st) \
+ (st)->u.eval.close_paren = 0
+
+/* push a new state then goto it
+ *
+ * Stashes in pushinput, pusheol and pushsr0 the input position (input),
+ * end-of-match limit (eol) and script-run starting position (sr0) to be used
+ * after the push, points scan at 'node', records 'state' as the resume state
+ * of the current backtrack state (st), and jumps to the push_state label.
+ *
+ * This expands to several statements ending in a goto that carries its own
+ * ';', so it must not be used as the unbraced body of an if/else. */
+
+#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
+ pushinput = input; \
+ pusheol = eol; \
+ pushsr0 = sr0; \
+ scan = node; \
+ st->resume_state = state; \
+ goto push_state;
+
+/* push a new state with success backtracking, then goto it
+ *
+ * Same bookkeeping as PUSH_STATE_GOTO, but jumps to the push_yes_state label
+ * instead of push_state. */
+
+#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
+ pushinput = input; \
+ pusheol = eol; \
+ pushsr0 = sr0; \
+ scan = node; \
+ st->resume_state = state; \
+ goto push_yes_state;
+/* Debugging aid, active only inside DEBUG_STATE_r: dumps the current
+ * execution position, then prints 'pp' followed by the name of st's resume
+ * state taken from PL_reg_name.  The name is followed by a bracketed marker
+ * containing 'Y' when st is the current yes_state and/or 'M' when it is the
+ * current mark_state.  'pp' is pasted into the format string, so it has to be
+ * a string literal.  The expansion already ends in ';'. */
+#define DEBUG_STATE_pp(pp) \
+ DEBUG_STATE_r({ \
+ DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
+ Perl_re_printf( aTHX_ \
+ "%*s" pp " %s%s%s%s%s\n", \
+ INDENT_CHARS(depth), "", \
+ PL_reg_name[st->resume_state], \
+ ((st==yes_state||st==mark_state) ? "[" : ""), \
+ ((st==yes_state) ? "Y" : ""), \
+ ((st==mark_state) ? "M" : ""), \
+ ((st==yes_state||st==mark_state) ? "]" : "") \
+ ); \
+ });
+
+/*
+
+regmatch() - main matching routine
+
+This is basically one big switch statement in a loop. We execute an op,
+set 'next' to point to the next op, and continue. If we come to a point which
+we may need to backtrack to on failure such as (A|B|C), we push a
+backtrack state onto the backtrack stack. On failure, we pop the top
+state, and re-enter the loop at the state indicated. If there are no more
+states to pop, we return failure.
+
+Sometimes we also need to backtrack on success; for example /A+/, where
+after successfully matching one A, we need to go back and try to
+match another one; similarly for lookahead assertions: if the assertion
+completes successfully, we backtrack to the state just before the assertion
+and then carry on. In these cases, the pushed state is marked as
+'backtrack on success too'. This marking is in fact done by a chain of
+pointers, each pointing to the previous 'yes' state. On success, we pop to
+the nearest yes state, discarding any intermediate failure-only states.
+Sometimes a yes state is pushed just to force some cleanup code to be
+called at the end of a successful match or submatch; e.g. (??{$re}) uses
+it to free the inner regex.
+
+Note that failure backtracking rewinds the cursor position, while
+success backtracking leaves it alone.
+
+A pattern is complete when the END op is executed, while a subpattern
+such as (?=foo) is complete when the SUCCESS op is executed. Both of these
+ops trigger the "pop to last yes state if any, otherwise return true"
+behaviour.
+
+A common convention in this function is to use A and B to refer to the two
+subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
+the subpattern to be matched possibly multiple times, while B is the entire
+rest of the pattern. Variable and state names reflect this convention.
+
+The states in the main switch are the union of ops and failure/success of
+substates associated with that op. For example, IFMATCH is the op
+that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
+'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
+successfully matched A and IFMATCH_A_fail is a state saying that we have
+just failed to match A. Resume states always come in pairs. The backtrack
+state we push is marked as 'IFMATCH_A', but when that is popped, we resume
+at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
+on success or failure.
+
+The struct that holds a backtracking state is actually a big union, with
+one variant for each major type of op. The variable st points to the
+top-most backtrack struct. To make the code clearer, within each
+block of code we #define ST to alias the relevant union.
+
+Here's a concrete example of a (vastly oversimplified) IFMATCH
+implementation:
+
+ switch (state) {
+ ....
+
+#define ST st->u.ifmatch
+
+ case IFMATCH: // we are executing the IFMATCH op, (?=A)B
+ ST.foo = ...; // some state we wish to save
+ ...
+ // push a yes backtrack state with a resume value of
+ // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
+ // first node of A:
+ PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput, new_eol, new_sr0);
+ // NOTREACHED
+
+ case IFMATCH_A: // we have successfully executed A; now continue with B
+ next = B;
+ bar = ST.foo; // do something with the preserved value
+ break;
+
+ case IFMATCH_A_fail: // A failed, so the assertion failed
+ ...; // do some housekeeping, then ...
+ sayNO; // propagate the failure
+
+#undef ST
+
+ ...
+ }
+
+For any old-timers reading this who are familiar with the old recursive
+approach, the code above is equivalent to:
+
+ case IFMATCH: // we are executing the IFMATCH op, (?=A)B
+ {
+ int foo = ...
+ ...
+ if (regmatch(A)) {
+ next = B;
+ bar = foo;
+ break;
+ }
+ ...; // do some housekeeping, then ...
+ sayNO; // propagate the failure
+ }
+
+The topmost backtrack state, pointed to by st, is usually free. If you
+want to claim it, populate any ST.foo fields in it with values you wish to
+save, then do one of
-#define EVAL_CLOSE_PAREN_IS(st,expr) \
-( \
- ( ( st ) ) && \
- ( ( st )->u.eval.close_paren ) && \
- ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
-)
+ PUSH_STATE_GOTO(resume_state, node, newinput, new_eol, new_sr0);
+ PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol, new_sr0);
-#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
-( \
- ( ( st ) ) && \
- ( ( st )->u.eval.close_paren ) && \
- ( ( expr ) ) && \
- ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
-)
+which sets that backtrack state's resume value to 'resume_state', pushes a
+new free entry to the top of the backtrack stack, then goes to 'node'.
+On backtracking, the free slot is popped, and the saved state becomes the
+new free state. An ST.foo field in this new top state can be temporarily
+accessed to retrieve values, but once the main loop is re-entered, it
+becomes available for reuse.
+
+Note that the depth of the backtrack stack constantly increases during the
+left-to-right execution of the pattern, rather than going up and down with
+the pattern nesting. For example the stack is at its maximum at Z at the
+end of the pattern, rather than at X in the following:
+ /(((X)+)+)+....(Y)+....Z/
-#define EVAL_CLOSE_PAREN_SET(st,expr) \
- (st)->u.eval.close_paren = ( (expr) + 1 )
+The only exceptions to this are lookahead/behind assertions and the cut,
+(?>A), which pop all the backtrack states associated with A before
+continuing.
-#define EVAL_CLOSE_PAREN_CLEAR(st) \
- (st)->u.eval.close_paren = 0
+Backtrack state structs are allocated in slabs of about 4K in size.
+PL_regmatch_state and st always point to the currently active state,
+and PL_regmatch_slab points to the slab currently containing
+PL_regmatch_state. The first time regmatch() is called, the first slab is
+allocated, and is never freed until interpreter destruction. When the slab
+is full, a new one is allocated and chained to the end. At exit from
+regmatch(), slabs allocated since entry are freed.
+
+In order to work with variable length lookbehinds, an upper limit is placed on
+lookbehinds which is set to where the match position is at the end of where the
+lookbehind would get to. Nothing in the lookbehind should match above that,
+except we should be able to look beyond it for things like \b, which need the
+next character in the string to be able to determine if this is a boundary or
+not. We also can't match the end of string/line unless we are also at the end
+of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
+that match a width, we have to add a condition that they are within the legal
+bounds of our window into the string.
+
+*/
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
{
- dVAR;
const bool utf8_target = reginfo->is_utf8_target;
const U32 uniflags = UTF8_ALLOW_DEFAULT;
REGEXP *rex_sv = reginfo->prog;
SSize_t ln = 0; /* len or last; init to avoid compiler warning */
SSize_t endref = 0; /* offset of end of backref when ln is start */
char *locinput = startpos;
+ char *loceol = reginfo->strend;
char *pushinput; /* where to continue after a PUSH */
- I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
+ char *pusheol; /* where to stop matching (loceol) after a PUSH */
+ U8 *pushsr0; /* save starting pos of script run */
+ PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or
+ NEXTCHR_EOS at EOS */
bool result = 0; /* return value of S_regmatch */
U32 depth = 0; /* depth of backtrack stack */
#endif
#ifdef DEBUGGING
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
#endif
/* protect against undef(*^R) */
st = PL_regmatch_state;
- /* Note that nextchr is a byte even in UTF */
+ /* Note that nextbyte is a byte even in UTF */
SET_nextchr;
scan = prog;
to_complement = 0;
SET_nextchr;
- assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
+ assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
switch (state_num) {
case SBOL: /* /^../ and /\A../ */
/* update the startpoint */
st->u.keeper.val = rex->offs[0].start;
rex->offs[0].start = locinput - reginfo->strbeg;
- PUSH_STATE_GOTO(KEEPS_next, next, locinput);
+ PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case KEEPS_next_fail:
NOT_REACHED; /* NOTREACHED */
case MEOL: /* /..$/m */
- if (!NEXTCHR_IS_EOS && nextchr != '\n')
+ if (!NEXTCHR_IS_EOS && nextbyte != '\n')
sayNO;
break;
case SEOL: /* /..$/ */
- if (!NEXTCHR_IS_EOS && nextchr != '\n')
+ if (!NEXTCHR_IS_EOS && nextbyte != '\n')
sayNO;
if (reginfo->strend - locinput > 1)
sayNO;
break;
case SANY: /* /./s */
- if (NEXTCHR_IS_EOS)
+ if (NEXTCHR_IS_EOS || locinput >= loceol)
sayNO;
goto increment_locinput;
case REG_ANY: /* /./ */
- if ((NEXTCHR_IS_EOS) || nextchr == '\n')
+ if ( NEXTCHR_IS_EOS
+ || locinput >= loceol
+ || nextbyte == '\n')
+ {
sayNO;
+ }
goto increment_locinput;
/* In this case the charclass data is available inline so
we can fail fast without a lot of extra overhead.
*/
- if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
+ if ( ! NEXTCHR_IS_EOS
+ && locinput < loceol
+ && ! ANYOF_BITMAP_TEST(scan, nextbyte))
+ {
DEBUG_EXECUTE_r(
Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
depth, PL_colors[4], PL_colors[5])
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target
&& ! NEXTCHR_IS_EOS
- && UTF8_IS_ABOVE_LATIN1(nextchr)
+ && UTF8_IS_ABOVE_LATIN1(nextbyte)
&& scan->flags == EXACTL)
{
/* We only output for EXACTL, as we let the folder
}
}
if ( trie->bitmap
- && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+ && ( NEXTCHR_IS_EOS
+ || locinput >= loceol
+ || ! TRIE_BITMAP_TEST(trie, nextbyte)))
{
if (trie->states[ state ].wordnum) {
DEBUG_EXECUTE_r(
shortest accept state and the wordnum of the longest
accept state */
- while ( state && uc <= (U8*)(reginfo->strend) ) {
+ while ( state && uc <= (U8*)(loceol) ) {
U32 base = trie->states[ state ].trans.base;
UV uvc = 0;
U16 charid = 0;
});
/* read a char and goto next state */
- if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
+ if ( base && (foldlen || uc < (U8*)(loceol))) {
I32 offset;
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
- (U8 *) reginfo->strend, uscan,
+ (U8 *) loceol, uscan,
len, uvc, charid, foldlen,
foldbuf, uniflags);
charcount++;
while (chars) {
if (utf8_target) {
+ /* XXX This assumes the length is well-formed, as
+ * does the UTF8SKIP below */
uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
uniflags);
uc += len;
});
if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
- PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
+ PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
/* only one choice left - just continue */
}
#undef ST
+ case LEXACT_REQ8:
+ if (! utf8_target) {
+ sayNO;
+ }
+ /* FALLTHROUGH */
+
+ case LEXACT:
+ {
+ char *s;
+
+ s = STRINGl(scan);
+ ln = STR_LENl(scan);
+ goto join_short_long_exact;
+
case EXACTL: /* /abc/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
}
goto do_exact;
- case EXACT_ONLY8:
+ case EXACT_REQ8:
if (! utf8_target) {
sayNO;
}
/* FALLTHROUGH */
- case EXACT: { /* /abc/ */
- char *s;
+
+ case EXACT: /* /abc/ */
do_exact:
- s = STRING(scan);
- ln = STR_LEN(scan);
+ s = STRINGs(scan);
+ ln = STR_LENs(scan);
+
+ join_short_long_exact:
if (utf8_target != is_utf8_pat) {
/* The target and the pattern have differing utf8ness. */
char *l = locinput;
* is an invariant, but there are tests in the test suite
* dealing with (??{...}) which violate this) */
while (s < e) {
- if (l >= reginfo->strend
+ if ( l >= loceol
|| UTF8_IS_ABOVE_LATIN1(* (U8*) l))
{
sayNO;
else {
/* The target is not utf8, the pattern is utf8. */
while (s < e) {
- if (l >= reginfo->strend
+ if ( l >= loceol
|| UTF8_IS_ABOVE_LATIN1(* (U8*) s))
{
sayNO;
else {
/* The target and the pattern have the same utf8ness. */
/* Inline the first character, for speed. */
- if (reginfo->strend - locinput < ln
- || UCHARAT(s) != nextchr
+ if ( loceol - locinput < ln
+ || UCHARAT(s) != nextbyte
|| (ln > 1 && memNE(s, locinput, ln)))
{
sayNO;
fold_array = PL_fold_latin1;
goto do_exactf;
- case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */
+ case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
if (! utf8_target) {
sayNO;
}
fold_utf8_flags = 0;
do_exactf:
- s = STRING(scan);
- ln = STR_LEN(scan);
+ s = STRINGs(scan);
+ ln = STR_LENs(scan);
if ( utf8_target
|| is_utf8_pat
/* Either target or the pattern are utf8, or has the issue where
* the fold lengths may differ. */
const char * const l = locinput;
- char *e = reginfo->strend;
+ char *e = loceol;
if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
s, 0, ln, is_utf8_pat,fold_utf8_flags))
}
/* Neither the target nor the pattern are utf8 */
- if (UCHARAT(s) != nextchr
+ if (UCHARAT(s) != nextbyte
&& !NEXTCHR_IS_EOS
- && UCHARAT(s) != fold_array[nextchr])
+ && UCHARAT(s) != fold_array[nextbyte])
{
sayNO;
}
- if (reginfo->strend - locinput < ln)
+ if (loceol - locinput < ln)
sayNO;
if (ln > 1 && ! folder(locinput, s, ln))
sayNO;
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(scan) != TRADITIONAL_BOUND) {
- if (! IN_UTF8_CTYPE_LOCALE) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
- B_ON_NON_UTF8_LOCALE_IS_WRONG);
- }
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
goto boundu;
}
if (locinput == reginfo->strbeg)
b1 = isWORDCHAR_LC('\n');
else {
- b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
- (U8*)(reginfo->strbeg)),
- (U8*)(reginfo->strend));
+ U8 *p = reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg));
+ b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
}
b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(UCHARAT(locinput - 1));
b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
- : isWORDCHAR_LC(nextchr);
+ : isWORDCHAR_LC(nextbyte);
}
if (to_complement ^ (b1 == b2)) {
sayNO;
: isWORDCHAR_A(UCHARAT(locinput - 1));
b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_A('\n')
- : isWORDCHAR_A(nextchr);
+ : isWORDCHAR_A(nextbyte);
if (to_complement ^ (b1 == b2)) {
sayNO;
}
case TRADITIONAL_BOUND:
{
bool b1, b2;
- b1 = (locinput == reginfo->strbeg)
- ? 0 /* isWORDCHAR_L1('\n') */
- : isWORDCHAR_utf8_safe(
- reghop3((U8*)locinput,
- -1,
- (U8*)(reginfo->strbeg)),
- (U8*) reginfo->strend);
+ if (locinput == reginfo->strbeg) {
+ b1 = 0 /* isWORDCHAR_L1('\n') */;
+ }
+ else {
+ U8 *p = reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg));
+
+ b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
+ }
b2 = (NEXTCHR_IS_EOS)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8_safe((U8*)locinput,
: isWORDCHAR_L1(UCHARAT(locinput - 1));
b2 = (NEXTCHR_IS_EOS)
? 0 /* isWORDCHAR_L1('\n') */
- : isWORDCHAR_L1(nextchr);
+ : isWORDCHAR_L1(nextbyte);
match = cBOOL(b1 != b2);
break;
}
case ANYOFPOSIXL:
case ANYOFL: /* /[abc]/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
- if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
- {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
- }
/* FALLTHROUGH */
case ANYOFD: /* /[abc]/d */
case ANYOF: /* /[abc]/ */
- if (NEXTCHR_IS_EOS)
+ if (NEXTCHR_IS_EOS || locinput >= loceol)
sayNO;
if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
&& ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
locinput++;
}
else {
- if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+ if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
sayNO;
break;
case ANYOFM:
- if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
+ if ( NEXTCHR_IS_EOS
+ || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
+ || locinput >= loceol)
+ {
sayNO;
}
locinput++; /* ANYOFM is always single byte */
break;
case NANYOFM:
- if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)) {
+ if ( NEXTCHR_IS_EOS
+ || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
+ || locinput >= loceol)
+ {
sayNO;
}
goto increment_locinput;
case ANYOFH:
if ( ! utf8_target
|| NEXTCHR_IS_EOS
- || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+ || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFHb:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ANYOF_FLAGS(scan) != (U8) *locinput
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFHr:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
utf8_target))
{
sayNO;
goto increment_locinput;
break;
+ case ANYOFHs:
+ if ( ! utf8_target
+ || NEXTCHR_IS_EOS
+ || loceol - locinput < FLAGS(scan)
+ || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
+ || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
+ utf8_target))
+ {
+ sayNO;
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFR:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ if (utf8_target) {
+ if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
+ || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+ (U8 *) reginfo->strend,
+ NULL),
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ else {
+ if (! withinCOUNT((U8) *locinput,
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ goto increment_locinput;
+ break;
+
+ case ANYOFRb:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ if (utf8_target) {
+ if ( ANYOF_FLAGS(scan) != (U8) *locinput
+ || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+ (U8 *) reginfo->strend,
+ NULL),
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ else {
+ if (! withinCOUNT((U8) *locinput,
+ ANYOFRbase(scan), ANYOFRdelta(scan)))
+ {
+ sayNO;
+ }
+ }
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
case POSIXL: /* \w or [:punct:] etc. under /l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (NEXTCHR_IS_EOS)
+ if (NEXTCHR_IS_EOS || locinput >= loceol)
sayNO;
/* Use isFOO_lc() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
* wouldn't be invariant) */
- if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
- if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
+ if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
sayNO;
}
/* Here is a UTF-8 variant code point below 256 and the target is
* UTF-8 */
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
+ EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
*(locinput + 1))))))
{
sayNO;
case NPOSIXA: /* \W or [:^punct:] etc. under /a */
- if (NEXTCHR_IS_EOS) {
+ if (NEXTCHR_IS_EOS || locinput >= loceol) {
sayNO;
}
/* All UTF-8 variants match */
- if (! UTF8_IS_INVARIANT(nextchr)) {
+ if (! UTF8_IS_INVARIANT(nextbyte)) {
goto increment_locinput;
}
* UTF-8, and also from NPOSIXA even in UTF-8 when the current
* character is a single byte */
- if (NEXTCHR_IS_EOS) {
+ if (NEXTCHR_IS_EOS || locinput >= loceol) {
sayNO;
}
join_nposixa:
- if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
+ if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
FLAGS(scan)))))
{
sayNO;
case POSIXU: /* \w or [:punct:] etc. under /u */
utf8_posix:
- if (NEXTCHR_IS_EOS) {
+ if (NEXTCHR_IS_EOS || locinput >= loceol) {
sayNO;
}
/* Use _generic_isCC() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
* wouldn't be invariant) */
- if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
- if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
+ if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
FLAGS(scan)))))
{
sayNO;
}
else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
*(locinput + 1)),
FLAGS(scan)))))
{
}
break;
}
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
}
break;
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
- if (NEXTCHR_IS_EOS)
+ if (NEXTCHR_IS_EOS || locinput >= loceol)
sayNO;
if (! utf8_target) {
/* Match either CR LF or '.', as all the other possibilities
* require utf8 */
locinput++; /* Match the . or CR */
- if (nextchr == '\r' /* And if it was CR, and the next is LF,
+ if (nextbyte == '\r' /* And if it was CR, and the next is LF,
match the LF */
- && locinput < reginfo->strend
+ && locinput < loceol
&& UCHARAT(locinput) == '\n')
{
locinput++;
* current character. (There is always a break at the
* end-of-input) */
locinput += UTF8SKIP(locinput);
- while (locinput < reginfo->strend) {
+ while (locinput < loceol) {
GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
(U8*) reginfo->strend);
if (isGCB(prev_gcb, cur_gcb,
}
break;
- case NREFFL: /* /\g{name}/il */
+ case REFFLN: /* /\g{name}/il */
{ /* The capture buffer cases. The ones beginning with N for the
named buffers just convert to the equivalent numbered and
pretend they were called as the corresponding numbered buffer
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_nref;
- case NREFFA: /* /\g{name}/iaa */
+ case REFFAN: /* /\g{name}/iaa */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
type = REFFA;
utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
goto do_nref;
- case NREFFU: /* /\g{name}/iu */
+ case REFFUN: /* /\g{name}/iu */
folder = foldEQ_latin1;
fold_array = PL_fold_latin1;
type = REFFU;
utf8_fold_flags = 0;
goto do_nref;
- case NREFF: /* /\g{name}/i */
+ case REFFN: /* /\g{name}/i */
folder = foldEQ;
fold_array = PL_fold;
type = REFF;
utf8_fold_flags = 0;
goto do_nref;
- case NREF: /* /\g{name}/ */
+ case REFN: /* /\g{name}/ */
type = REF;
folder = NULL;
fold_array = NULL;
if (type != REF /* REF can do byte comparison */
&& (utf8_target || type == REFFU || type == REFFL))
{
- char * limit = reginfo->strend;
+ char * limit = loceol;
/* This call case insensitively compares the entire buffer
* at s, with the current input starting at locinput, but
- * not going off the end given by reginfo->strend, and
+ * not going off the end given by loceol, and
* returns in <limit> upon success, how much of the
* current input was matched */
if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
}
/* Not utf8: Inline the first character, for speed. */
- if (!NEXTCHR_IS_EOS &&
- UCHARAT(s) != nextchr &&
- (type == REF ||
- UCHARAT(s) != fold_array[nextchr]))
+ if ( ! NEXTCHR_IS_EOS
+ && locinput < loceol
+ && UCHARAT(s) != nextbyte
+ && ( type == REF
+ || UCHARAT(s) != fold_array[nextbyte]))
+ {
sayNO;
+ }
ln = endref - ln;
- if (locinput + ln > reginfo->strend)
+ if (locinput + ln > loceol)
sayNO;
if (ln > 1 && (type == REF
? memNE(s, locinput, ln)
rex->recurse_locinput[arg]= locinput;
DEBUG_r({
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_STACK_r({
Perl_re_exec_indentf( aTHX_
"entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
/* NOTREACHED */
case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
- if (cur_eval && cur_eval->locinput==locinput) {
+ if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
if ( ++nochange_depth > max_nochange_depth )
Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
} else {
PL_curpm = PL_reg_curpm;
if (logical != 2) {
- PUSH_STATE_GOTO(EVAL_B, next, locinput);
+ PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
+ script_run_begin);
/* NOTREACHED */
}
}
ST.prev_eval = cur_eval;
cur_eval = st;
/* now continue from first node in postponed RE */
- PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
+ PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
+ loceol, script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
break;
- case NGROUPP: /* (?(<name>)) */
+ case GROUPPN: /* (?(<name>)) */
/* reg_check_named_buff_matched returns 0 for no match */
sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
break;
ST.count = -1; /* this will be updated by WHILEM */
ST.lastloc = NULL; /* this will be updated by WHILEM */
- PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
+ PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
cur_curlyx->u.curlyx.lastloc = locinput;
REGCP_SET(ST.lastcp);
- PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
+ PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
ST.save_curlyx = cur_curlyx;
cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
- locinput);
+ locinput, loceol, script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
maxopenparen);
cur_curlyx->u.curlyx.lastloc = locinput;
REGCP_SET(ST.lastcp);
- PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
+ PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
goto do_whilem_B_max;
ST.save_curlyx = cur_curlyx;
cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
- locinput);
+ locinput, loceol, script_run_begin);
NOT_REACHED; /* NOTREACHED */
case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_min,
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
- locinput);
+ locinput, loceol, script_run_begin);
NOT_REACHED; /* NOTREACHED */
#undef ST
/* Now go into the branch */
if (has_cutgroup) {
- PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
+ PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+ script_run_begin);
} else {
- PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
+ PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
+ script_run_begin);
}
NOT_REACHED; /* NOTREACHED */
sv_yes_mark = st->u.mark.mark_name = scan->flags
? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
: NULL;
- PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
+ PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case CUTGROUP_next_fail:
goto curlym_do_B;
curlym_do_A: /* execute the A in /A{m,n}B/ */
- PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
+ PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case CURLYM_A: /* we've just matched an A */
depth, (IV)ST.count)
);
if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
- if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
- if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
- && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+ if (! UTF8_IS_INVARIANT(nextbyte) && utf8_target) {
+
+ /* (We can use memEQ and memNE in this file without
+ * having to worry about one being shorter than the
+ * other, since the first byte of each gives the
+ * length of the character) */
+ if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput,
+ reginfo->strend))
+ && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput,
+ reginfo->strend)))
{
/* simulate B failing */
DEBUG_OPTIMISE_r(
goto reenter_switch;
}
}
- else if (nextchr != ST.c1 && nextchr != ST.c2) {
+ else if (nextbyte != ST.c1 && nextbyte != ST.c2) {
/* simulate B failing */
DEBUG_OPTIMISE_r(
Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
depth,
- (int) nextchr, ST.c1, ST.c2)
+ (int) nextbyte, ST.c1, ST.c2)
);
state_num = CURLYM_B_fail;
goto reenter_switch;
}
}
- PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
+ PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case CURLYM_B_fail: /* just failed to match a B */
if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
{
char *li = locinput;
- if (!regrepeat(rex, &li, scan, reginfo, 1))
+ if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
sayNO;
SET_locinput(li);
goto fake_end;
char *li = locinput;
minmod = 0;
if (ST.min &&
- regrepeat(rex, &li, ST.A, reginfo, ST.min)
+ regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
< ST.min)
sayNO;
SET_locinput(li);
/* set ST.maxpos to the furthest point along the
* string that could possibly match */
if (ST.max == REG_INFTY) {
- ST.maxpos = reginfo->strend - 1;
+ ST.maxpos = loceol - 1;
if (utf8_target)
while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
ST.maxpos--;
else if (utf8_target) {
int m = ST.max - ST.min;
for (ST.maxpos = locinput;
- m >0 && ST.maxpos < reginfo->strend; m--)
+ m >0 && ST.maxpos < loceol; m--)
ST.maxpos += UTF8SKIP(ST.maxpos);
}
else {
ST.maxpos = locinput + ST.max - ST.min;
- if (ST.maxpos >= reginfo->strend)
- ST.maxpos = reginfo->strend - 1;
+ if (ST.maxpos >= loceol)
+ ST.maxpos = loceol - 1;
}
goto curly_try_B_min_known;
/* avoid taking address of locinput, so it can remain
* a register var */
char *li = locinput;
- ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
+ ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
if (ST.count < ST.min)
sayNO;
SET_locinput(li);
if (ST.c1 == CHRTEST_VOID) {
/* failed -- move forward one */
char *li = locinput;
- if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
+ if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
sayNO;
}
locinput = li;
n = (ST.oldloc == locinput) ? 0 : 1;
if (ST.c1 == ST.c2) {
/* set n to utf8_distance(oldloc, locinput) */
- while (locinput <= ST.maxpos
- && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+ while ( locinput <= ST.maxpos
+ && locinput < loceol
+ && memNE(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend)))
{
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput,
+ reginfo->strend);
n++;
}
}
else {
/* set n to utf8_distance(oldloc, locinput) */
- while (locinput <= ST.maxpos
- && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
- && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+ while ( locinput <= ST.maxpos
+ && locinput < loceol
+ && memNE(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend))
+ && memNE(locinput, ST.c2_utf8,
+ UTF8_SAFE_SKIP(locinput, reginfo->strend)))
{
- locinput += UTF8SKIP(locinput);
+ locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
n++;
}
}
* locinput matches */
char *li = ST.oldloc;
ST.count += n;
- if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
+ if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
sayNO;
assert(n == REG_INFTY || locinput == li);
}
curly_try_B_min:
CURLY_SETPAREN(ST.paren, ST.count);
- PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
curly_try_B_max:
/* a successful greedy match: now try to match B */
{
- bool could_match = locinput < reginfo->strend;
+ bool could_match = locinput < loceol;
/* If it could work, try it. */
if (ST.c1 != CHRTEST_VOID && could_match) {
if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
{
- could_match = memEQ(locinput,
- ST.c1_utf8,
- UTF8SKIP(locinput))
- || memEQ(locinput,
- ST.c2_utf8,
- UTF8SKIP(locinput));
+ could_match = memEQ(locinput, ST.c1_utf8,
+ UTF8_SAFE_SKIP(locinput,
+ reginfo->strend))
+ || memEQ(locinput, ST.c2_utf8,
+ UTF8_SAFE_SKIP(locinput,
+ reginfo->strend));
}
else {
- could_match = UCHARAT(locinput) == ST.c1
- || UCHARAT(locinput) == ST.c2;
+ could_match = UCHARAT(locinput) == ST.c1
+ || UCHARAT(locinput) == ST.c2;
}
}
if (ST.c1 == CHRTEST_VOID || could_match) {
CURLY_SETPAREN(ST.paren, ST.count);
- PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
+ PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
}
}
SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
- PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
- locinput); /* match B */
+ PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
+ st->u.eval.prev_eval->u.eval.B,
+ locinput, loceol, script_run_begin);
}
if (locinput < reginfo->till) {
#undef ST
#define ST st->u.ifmatch
- {
- char *newstart;
-
case SUSPEND: /* (?>A) */
ST.wanted = 1;
- newstart = locinput;
+ ST.start = locinput;
+ ST.end = loceol;
+ ST.count = 1;
goto do_ifmatch;
- case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
+ case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
ST.wanted = 0;
goto ifmatch_trivial_fail_test;
- case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
+ case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
ST.wanted = 1;
ifmatch_trivial_fail_test:
- if (scan->flags) {
- char * const s = HOPBACKc(locinput, scan->flags);
- if (!s) {
- /* trivial fail */
- if (logical) {
- logical = 0;
- sw = 1 - cBOOL(ST.wanted);
- }
- else if (ST.wanted)
- sayNO;
- next = scan + ARG(scan);
- if (next == scan)
- next = NULL;
- break;
- }
- newstart = s;
+ ST.count = scan->next_off + 1; /* next_off repurposed to be
+ lookbehind count, requires
+ non-zero flags */
+ if (! scan->flags) { /* 'flags' zero means lookahead */
+
+ /* Lookahead starts here and ends at the normal place */
+ ST.start = locinput;
+ ST.end = loceol;
+ }
+ else {
+ PERL_UINT_FAST8_T back_count = scan->flags;
+ char * s;
+
+ /* Lookbehind can look beyond the current position */
+ ST.end = loceol;
+
+ /* ... and starts at the first place in the input that is in
+ * the range of the possible start positions */
+ for (; ST.count > 0; ST.count--, back_count--) {
+ s = HOPBACKc(locinput, back_count);
+ if (s) {
+ ST.start = s;
+ goto do_ifmatch;
+ }
+ }
+
+ /* If the lookbehind doesn't start in the actual string, it is a
+ * trivial match failure */
+ if (logical) {
+ logical = 0;
+ sw = 1 - cBOOL(ST.wanted);
+ }
+ else if (ST.wanted)
+ sayNO;
+
+ /* Here, we didn't want it to match, so this is actually a success */
+ next = scan + ARG(scan);
+ if (next == scan)
+ next = NULL;
+ break;
}
- else
- newstart = locinput;
do_ifmatch:
ST.me = scan;
logical = 0; /* XXX: reset state of logical once it has been saved into ST */
/* execute body of (?...A) */
- PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
+ PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
+ ST.end, script_run_begin);
NOT_REACHED; /* NOTREACHED */
- }
+
+ {
+ bool matched;
case IFMATCH_A_fail: /* body of (?...A) failed */
- ST.wanted = !ST.wanted;
- /* FALLTHROUGH */
+ if (! ST.logical && ST.count > 1) {
+
+ /* It isn't a real failure until we've tried all starting
+ * positions. Move to the next starting position and retry */
+ ST.count--;
+ ST.start = HOPc(ST.start, 1);
+ scan = ST.me;
+ logical = ST.logical;
+ goto do_ifmatch;
+ }
+
+ /* Here, all starting positions have been tried. */
+ matched = FALSE;
+ goto ifmatch_done;
case IFMATCH_A: /* body of (?...A) succeeded */
- if (ST.logical) {
- sw = cBOOL(ST.wanted);
- }
- else if (!ST.wanted)
- sayNO;
+ matched = TRUE;
+ ifmatch_done:
+ sw = matched == ST.wanted;
+ if (! ST.logical && !sw) {
+ sayNO;
+ }
if (OP(ST.me) != SUSPEND) {
/* restore old position except for (?>...) */
locinput = st->locinput;
+ loceol = st->loceol;
+ script_run_begin = st->sr0;
}
scan = ST.me + ARG(ST.me);
if (scan == ST.me)
scan = NULL;
continue; /* execute B */
+ }
#undef ST
break;
case COMMIT: /* (*COMMIT) */
- reginfo->cutpoint = reginfo->strend;
+ reginfo->cutpoint = loceol;
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
if (scan->flags)
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
- PUSH_STATE_GOTO(COMMIT_next, next, locinput);
+ PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case COMMIT_next_fail:
= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
mark_state = st;
ST.mark_loc = locinput;
- PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
+ PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
+ script_run_begin);
NOT_REACHED; /* NOTREACHED */
case MARKPOINT_next:
/* (*SKIP) : if we fail we cut here*/
ST.mark_name = NULL;
ST.mark_loc = locinput;
- PUSH_STATE_GOTO(SKIP_next,next, locinput);
+ PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
+ script_run_begin);
} else {
/* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
otherwise do nothing. Meaning we need to scan
find ) )
{
ST.mark_name = find;
- PUSH_STATE_GOTO( SKIP_next, next, locinput);
+ PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
+ script_run_begin);
}
cur = cur->u.mark.prev_mark;
}
#undef ST
case LNBREAK: /* \R */
- if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
+ if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
locinput += n;
} else
sayNO;
increment_locinput:
assert(!NEXTCHR_IS_EOS);
if (utf8_target) {
- locinput += PL_utf8skip[nextchr];
+ locinput += PL_utf8skip[nextbyte];
/* locinput is allowed to go 1 char off the end (signifying
* EOS), but not 2+ */
- if (locinput > reginfo->strend)
+ if (locinput > loceol)
sayNO;
}
else
/* push a new regex state, then continue at scan */
{
regmatch_state *newst;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
- DEBUG_STACK_r({
+ DEBUG_r( /* DEBUG_STACK_r */
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
regmatch_state *cur = st;
regmatch_state *curyes = yes_state;
U32 i;
if (curyes == cur)
curyes = cur->u.yes.prev_yes_state;
}
- } else
+ } else {
DEBUG_STATE_pp("push")
- );
+ });
depth++;
st->locinput = locinput;
+ st->loceol = loceol;
+ st->sr0 = script_run_begin;
newst = st+1;
if (newst > SLAB_LAST(PL_regmatch_slab))
newst = S_push_slab(aTHX);
PL_regmatch_state = newst;
locinput = pushinput;
+ loceol = pusheol;
+ script_run_begin = pushsr0;
st = newst;
continue;
/* NOTREACHED */
yes_state = st->u.yes.prev_yes_state;
PL_regmatch_state = st;
- if (no_final)
+ if (no_final) {
locinput= st->locinput;
+ loceol= st->loceol;
+ script_run_begin = st->sr0;
+ }
state_num = st->resume_state + no_final;
goto reenter_switch;
}
}
PL_regmatch_state = st;
locinput= st->locinput;
+ loceol= st->loceol;
+ script_run_begin = st->sr0;
DEBUG_STATE_pp("pop");
depth--;
* What 'simple' means is a node which can be the operand of a quantifier like
* '+', or {1,3}
*
- * startposp - pointer a pointer to the start position. This is updated
+ * startposp - pointer to a pointer to the start position. This is updated
* to point to the byte following the highest successful
* match.
* p - the regnode to be repeatedly matched against.
- * reginfo - struct holding match state, such as strend
+ * loceol - pointer to the end position beyond which we aren't supposed to
+ * look.
+ * reginfo - struct holding match state, such as utf8_target
* max - maximum number of things to match.
* depth - (for debugging) backtracking depth.
*/
STATIC I32
S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
- regmatch_info *const reginfo, I32 max _pDEPTH)
+ char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
{
- dVAR;
char *scan; /* Pointer to current position in target string */
I32 c;
- char *loceol = reginfo->strend; /* local version */
+ char *this_eol = loceol; /* potentially adjusted version. */
I32 hardcount = 0; /* How many matches so far */
bool utf8_target = reginfo->is_utf8_target;
unsigned int to_complement = 0; /* Invert the result? */
PERL_ARGS_ASSERT_REGREPEAT;
+ /* This routine is structured so that we switch on the input OP. Each OP
+ * case: statement contains a loop to repeatedly apply the OP, advancing
+ * the input until it fails, or reaches the end of the input, or until it
+ * reaches the upper limit of matches. */
+
scan = *startposp;
- if (max == REG_INFTY)
+ if (max == REG_INFTY) /* This is a special marker to go to the platform's
+ max */
max = I32_MAX;
- else if (! utf8_target && loceol - scan > max)
- loceol = scan + max;
+ else if (! utf8_target && this_eol - scan > max)
+ this_eol = scan + max;
- /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
+ /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
* to the maximum of how far we should go in it (leaving it set to the real
* end, if the maximum permissible would take us beyond that). This allows
- * us to make the loop exit condition that we haven't gone past <loceol> to
+ * us to make the loop exit condition that we haven't gone past <this_eol> to
* also mean that we haven't exceeded the max permissible count, saving a
- * test each time through the loop. But it assumes that the OP matches a
+ * test each time through the loops. But it assumes that the OP matches a
* single byte, which is true for most of the OPs below when applied to a
* non-UTF-8 target. Those relatively few OPs that don't have this
* characteristic will have to compensate.
* There is no adjustment for UTF-8 targets, as the number of bytes per
* character varies. OPs will have to test both that the count is less
* than the max permissible (using <hardcount> to keep track), and that we
- * are still within the bounds of the string (using <loceol>. A few OPs
+ * are still within the bounds of the string (using <this_eol>). A few OPs
* match a single byte no matter what the encoding. They can omit the max
* test if, for the UTF-8 case, they do the adjustment that was skipped
* above.
*
* Thus, the code above sets things up for the common case; and exceptional
* cases need extra work; the common case is to make sure <scan> doesn't
- * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
+ * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
* count doesn't exceed the maximum permissible */
switch (OP(p)) {
case REG_ANY:
if (utf8_target) {
- while (scan < loceol && hardcount < max && *scan != '\n') {
+ while (scan < this_eol && hardcount < max && *scan != '\n') {
scan += UTF8SKIP(scan);
hardcount++;
}
} else {
- scan = (char *) memchr(scan, '\n', loceol - scan);
+ scan = (char *) memchr(scan, '\n', this_eol - scan);
if (! scan) {
- scan = loceol;
+ scan = this_eol;
}
}
break;
case SANY:
if (utf8_target) {
- while (scan < loceol && hardcount < max) {
+ while (scan < this_eol && hardcount < max) {
scan += UTF8SKIP(scan);
hardcount++;
}
}
else
- scan = loceol;
+ scan = this_eol;
break;
+
+ case LEXACT_REQ8:
+ if (! utf8_target) {
+ break;
+ }
+ /* FALLTHROUGH */
+
+ case LEXACT:
+ {
+ U8 * string;
+ Size_t str_len;
+
+ string = (U8 *) STRINGl(p);
+ str_len = STR_LENl(p);
+ goto join_short_long_exact;
+
case EXACTL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
}
goto do_exact;
- case EXACT_ONLY8:
+ case EXACT_REQ8:
if (! utf8_target) {
break;
}
/* FALLTHROUGH */
case EXACT:
do_exact:
- assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
+ string = (U8 *) STRINGs(p);
+ str_len = STR_LENs(p);
- c = (U8)*STRING(p);
+ join_short_long_exact:
+ assert(str_len == reginfo->is_utf8_pat ? UTF8SKIP(string) : 1);
+
+ c = *string;
/* Can use a simple find if the pattern char to match on is invariant
* under UTF-8, or both target and pattern aren't UTF-8. Note that we
* can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
* true iff it doesn't matter if the argument is in UTF-8 or not */
if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
- if (utf8_target && loceol - scan > max) {
- /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+ if (utf8_target && this_eol - scan > max) {
+ /* We didn't adjust <this_eol> because is UTF-8, but ok to do so,
* since here, to match at all, 1 char == 1 byte */
- loceol = scan + max;
+ this_eol = scan + max;
}
- scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
+ scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
}
else if (reginfo->is_utf8_pat) {
if (utf8_target) {
/* When both target and pattern are UTF-8, we have to do
* string EQ */
while (hardcount < max
- && scan < loceol
- && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
- && memEQ(scan, STRING(p), scan_char_len))
+ && scan < this_eol
+ && (scan_char_len = UTF8SKIP(scan)) <= str_len
+ && memEQ(scan, string, scan_char_len))
{
scan += scan_char_len;
hardcount++;
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple find */
- c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
- scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(string + 1));
+ scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c);
} /* else pattern char is above Latin1, can't possibly match the
non-UTF-8 target */
}
U8 low = UTF8_TWO_BYTE_LO(c);
while (hardcount < max
- && scan + 1 < loceol
+ && scan + 1 < this_eol
&& UCHARAT(scan) == high
&& UCHARAT(scan + 1) == low)
{
}
}
break;
+ }
case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! reginfo->is_utf8_pat);
| FOLDEQ_S2_FOLDS_SANE;
goto do_exactf;
- case EXACTFU_ONLY8:
+ case EXACTFU_REQ8:
if (! utf8_target) {
break;
}
int c1, c2;
U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
- assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
+ assert(STR_LENs(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1);
if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
reginfo))
{
if (c1 == CHRTEST_VOID) {
/* Use full Unicode fold matching */
- char *tmpeol = reginfo->strend;
- STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
+ char *tmpeol = loceol;
+ STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRINGs(p)) : 1;
while (hardcount < max
&& foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
- STRING(p), NULL, pat_len,
+ STRINGs(p), NULL, pat_len,
reginfo->is_utf8_pat, utf8_flags))
{
scan = tmpeol;
- tmpeol = reginfo->strend;
+ tmpeol = loceol;
hardcount++;
}
}
else if (utf8_target) {
if (c1 == c2) {
- while (scan < loceol
+ while (scan < this_eol
&& hardcount < max
- && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+ && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+ loceol)))
{
- scan += UTF8SKIP(scan);
+ scan += UTF8SKIP(c1_utf8);
hardcount++;
}
}
else {
- while (scan < loceol
+ while (scan < this_eol
&& hardcount < max
- && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
- || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+ && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan,
+ loceol))
+ || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan,
+ loceol))))
{
- scan += UTF8SKIP(scan);
+ scan += UTF8_SAFE_SKIP(scan, loceol);
hardcount++;
}
}
}
else if (c1 == c2) {
- scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
+ scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1);
}
else {
/* See comments in regmatch() CURLY_B_min_known_fail. We avoid
U8 c1_c2_mask = ~ c1_c2_bits_differing;
scan = (char *) find_span_end_mask((U8 *) scan,
- (U8 *) loceol,
+ (U8 *) this_eol,
c1 & c1_c2_mask,
c1_c2_mask);
}
else {
- while ( scan < loceol
+ while ( scan < this_eol
&& (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
{
scan++;
case ANYOFPOSIXL:
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
- if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
- }
/* FALLTHROUGH */
case ANYOFD:
case ANYOF:
if (utf8_target) {
while (hardcount < max
- && scan < loceol
- && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
+ && scan < this_eol
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
{
scan += UTF8SKIP(scan);
hardcount++;
}
}
else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
- while (scan < loceol
+ while (scan < this_eol
&& reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
scan++;
}
else {
- while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
+ while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
scan++;
}
break;
case ANYOFM:
- if (utf8_target && loceol - scan > max) {
+ if (utf8_target && this_eol - scan > max) {
- /* We didn't adjust <loceol> at the beginning of this routine
+ /* We didn't adjust <this_eol> at the beginning of this routine
* because is UTF-8, but it is actually ok to do so, since here, to
* match, 1 char == 1 byte. */
- loceol = scan + max;
+ this_eol = scan + max;
}
- scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
+ scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
break;
case NANYOFM:
if (utf8_target) {
while ( hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (*scan & FLAGS(p)) != ARG(p))
{
scan += UTF8SKIP(scan);
}
}
else {
- scan = (char *) find_next_masked((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
+ scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
}
break;
case ANYOFH:
- if (utf8_target) while ( hardcount < max
- && scan < loceol
- && reginclass(prog, p, (U8*)scan, (U8*) loceol,
- TRUE))
- {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
+ while ( hardcount < max
+ && scan < this_eol
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFHb:
+ if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */
+
+ /* we know the first byte must be the FLAGS field */
+ while ( hardcount < max
+ && scan < this_eol
+ && (U8) *scan == ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
+ TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFHr:
+ if (utf8_target) { /* ANYOFHr only can match UTF-8 targets */
+ while ( hardcount < max
+ && scan < this_eol
+ && inRANGE(NATIVE_UTF8_TO_I8(*scan),
+ LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
+ HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFHs:
+ if (utf8_target) { /* ANYOFHs only can match UTF-8 targets */
+ while ( hardcount < max
+ && scan + FLAGS(p) < this_eol
+ && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
+ && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFR:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < this_eol
+ && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+ (U8 *) this_eol,
+ NULL),
+ ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while ( hardcount < max
+ && scan < this_eol
+ && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan++;
+ hardcount++;
+ }
+ }
+ break;
+
+ case ANYOFRb:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < this_eol
+ && (U8) *scan == ANYOF_FLAGS(p)
+ && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+ (U8 *) this_eol,
+ NULL),
+ ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while ( hardcount < max
+ && scan < this_eol
+ && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+ {
+ scan++;
+ hardcount++;
+ }
}
break;
case POSIXL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (! utf8_target) {
- while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
+ while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
{
scan++;
}
} else {
- while (hardcount < max && scan < loceol
+ while (hardcount < max && scan < this_eol
&& to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
(U8 *) scan,
- (U8 *) loceol)))
+ (U8 *) this_eol)))
{
scan += UTF8SKIP(scan);
hardcount++;
/* FALLTHROUGH */
case POSIXA:
- if (utf8_target && loceol - scan > max) {
+ if (utf8_target && this_eol - scan > max) {
- /* We didn't adjust <loceol> at the beginning of this routine
+ /* We didn't adjust <this_eol> at the beginning of this routine
* because is UTF-8, but it is actually ok to do so, since here, to
* match, 1 char == 1 byte. */
- loceol = scan + max;
+ this_eol = scan + max;
}
- while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
scan++;
}
break;
case NPOSIXA:
if (! utf8_target) {
- while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+ while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
scan++;
}
}
/* The complement of something that matches only ASCII matches all
* non-ASCII, plus everything in ASCII that isn't in the class. */
- while (hardcount < max && scan < loceol
- && ( ! isASCII_utf8_safe(scan, reginfo->strend)
+ while (hardcount < max && scan < this_eol
+ && ( ! isASCII_utf8_safe(scan, loceol)
|| ! _generic_isCC_A((U8) *scan, FLAGS(p))))
{
scan += UTF8SKIP(scan);
case POSIXU:
if (! utf8_target) {
- while (scan < loceol && to_complement
+ while (scan < this_eol && to_complement
^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
{
scan++;
classnum = (_char_class_number) FLAGS(p);
switch (classnum) {
default:
- while ( hardcount < max && scan < loceol
+ while ( hardcount < max && scan < this_eol
&& to_complement ^ cBOOL(_invlist_contains_cp(
PL_XPosix_ptrs[classnum],
utf8_to_uvchr_buf((U8 *) scan,
- (U8 *) loceol,
+ (U8 *) this_eol,
NULL))))
{
scan += UTF8SKIP(scan);
case _CC_ENUM_SPACE:
while (hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (to_complement
- ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
+ ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
{
scan += UTF8SKIP(scan);
hardcount++;
break;
case _CC_ENUM_BLANK:
while (hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (to_complement
- ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
+ ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
{
scan += UTF8SKIP(scan);
hardcount++;
break;
case _CC_ENUM_XDIGIT:
while (hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (to_complement
- ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
+ ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
{
scan += UTF8SKIP(scan);
hardcount++;
break;
case _CC_ENUM_VERTSPACE:
while (hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (to_complement
- ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
+ ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
{
scan += UTF8SKIP(scan);
hardcount++;
break;
case _CC_ENUM_CNTRL:
while (hardcount < max
- && scan < loceol
+ && scan < this_eol
&& (to_complement
- ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
+ ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
{
scan += UTF8SKIP(scan);
hardcount++;
case LNBREAK:
if (utf8_target) {
- while (hardcount < max && scan < loceol &&
- (c=is_LNBREAK_utf8_safe(scan, loceol))) {
+ while (hardcount < max && scan < this_eol &&
+ (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
scan += c;
hardcount++;
}
} else {
/* LNBREAK can match one or two latin chars, which is ok, but we
* have to use hardcount in this situation, and throw away the
- * adjustment to <loceol> done before the switch statement */
- loceol = reginfo->strend;
+ * adjustment to <this_eol> done before the switch statement */
while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
scan+=c;
hardcount++;
}
break;
- case BOUNDL:
- case NBOUNDL:
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- /* FALLTHROUGH */
- case BOUND:
- case BOUNDA:
- case BOUNDU:
- case EOS:
- case GPOS:
- case KEEPS:
- case NBOUND:
- case NBOUNDA:
- case NBOUNDU:
- case OPFAIL:
- case SBOL:
- case SEOL:
- /* These are all 0 width, so match right here or not at all. */
- break;
-
default:
Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
NOT_REACHED; /* NOTREACHED */
*startposp = scan;
DEBUG_r({
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
regprop(prog, prop, p, reginfo, NULL);
STATIC bool
S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
{
- dVAR;
- const char flags = ANYOF_FLAGS(n);
+ const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
+ ? 0
+ : ANYOF_FLAGS(n);
bool match = FALSE;
UV c = *p;
}
/* If this character is potentially in the bitmap, check it */
- if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
+ if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if ((flags
}
else if (flags & ANYOF_LOCALE_FLAGS) {
if ( (flags & ANYOFL_FOLD)
- && c < sizeof(PL_fold_locale)
+ && c < 256
&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
{
match = TRUE;
&& IN_UTF8_CTYPE_LOCALE)))
{
SV* only_utf8_locale = NULL;
- SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
- 0, &only_utf8_locale, NULL);
+ SV * const definition =
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+ get_regclass_nonbitmap_data(prog, n, TRUE, 0,
+ &only_utf8_locale, NULL);
+#else
+ get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
+ &only_utf8_locale, NULL);
+#endif
if (definition) {
U8 utf8_buffer[2];
U8 * utf8_p;
regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
eval_state->rex = rex;
+ eval_state->sv = reginfo->sv;
if (reginfo->sv) {
/* Make $_ available to executed code. */
SAVE_DEFSV;
DEFSV_set(reginfo->sv);
}
+ /* will be dec'd by S_cleanup_regmatch_info_aux */
+ SvREFCNT_inc_NN(reginfo->sv);
if (!(mg = mg_find_mglob(reginfo->sv))) {
/* prepare for quick setting of pos */
/* this regexp is also owned by the new PL_reg_curpm, which
will try to free it. */
av_push(PL_regex_padav, repointer);
- PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
+ PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
PL_regex_pad = AvARRAY(PL_regex_padav);
}
#endif
}
PL_curpm = eval_state->curpm;
+ SvREFCNT_dec(eval_state->sv);
}
PL_regmatch_state = aux->old_regmatch_state;
&& !prog->substrs->data[i].substr) {
SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
if (! sv_utf8_downgrade(sv, TRUE)) {
+ SvREFCNT_dec_NN(sv);
return FALSE;
}
if (SvVALID(prog->substrs->data[i].utf8_substr)) {
#ifndef PERL_IN_XSUB_RE
bool
-Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
+Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
{
/* Temporary helper function for toke.c. Verify that the code point 'cp'
* is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
* the larger string bounded by 'strbeg' and 'strend'.
*
- * 'cp' needs to be assigned (if not a future version of the Unicode
+ * 'cp' needs to be assigned (if not, a future version of the Unicode
* Standard could make it something that combines with adjacent characters,
* so code using it would then break), and there has to be a GCB break
* before and after the character. */
- dVAR;
GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
const U8 * prev_cp_start;
- PERL_ARGS_ASSERT__IS_GRAPHEME;
+ PERL_ARGS_ASSERT_IS_GRAPHEME;
if ( UNLIKELY(UNICODE_IS_SUPER(cp))
|| UNLIKELY(UNICODE_IS_NONCHAR(cp)))
}
/*
-=head1 Unicode Support
+=for apidoc_section $unicode
=for apidoc isSCRIPT_RUN
For example, if all the characters in the sequence are Greek, or Common, or
Inherited, this function will return TRUE, provided any decimal digits in it
-are the ASCII digits "0".."9". For scripts (unlike Greek) that have their own
-digits defined this will accept either digits from that set or from 0..9, but
-not a combination of the two. Some scripts, such as Arabic, have more than one
-set of digits. All digits must come from the same set for this function to
-return TRUE.
+are from the same block of digits in Common. (These are the ASCII digits
+"0".."9" and additionally a block for full width forms of these, and several
+others used in mathematical notation.) For scripts (unlike Greek) that have
+their own digits defined this will accept either digits from that set or from
+one of the Common digit sets, but not a combination of the two. Some scripts,
+such as Arabic, have more than one set of digits. All digits must come from
+the same set for this function to return TRUE.
C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
contain the script found, using the C<SCX_enum> typedef. Its value will be
* characters for at least one language in the Unicode Common Locale Data
* Repository [CLDR]. */
- dVAR;
/* Things that match /\d/u */
SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
UV cp;
/* The code allows all scripts to use the ASCII digits. This is
- * because they are used in commerce even in scripts that have their
- * own set. Hence any ASCII ones found are ok, unless and until a
- * digit from another set has already been encountered. (The other
- * digit ranges in Common are not similarly blessed) */
+ * because they are in the Common script. Hence any ASCII ones found
+ * are ok, unless and until a digit from another set has already been
+ * encountered. */
if (UNLIKELY(isDIGIT(*s))) {
if (UNLIKELY(script_of_run == SCX_Unknown)) {
retval = FALSE;
/* If is within the range [+0 .. +9] of the script's zero, it also is a
* digit in that script. We can skip the rest of this code for this
* character. */
- if (UNLIKELY( zero_of_run
- && cp >= zero_of_run
- && cp - zero_of_run <= 9))
- {
+ if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
continue;
}
/* If the run so far is Common, and the new character isn't, change the
* run's script to that of this character */
if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
-
- /* But Common contains several sets of digits. Only the '0' set
- * can be part of another script. */
- if (zero_of_run && zero_of_run != '0') {
- retval = FALSE;
- break;
- }
-
script_of_run = script_of_char;
}
- /* Now we can see if the script of the character is the same as that of
- * the run */
+ /* Now we can see if the script of the new character is the same as
+ * that of the run */
if (LIKELY(script_of_char == script_of_run)) {
/* By far the most common case */
goto scripts_match;
* several scripts, and the intersection is not empty. However, if the
* character is a decimal digit, it could still mean failure if it is
* from the wrong sequence of 10. So, we need to look at if it's a
- * digit. We've already handled the 10 decimal digits, and the next
+ * digit. We've already handled the 10 digits [0-9], and the next
* lowest one is this one: */
if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
continue; /* Not a digit; this character is part of the run */
if ( script_of_char >= 0
&& (zero_of_char = script_zeros[script_of_char]))
{
- if ( cp < zero_of_char
- || cp > zero_of_char + 9)
- {
+ if (! withinCOUNT(cp, zero_of_char, 9)) {
continue; /* Not a digit; this character is part of the run
*/
}
break;
}
}
- else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
-
- /* Here, the script run isn't Common, but the current digit is in
- * Common, and isn't '0'-'9' (those were handled earlier). Only
- * '0'-'9' are acceptable in non-Common scripts. */
- retval = FALSE;
- break;
- }
else { /* Otherwise we now have a zero for this run */
zero_of_run = zero_of_char;
}